From 2057e6d77a5ce49a18f314f5918e1648884cb688 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Mon, 3 Jul 2023 22:24:14 -0700 Subject: [PATCH 01/39] Add pre-commit --- .github/workflows/pre-commit.yaml | 41 +++++++++++++++++ .pre-commit-config.yaml | 74 +++++++++++++++++++++++++++++++ pyproject.toml | 49 ++++++++++++++++++++ 3 files changed, 164 insertions(+) create mode 100644 .github/workflows/pre-commit.yaml create mode 100644 .pre-commit-config.yaml create mode 100644 pyproject.toml diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml new file mode 100644 index 0000000000..190610a7aa --- /dev/null +++ b/.github/workflows/pre-commit.yaml @@ -0,0 +1,41 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + - uses: pre-commit/action@v3.0.0 + diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..6c03a4ad6c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,74 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +repos: +- repo: https://github.com/timothycrosley/isort + rev: 5.12.0 + hooks: + - id: isort + additional_dependencies: [toml] +- repo: https://github.com/psf/black + rev: 23.1.0 + hooks: + - id: black + types_or: [python, cython] +- repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 + hooks: + - id: flake8 + args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501] + types_or: [python, cython] +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v16.0.5 + hooks: + - id: clang-format + types_or: [c, c++, cuda, proto, textproto, java] + args: ["-fallback-style=none", "-style=file", "-i"] +- repo: https://github.com/codespell-project/codespell + rev: v2.2.4 + hooks: + - id: codespell + additional_dependencies: [tomli] + args: ["--toml", "pyproject.toml"] + exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$) +# More details about these pre-commit hooks here: +# https://pre-commit.com/hooks.html +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: check-case-conflict + - id: check-executables-have-shebangs + - id: check-merge-conflict + - id: check-json + - id: check-toml + - id: check-yaml + - id: check-shebang-scripts-are-executable + - id: end-of-file-fixer + types_or: [c, c++, cuda, proto, textproto, java, python] + - id: mixed-line-ending + - id: requirements-txt-fixer + - id: trailing-whitespace + diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..1a8da1f4d3 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,49 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +[tool.codespell] +# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - +# this is only to allow you to run codespell interactively +skip = "./.git,./.github" +# ignore short words, and typename parameters like OffsetT +ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" +# use the 'clear' dictionary for unambiguous spelling mistakes +builtin = "clear" +# disable warnings about binary files and wrong encoding +quiet-level = 3 + +[tool.isort] +profile = "black" +use_parentheses = true +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +ensure_newline_before_comments = true +line_length = 88 +balanced_wrapping = true +indent = " " +skip = ["build"] + From 36998f0d1113a3cd03d5a1dc2c0fd9dd2a354965 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Tue, 4 Jul 2023 12:03:03 -0700 Subject: [PATCH 02/39] Fix typos, exec/shebang, formatting --- .github/workflows/codeql.yml | 6 +- .pre-commit-config.yaml | 2 + CMakeLists.txt | 2 +- Dockerfile.QA | 2 +- Dockerfile.sdk | 6 +- Dockerfile.win10.min | 4 +- README.md | 92 +- build.py | 2 +- compose.py | 417 +-- deploy/alibaba-cloud/README.md | 8 +- deploy/aws/README.md | 6 +- deploy/aws/templates/deployment.yaml | 4 +- deploy/fleetcommand/README.md | 4 +- deploy/gcp/README.md | 2 +- deploy/gke-marketplace-app/README.md | 28 +- .../gke-marketplace-app/benchmark/README.md | 15 +- .../model-store/bert_base_tf_gpu/config.pbtxt | 2 +- .../bert_base_trt_gpu/config.pbtxt | 2 +- .../bert_distill_tf_cpu/config.pbtxt | 2 +- .../bert_distill_tf_gpu/config.pbtxt | 2 +- .../perf-analyzer-script/perf_query.sh | 0 .../client-sample/bert_request.json | 6 +- .../client-sample/locustfile_bert.py | 17 +- .../client-sample/perf_analyzer_grpc.sh | 0 .../server-deployer/build_and_push.sh | 3 +- .../chart/triton/templates/application.yaml | 8 +- .../chart/triton/templates/deployment.yaml | 2 +- .../chart/triton/templates/service.yaml | 4 +- .../server-deployer/chart/triton/values.yaml | 2 +- .../server-deployer/data-test/schema.yaml | 20 +- .../server-deployer/schema.yaml | 20 +- .../gke-marketplace-app/trt-engine/README.md | 12 +- .../onnx_float32_int32_int32/config.pbtxt | 0 .../mlflow_triton/__init__.py | 6 +- .../mlflow_triton/config.py | 55 +- .../mlflow_triton/deployments.py | 299 +- .../scripts/publish_model_to_mlflow.py | 22 +- .../scripts/triton_flavor.py | 16 +- deploy/mlflow-triton-plugin/setup.py | 6 +- docker/cpu_only/entrypoint.d/12-banner.sh | 0 .../entrypoint.d/50-gpu-driver-check2.sh | 0 docker/entrypoint.d/50-gpu-driver-check2.sh | 0 .../56-network-driver-version-check.sh | 2 +- docker/entrypoint.d/70-shm-check.sh | 2 +- docker/entrypoint.d/99-check-run-aip-mode.sh | 0 docker/sagemaker/serve | 8 +- docs/Makefile | 2 +- docs/README.md | 30 +- docs/_static/custom.css | 4 +- docs/conf.py | 66 +- docs/customization_guide/build.md | 12 +- .../inference_protocols.md | 26 +- docs/examples/README.md | 2 +- .../concurrency_and_dynamic_batching/Makefile | 4 +- .../README.md | 22 
+- .../tao/convert_peoplenet.sh | 0 .../simple_identity/config.pbtxt | 0 docs/getting_started/quickstart.md | 8 +- docs/index.md | 8 +- docs/protocol/extension_logging.md | 22 +- .../protocol/extension_model_configuration.md | 2 +- docs/protocol/extension_parameters.md | 4 +- docs/protocol/extension_schedule_policy.md | 2 +- docs/protocol/extension_sequence.md | 4 +- docs/protocol/extension_statistics.md | 10 +- docs/protocol/extension_trace.md | 2 +- docs/user_guide/architecture.md | 12 +- docs/user_guide/custom_operations.md | 4 +- docs/user_guide/decoupled_models.md | 8 +- docs/user_guide/faq.md | 4 +- docs/user_guide/jetson.md | 4 +- docs/user_guide/metrics.md | 30 +- docs/user_guide/model_analyzer.md | 2 +- docs/user_guide/model_configuration.md | 68 +- docs/user_guide/model_management.md | 10 +- docs/user_guide/model_repository.md | 18 +- docs/user_guide/optimization.md | 10 +- docs/user_guide/performance_tuning.md | 8 +- docs/user_guide/rate_limiter.md | 4 +- docs/user_guide/response_cache.md | 68 +- qa/L0_async_work_queue/test.sh | 0 qa/L0_backend_config/test.sh | 50 +- qa/L0_backend_fastertransformer/test.sh | 2 +- qa/L0_backend_identity/identity_test.py | 192 +- .../models/argument_validation/1/model.py | 74 +- .../argument_validation/test.sh | 1 + qa/L0_backend_python/bls/test.sh | 8 +- qa/L0_backend_python/common.sh | 3 +- qa/L0_backend_python/custom_metrics/test.sh | 2 +- .../decoupled/decoupled_test.py | 108 +- .../decoupled/models/decoupled_bls/1/model.py | 127 +- .../models/decoupled_bls_stream/1/model.py | 72 +- .../models/decoupled_execute_error/1/model.py | 54 +- .../1/model.py | 48 +- .../1/model.py | 47 +- qa/L0_backend_python/decoupled/test.sh | 1 + .../ensemble/ensemble_test.py | 46 +- qa/L0_backend_python/ensemble/test.sh | 0 qa/L0_backend_python/env/test.sh | 2 +- qa/L0_backend_python/examples/test.sh | 2 +- qa/L0_backend_python/io/io_test.py | 74 +- qa/L0_backend_python/io/test.sh | 0 .../lifecycle/lifecycle_test.py | 59 +- qa/L0_backend_python/lifecycle/test.sh | 2 +- qa/L0_backend_python/logging/logging_test.py | 20 +- qa/L0_backend_python/logging/test.sh | 4 +- .../model_control/model_control_test.py | 23 +- qa/L0_backend_python/model_control/test.sh | 0 qa/L0_backend_python/python_test.py | 297 +- qa/L0_backend_python/python_unittest.py | 26 +- .../restart/models/restart/1/model.py | 23 +- qa/L0_backend_python/restart/restart_test.py | 23 +- qa/L0_backend_python/restart/test.sh | 0 qa/L0_backend_python/variants/test.sh | 2 +- qa/L0_batch_custom/batch_custom_test.py | 200 +- qa/L0_batch_custom/test.sh | 4 +- qa/L0_batch_input/batch_input_test.py | 170 +- qa/L0_batch_input/test.sh | 0 qa/L0_batcher/batcher_test.py | 1348 +++++---- qa/L0_batcher/test.sh | 2 +- qa/L0_batcher/verify_timestamps.py | 45 +- .../buffer_attributes_test.py | 65 +- qa/L0_buffer_attributes/models/bls/1/model.py | 23 +- .../models/identity/1/model.py | 10 +- qa/L0_buffer_attributes/test.sh | 3 +- qa/L0_client_build_variants/test.sh | 2 +- qa/L0_client_java/test.sh | 0 .../client_memory_mail.py | 12 +- .../models/custom_identity_int32/config.pbtxt | 2 +- qa/L0_client_memory_growth/test.sh | 2 +- qa/L0_client_nobatch/client_test.py | 200 +- qa/L0_client_timeout/client_timeout_test.py | 157 +- .../models/custom_identity_int32/config.pbtxt | 2 +- qa/L0_client_timeout/test.sh | 0 .../models/custom_identity_int32/config.pbtxt | 2 +- qa/L0_cmdline_trace/test.sh | 2 +- qa/L0_cmdline_trace/trace_client.py | 37 +- qa/L0_cuda_graph/test.sh | 0 qa/L0_cuda_graph/trt_cuda_graph_test.py | 72 +- 
.../cuda_shared_memory_test.py | 137 +- qa/L0_cuda_shared_memory/test.sh | 0 qa/L0_custom_ops/cuda_op_test.py | 66 +- qa/L0_custom_ops/mod_op_test.py | 77 +- qa/L0_custom_ops/onnx_op_test.py | 74 +- qa/L0_custom_ops/vision_op_test.py | 74 +- qa/L0_custom_ops/zero_out_test.py | 64 +- qa/L0_data_compression/test.sh | 0 qa/L0_data_compression/validation.py | 12 +- qa/L0_decoupled/decoupled_test.py | 400 +-- qa/L0_decoupled/test.sh | 16 +- qa/L0_device_memory_tracker/test.py | 32 +- qa/L0_device_memory_tracker/test.sh | 0 qa/L0_dlpack_multi_gpu/test.sh | 2 +- qa/L0_doc_links/test.sh | 3 +- qa/L0_dyna_implicit_state/test.sh | 0 .../dyna_sequence_batcher_test.py | 1016 ++++--- qa/L0_dyna_sequence_batcher/test.sh | 2 +- .../client_plugin_test/1/model.py | 25 +- qa/L0_grpc/grpc_basic_auth_test.py | 19 +- qa/L0_grpc/grpc_client_plugin_test.py | 36 +- qa/L0_grpc/python_grpc_aio_test.py | 23 +- qa/L0_grpc/python_unit_test.py | 93 +- qa/L0_grpc/test.sh | 2 +- qa/L0_http/http_basic_auth_test.py | 19 +- qa/L0_http/http_client_plugin_test.py | 64 +- qa/L0_http/http_test.py | 124 +- qa/L0_http/python_http_aio_test.py | 14 +- qa/L0_http/test.sh | 4 +- qa/L0_http_fuzz/fuzztest.py | 55 +- qa/L0_http_fuzz/test.sh | 6 +- qa/L0_https/test.sh | 16 +- qa/L0_implicit_state/implicit_state.py | 2 +- qa/L0_implicit_state/test.sh | 0 qa/L0_infer/infer_test.py | 1184 ++++---- qa/L0_infer/install_and_test.sh | 2 +- qa/L0_infer_reshape/infer_reshape_test.py | 252 +- qa/L0_infer_variable/infer_variable_test.py | 452 +-- qa/L0_infer_zero/infer_zero_test.py | 332 ++- qa/L0_inferentia_perf_analyzer/test.sh | 34 +- qa/L0_io/test.sh | 2 +- .../MemoryGrowthTest.java | 1481 +++++---- qa/L0_java_memory_growth/test.sh | 2 +- qa/L0_java_resnet/ResnetTest.java | 986 +++--- qa/L0_java_sequence_batcher/SequenceTest.java | 1019 +++---- qa/L0_json/test.sh | 0 qa/L0_large_payload/large_payload_test.py | 102 +- qa/L0_large_payload/test.sh | 0 qa/L0_libtorch_inference_mode/test.sh | 0 .../client.py | 26 +- .../gen_models.py | 18 +- .../models/libtorch_multi_device/config.pbtxt | 0 .../test.sh | 8 +- qa/L0_libtorch_io_names/io_names_client.py | 46 +- qa/L0_libtorch_io_names/test.sh | 0 qa/L0_libtorch_nvfuser/test.sh | 0 qa/L0_libtorch_optimized_execution/test.sh | 0 .../libtorch_shared_weights_test.py | 21 +- qa/L0_libtorch_shared_weights/test.sh | 3 +- qa/L0_lifecycle/lifecycle_test.py | 2459 ++++++++------- qa/L0_lifecycle/test.sh | 8 +- qa/L0_logging/logging_endpoint_test.py | 330 +- qa/L0_logging/test.sh | 14 +- qa/L0_long_running_stress/crashing_client.py | 60 +- qa/L0_long_running_stress/scenarios.py | 653 ++-- qa/L0_long_running_stress/stress.py | 508 ++-- qa/L0_long_running_stress/stress_mail.py | 28 +- qa/L0_memory/test.sh | 0 qa/L0_memory_growth/busy_op_test.py | 84 +- qa/L0_memory_growth/server_memory_mail.py | 22 +- qa/L0_metrics/metrics_test.py | 34 +- qa/L0_metrics/test.sh | 2 +- qa/L0_mlflow/plugin_test.py | 53 +- qa/L0_mlflow/test.sh | 10 +- .../conflicting_max_batch_size/model.py | 13 +- .../conflicting_scheduler_sequence/model.py | 13 +- .../python/input_missing_datatype/model.py | 13 +- .../python/input_missing_dims/model.py | 13 +- .../python/input_missing_name/model.py | 13 +- .../python/input_wrong_property/model.py | 19 +- .../python/no_return/model.py | 13 +- .../python/output_missing_datatype/model.py | 13 +- .../python/output_missing_dims/model.py | 13 +- .../python/output_missing_name/model.py | 13 +- .../python/output_wrong_property/model.py | 19 +- .../onnx/cpu_instance/config.pbtxt | 0 
.../openvino/partial_config/config.pbtxt | 0 .../conflicting_scheduler_ensemble/model.py | 9 +- .../ensemble_first_step/model.py | 9 +- .../ensemble_second_step/model.py | 9 +- .../python/dynamic_batching/model.py | 13 +- .../python/dynamic_batching_no_op/model.py | 13 +- .../python/incomplete_input/model.py | 11 +- .../reshape_config_provided/config.pbtxt | 0 qa/L0_model_config/compare_status.py | 45 +- qa/L0_model_config/noautofill_test.py | 8 +- qa/L0_model_config/test.sh | 6 +- .../python_addsub/__init__.py | 109 +- .../python_subadd/__init__.py | 109 +- qa/L0_model_namespacing/test.py | 101 +- qa/L0_model_namespacing/test.sh | 0 .../addsub_repo/composing_model/1/model.py | 4 +- .../addsub_repo/simple_addsub/config.pbtxt | 12 +- .../subadd_repo/composing_model/1/model.py | 4 +- .../subadd_repo/simple_subadd/config.pbtxt | 12 +- .../addsub_repo/composing_model/1/model.py | 4 +- .../addsub_repo/simple_addsub/config.pbtxt | 12 +- .../subadd_repo/composing_model/1/model.py | 4 +- .../subadd_repo/simple_subadd/config.pbtxt | 12 +- .../addsub_repo/composing_addsub/1/model.py | 4 +- .../addsub_repo/simple_ensemble/config.pbtxt | 12 +- .../subadd_repo/composing_subadd/1/model.py | 4 +- .../subadd_repo/simple_ensemble/config.pbtxt | 12 +- .../addsub_repo/composing_addsub/1/model.py | 4 +- .../addsub_repo/simple_addsub/config.pbtxt | 12 +- .../subadd_repo/composing_subadd/1/model.py | 4 +- .../subadd_repo/simple_subadd/config.pbtxt | 12 +- qa/L0_model_queue/model_queue_test.py | 392 ++- qa/L0_model_update/instance_update_test.py | 166 +- qa/L0_multi_server/test.sh | 0 .../models/nan_inf_output/1/model.py | 14 +- qa/L0_nan_inf/nan_inf_test.py | 49 +- .../nullchar_string_client.py | 63 +- qa/L0_nullchar_string/test.sh | 0 .../ensemble_identity_2_float32/config.pbtxt | 0 .../models/identity_2_float32/config.pbtxt | 0 .../pipeline_identity_2_float32/config.pbtxt | 0 qa/L0_optional_input/optional_input_test.py | 227 +- qa/L0_output_name/output_name_test.py | 20 +- qa/L0_output_name/test.sh | 0 qa/L0_output_validation/lt_op_val_client.py | 17 +- qa/L0_output_validation/test.sh | 0 qa/L0_parallel_copy/parallel_copy_test.py | 80 +- .../model_repository/parameter/1/model.py | 43 +- qa/L0_parameters/parameters_test.py | 162 +- qa/L0_parameters/test.sh | 8 +- .../config.pbtxt | 0 .../passive_instance_test.py | 15 +- qa/L0_passive_instance/test.sh | 0 qa/L0_perf_analyzer/test.sh | 24 +- qa/L0_perf_analyzer_doc_links/test.sh | 9 +- qa/L0_perf_analyzer_ground_truth/test.sh | 4 +- qa/L0_perf_analyzer_report/test.sh | 2 +- qa/L0_perf_kaldi/create_data.sh | 2 +- qa/L0_perf_kaldi/test.sh | 0 qa/L0_perf_nomodel/run_test.sh | 2 +- qa/L0_perf_pyclients/simple_perf_client.py | 317 +- qa/L0_perf_resnet/run_test.sh | 2 +- qa/L0_query/query_e2e.py | 108 +- qa/L0_query/test.sh | 0 qa/L0_rate_limiter/rate_limiter_test.py | 143 +- qa/L0_rate_limiter/test.sh | 2 +- qa/L0_register/test.sh | 0 qa/L0_repoagent_checksum/identity_test.py | 68 +- qa/L0_response_cache/test.sh | 8 +- qa/L0_sagemaker/sagemaker_multi_model_test.py | 226 +- qa/L0_sagemaker/sagemaker_test.py | 329 +- .../saved_model_shape_test.py | 302 +- qa/L0_savedmodel_shape/test.sh | 0 qa/L0_secure_grpc/test.sh | 14 +- .../sequence_batcher_test.py | 2 +- qa/L0_sequence_batcher/test.sh | 6 +- .../sequence_corrid_batcher_test.py | 139 +- qa/L0_sequence_stress/sequence_stress.py | 428 +-- qa/L0_server_status/server_status_test.py | 534 ++-- qa/L0_shared_memory/shared_memory_test.py | 164 +- qa/L0_shared_memory/test.sh | 0 qa/L0_simple_ensemble/ensemble_test.py | 73 +- 
qa/L0_simple_nodejs_client/test.sh | 0 qa/L0_socket/test.sh | 2 +- qa/L0_storage_S3_local/mock_s3_service.py | 36 +- qa/L0_storage_azure/test.sh | 2 +- qa/L0_storage_swiftstack/infer_test.py | 270 +- qa/L0_string_io/string_client_test.py | 152 +- qa/L0_tf_gpu_io/tf_gpu_io_test.py | 64 +- qa/L0_tf_parameters/test.sh | 0 qa/L0_tf_parameters/tf_parameter_test.py | 44 +- qa/L0_tf_tag_sigdef/tf_tag_sigdef_test.py | 13 +- qa/L0_tf_unknown_rank/test.sh | 0 qa/L0_tf_unknown_rank/tf_unknown_rank_test.py | 27 +- .../tftrt_optimization_test.py | 36 +- qa/L0_trace/test.sh | 2 +- qa/L0_trace/trace_endpoint_test.py | 423 ++- qa/L0_triton_repo_agent/test.sh | 0 .../trt_data_dependent_shape_test.py | 25 +- qa/L0_trt_dla/dla_test.py | 23 +- qa/L0_trt_dla/test.sh | 0 qa/L0_trt_dynamic_shape/test.sh | 2 +- .../trt_dynamic_shape_test.py | 76 +- .../trt_error_propagation_test.py | 28 +- qa/L0_trt_plugin/test.sh | 0 qa/L0_trt_plugin/trt_plugin_test.py | 56 +- .../trt_reformat_free_test.py | 194 +- qa/L0_trt_shape_tensors/test.sh | 2 +- .../trt_shape_tensor_test.py | 674 +++-- qa/L0_vertex_ai/test.sh | 4 +- qa/L0_vertex_ai/vertex_ai_test.py | 241 +- qa/L0_warmup/decoupled/1/model.py | 9 +- qa/L0_warmup/failing_infer/1/model.py | 11 +- qa/L0_warmup/test.sh | 0 qa/common/check_copyright.py | 192 +- qa/common/check_massif_log.py | 45 +- qa/common/check_valgrind_log.py | 42 +- qa/common/cuda_op_kernel.cu.cc.patch | 8 +- qa/common/gen_ensemble_model_utils.py | 626 ++-- qa/common/gen_qa_custom_ops | 6 +- qa/common/gen_qa_custom_ops_models.py | 239 +- .../gen_qa_dyna_sequence_implicit_models.py | 470 +-- qa/common/gen_qa_dyna_sequence_models.py | 825 ++--- qa/common/gen_qa_identity_models.py | 853 +++--- qa/common/gen_qa_implicit_models.py | 4 +- qa/common/gen_qa_model_repository | 6 +- qa/common/gen_qa_models.py | 2646 +++++++++++------ qa/common/gen_qa_noshape_models.py | 438 +-- qa/common/gen_qa_ragged_models.py | 442 +-- qa/common/gen_qa_reshape_models.py | 1364 ++++++--- qa/common/gen_qa_sequence_models.py | 812 +++-- qa/common/gen_qa_tf_parameters.py | 47 +- qa/common/gen_qa_torchtrt_models.py | 32 +- qa/common/gen_qa_trt_data_dependent_shape.py | 65 +- qa/common/gen_qa_trt_format_models.py | 351 ++- qa/common/gen_qa_trt_plugin_models.py | 312 +- qa/common/gen_tag_sigdef.py | 233 +- qa/common/infer_test.py | 263 +- qa/common/infer_util.py | 860 +++--- .../non_aligned_validation_batched.json | 56 +- .../non_aligned_validation_no_batch.json | 56 +- .../simple_model.py | 101 +- .../validation_batched.json | 64 +- .../validation_no_batch.json | 64 +- .../wrong_validation_batched.json | 64 +- .../wrong_validation_no_batch.json | 64 +- qa/common/libtorch_infer_client.py | 40 +- qa/common/nightly_email_helper.py | 41 +- .../int_data.json | 4 +- .../int_data_diff_shape.json | 4 +- .../perf_analyzer_input_data_json/output.json | 2 +- .../string_data_with_shape.json | 8 +- .../wrong_output.json | 2 +- .../wrong_output_2.json | 2 +- qa/common/reporter.py | 120 +- qa/common/sequence_util.py | 824 ++--- qa/common/shm_util.py | 314 +- qa/common/test_util.py | 173 +- qa/common/trace_summary.py | 345 ++- qa/common/util.sh | 3 +- .../custom_zero_1_float32/config.pbtxt | 0 qa/python_models/add_sub/model.py | 52 +- qa/python_models/add_sub_gpu/config.pbtxt | 8 +- qa/python_models/auto_complete/model.py | 60 +- qa/python_models/auto_complete_error/model.py | 13 +- qa/python_models/bls/model.py | 364 +-- qa/python_models/bls_async/model.py | 104 +- qa/python_models/bls_finalize_error/model.py | 14 +- 
qa/python_models/bls_init_error/model.py | 14 +- qa/python_models/bls_memory/model.py | 52 +- qa/python_models/bls_memory_async/model.py | 32 +- .../bls_model_loading/config.pbtxt | 4 +- qa/python_models/bls_model_loading/model.py | 37 +- qa/python_models/bls_onnx_warmup/config.pbtxt | 2 +- qa/python_models/bls_undefined/model.py | 5 +- .../cuda_memory_consumer/1/model.py | 20 +- qa/python_models/custom_metrics/config.pbtxt | 4 +- qa/python_models/custom_metrics/model.py | 88 +- qa/python_models/delayed_model/model.py | 8 +- qa/python_models/dlpack_add_sub/model.py | 103 +- qa/python_models/dlpack_empty_output/model.py | 10 +- qa/python_models/dlpack_identity/model.py | 10 +- qa/python_models/dlpack_io_identity/model.py | 55 +- .../dlpack_io_identity_decoupled/model.py | 44 +- qa/python_models/dlpack_square/model.py | 58 +- qa/python_models/dlpack_sub_add/model.py | 103 +- qa/python_models/dlpack_test/model.py | 175 +- qa/python_models/execute_error/model.py | 15 +- .../execute_return_error/model.py | 3 +- qa/python_models/fini_error/model.py | 5 +- qa/python_models/ground_truth/model.py | 10 +- qa/python_models/identity_fp32/model.py | 5 +- .../identity_fp32_logging/model.py | 5 +- .../identity_fp32_timeout/model.py | 6 +- qa/python_models/init_args/model.py | 2 +- qa/python_models/init_error/model.py | 7 +- qa/python_models/init_exit/model.py | 5 +- qa/python_models/model_env/model.py | 11 +- qa/python_models/model_init_del/model.py | 11 +- qa/python_models/model_init_del/util.py | 4 +- qa/python_models/multi_file/file1.py | 6 +- qa/python_models/multi_file/file2.py | 6 +- qa/python_models/multi_file/model.py | 13 +- qa/python_models/non_contiguous/model.py | 13 +- qa/python_models/optional/model.py | 16 +- qa/python_models/python_version/model.py | 31 +- qa/python_models/pytorch_fp32_fp32/model.py | 8 +- .../response_sender_error/model.py | 38 +- qa/python_models/sequence_int32/config.pbtxt | 6 +- qa/python_models/sequence_int32/model.py | 63 +- qa/python_models/string/model.py | 8 +- qa/python_models/string_fixed/model.py | 16 +- qa/python_models/string_identity/model.py | 16 +- qa/python_models/sub_add/model.py | 56 +- .../torchvision/resnet50/config.pbtxt | 0 .../torchvision/resnet50/model.py | 24 +- qa/python_models/variable_gpu_output/model.py | 17 +- qa/python_models/wrong_model/model.py | 5 +- src/CMakeLists.txt | 8 +- src/command_line_parser.cc | 6 +- src/common.h | 23 +- src/data_compressor.h | 8 +- src/grpc/grpc_server.cc | 6 +- src/grpc/stream_infer_handler.cc | 2 +- src/http_server.cc | 6 +- src/sagemaker_server.h | 2 +- src/shared_memory_manager.cc | 4 +- src/simple.cc | 2 +- .../src/distributed_addsub.cc | 2 +- .../relocation_repoagent/src/relocation.cc | 2 +- src/vertex_ai_server.h | 2 +- 456 files changed, 23165 insertions(+), 17677 deletions(-) mode change 100644 => 100755 compose.py mode change 100644 => 100755 deploy/gke-marketplace-app/benchmark/perf-analyzer-script/perf_query.sh mode change 100644 => 100755 deploy/gke-marketplace-app/client-sample/locustfile_bert.py mode change 100644 => 100755 deploy/gke-marketplace-app/client-sample/perf_analyzer_grpc.sh mode change 100644 => 100755 deploy/gke-marketplace-app/server-deployer/build_and_push.sh mode change 100755 => 100644 deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/config.pbtxt mode change 100644 => 100755 deploy/mlflow-triton-plugin/mlflow_triton/__init__.py mode change 100644 => 100755 deploy/mlflow-triton-plugin/mlflow_triton/config.py mode change 100644 => 100755 
deploy/mlflow-triton-plugin/mlflow_triton/deployments.py mode change 100644 => 100755 deploy/mlflow-triton-plugin/scripts/publish_model_to_mlflow.py mode change 100644 => 100755 deploy/mlflow-triton-plugin/scripts/triton_flavor.py mode change 100644 => 100755 deploy/mlflow-triton-plugin/setup.py mode change 100644 => 100755 docker/cpu_only/entrypoint.d/12-banner.sh mode change 100644 => 100755 docker/cpu_only/entrypoint.d/50-gpu-driver-check2.sh mode change 100644 => 100755 docker/entrypoint.d/50-gpu-driver-check2.sh mode change 100644 => 100755 docker/entrypoint.d/56-network-driver-version-check.sh mode change 100644 => 100755 docker/entrypoint.d/70-shm-check.sh mode change 100644 => 100755 docker/entrypoint.d/99-check-run-aip-mode.sh mode change 100644 => 100755 docs/conf.py mode change 100644 => 100755 docs/examples/jetson/concurrency_and_dynamic_batching/tao/convert_peoplenet.sh mode change 100755 => 100644 docs/examples/model_repository/simple_identity/config.pbtxt mode change 100644 => 100755 qa/L0_async_work_queue/test.sh mode change 100644 => 100755 qa/L0_backend_config/test.sh mode change 100644 => 100755 qa/L0_backend_fastertransformer/test.sh mode change 100644 => 100755 qa/L0_backend_identity/identity_test.py mode change 100644 => 100755 qa/L0_backend_python/argument_validation/models/argument_validation/1/model.py mode change 100644 => 100755 qa/L0_backend_python/argument_validation/test.sh mode change 100644 => 100755 qa/L0_backend_python/bls/test.sh mode change 100644 => 100755 qa/L0_backend_python/common.sh mode change 100644 => 100755 qa/L0_backend_python/custom_metrics/test.sh mode change 100644 => 100755 qa/L0_backend_python/decoupled/decoupled_test.py mode change 100644 => 100755 qa/L0_backend_python/decoupled/models/decoupled_bls/1/model.py mode change 100644 => 100755 qa/L0_backend_python/decoupled/models/decoupled_bls_stream/1/model.py mode change 100644 => 100755 qa/L0_backend_python/decoupled/models/decoupled_execute_error/1/model.py mode change 100644 => 100755 qa/L0_backend_python/decoupled/models/decoupled_return_response_error/1/model.py mode change 100644 => 100755 qa/L0_backend_python/decoupled/models/decoupled_send_after_close_error/1/model.py mode change 100644 => 100755 qa/L0_backend_python/decoupled/test.sh mode change 100644 => 100755 qa/L0_backend_python/ensemble/ensemble_test.py mode change 100644 => 100755 qa/L0_backend_python/ensemble/test.sh mode change 100644 => 100755 qa/L0_backend_python/env/test.sh mode change 100644 => 100755 qa/L0_backend_python/examples/test.sh mode change 100644 => 100755 qa/L0_backend_python/io/io_test.py mode change 100644 => 100755 qa/L0_backend_python/io/test.sh mode change 100644 => 100755 qa/L0_backend_python/lifecycle/lifecycle_test.py mode change 100644 => 100755 qa/L0_backend_python/lifecycle/test.sh mode change 100644 => 100755 qa/L0_backend_python/logging/logging_test.py mode change 100644 => 100755 qa/L0_backend_python/model_control/model_control_test.py mode change 100644 => 100755 qa/L0_backend_python/model_control/test.sh mode change 100644 => 100755 qa/L0_backend_python/python_test.py mode change 100644 => 100755 qa/L0_backend_python/python_unittest.py mode change 100644 => 100755 qa/L0_backend_python/restart/models/restart/1/model.py mode change 100644 => 100755 qa/L0_backend_python/restart/restart_test.py mode change 100644 => 100755 qa/L0_backend_python/restart/test.sh mode change 100644 => 100755 qa/L0_backend_python/variants/test.sh mode change 100644 => 100755 qa/L0_batch_custom/batch_custom_test.py 
mode change 100644 => 100755 qa/L0_batch_input/batch_input_test.py mode change 100644 => 100755 qa/L0_batch_input/test.sh mode change 100644 => 100755 qa/L0_batcher/batcher_test.py mode change 100644 => 100755 qa/L0_batcher/test.sh mode change 100644 => 100755 qa/L0_batcher/verify_timestamps.py mode change 100644 => 100755 qa/L0_buffer_attributes/buffer_attributes_test.py mode change 100644 => 100755 qa/L0_buffer_attributes/models/bls/1/model.py mode change 100644 => 100755 qa/L0_buffer_attributes/models/identity/1/model.py mode change 100644 => 100755 qa/L0_buffer_attributes/test.sh mode change 100644 => 100755 qa/L0_client_java/test.sh mode change 100644 => 100755 qa/L0_client_memory_growth/client_memory_mail.py mode change 100644 => 100755 qa/L0_client_nobatch/client_test.py mode change 100644 => 100755 qa/L0_client_timeout/client_timeout_test.py mode change 100644 => 100755 qa/L0_client_timeout/test.sh mode change 100644 => 100755 qa/L0_cmdline_trace/trace_client.py mode change 100644 => 100755 qa/L0_cuda_graph/test.sh mode change 100644 => 100755 qa/L0_cuda_graph/trt_cuda_graph_test.py mode change 100644 => 100755 qa/L0_cuda_shared_memory/cuda_shared_memory_test.py mode change 100644 => 100755 qa/L0_cuda_shared_memory/test.sh mode change 100644 => 100755 qa/L0_custom_ops/cuda_op_test.py mode change 100644 => 100755 qa/L0_custom_ops/mod_op_test.py mode change 100644 => 100755 qa/L0_custom_ops/onnx_op_test.py mode change 100644 => 100755 qa/L0_custom_ops/vision_op_test.py mode change 100644 => 100755 qa/L0_custom_ops/zero_out_test.py mode change 100644 => 100755 qa/L0_data_compression/test.sh mode change 100644 => 100755 qa/L0_data_compression/validation.py mode change 100644 => 100755 qa/L0_decoupled/decoupled_test.py mode change 100644 => 100755 qa/L0_decoupled/test.sh mode change 100644 => 100755 qa/L0_device_memory_tracker/test.py mode change 100644 => 100755 qa/L0_device_memory_tracker/test.sh mode change 100644 => 100755 qa/L0_dlpack_multi_gpu/test.sh mode change 100644 => 100755 qa/L0_doc_links/test.sh mode change 100644 => 100755 qa/L0_dyna_implicit_state/test.sh mode change 100644 => 100755 qa/L0_dyna_sequence_batcher/dyna_sequence_batcher_test.py mode change 100644 => 100755 qa/L0_grpc/client_plugin_models/client_plugin_test/1/model.py mode change 100644 => 100755 qa/L0_grpc/grpc_basic_auth_test.py mode change 100644 => 100755 qa/L0_grpc/grpc_client_plugin_test.py mode change 100644 => 100755 qa/L0_grpc/python_grpc_aio_test.py mode change 100644 => 100755 qa/L0_grpc/python_unit_test.py mode change 100644 => 100755 qa/L0_grpc/test.sh mode change 100644 => 100755 qa/L0_http/http_basic_auth_test.py mode change 100644 => 100755 qa/L0_http/http_client_plugin_test.py mode change 100644 => 100755 qa/L0_http/http_test.py mode change 100644 => 100755 qa/L0_http/python_http_aio_test.py mode change 100644 => 100755 qa/L0_http/test.sh mode change 100644 => 100755 qa/L0_http_fuzz/fuzztest.py mode change 100644 => 100755 qa/L0_http_fuzz/test.sh mode change 100644 => 100755 qa/L0_https/test.sh mode change 100644 => 100755 qa/L0_implicit_state/implicit_state.py mode change 100644 => 100755 qa/L0_implicit_state/test.sh mode change 100644 => 100755 qa/L0_infer/infer_test.py mode change 100644 => 100755 qa/L0_infer_reshape/infer_reshape_test.py mode change 100644 => 100755 qa/L0_infer_variable/infer_variable_test.py mode change 100644 => 100755 qa/L0_infer_zero/infer_zero_test.py mode change 100644 => 100755 qa/L0_inferentia_perf_analyzer/test.sh mode change 100644 => 100755 qa/L0_json/test.sh 
mode change 100644 => 100755 qa/L0_large_payload/large_payload_test.py mode change 100644 => 100755 qa/L0_large_payload/test.sh mode change 100644 => 100755 qa/L0_libtorch_inference_mode/test.sh mode change 100644 => 100755 qa/L0_libtorch_instance_group_kind_model/client.py mode change 100755 => 100644 qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_device/config.pbtxt mode change 100644 => 100755 qa/L0_libtorch_io_names/io_names_client.py mode change 100644 => 100755 qa/L0_libtorch_io_names/test.sh mode change 100644 => 100755 qa/L0_libtorch_nvfuser/test.sh mode change 100644 => 100755 qa/L0_libtorch_optimized_execution/test.sh mode change 100644 => 100755 qa/L0_libtorch_shared_weights/libtorch_shared_weights_test.py mode change 100644 => 100755 qa/L0_libtorch_shared_weights/test.sh mode change 100644 => 100755 qa/L0_lifecycle/lifecycle_test.py mode change 100644 => 100755 qa/L0_logging/logging_endpoint_test.py mode change 100644 => 100755 qa/L0_long_running_stress/crashing_client.py mode change 100644 => 100755 qa/L0_long_running_stress/scenarios.py mode change 100644 => 100755 qa/L0_long_running_stress/stress.py mode change 100644 => 100755 qa/L0_long_running_stress/stress_mail.py mode change 100644 => 100755 qa/L0_memory/test.sh mode change 100644 => 100755 qa/L0_memory_growth/busy_op_test.py mode change 100644 => 100755 qa/L0_memory_growth/server_memory_mail.py mode change 100644 => 100755 qa/L0_mlflow/plugin_test.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/input_missing_dims/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/input_missing_name/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/input_wrong_property/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/no_return/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/output_missing_dims/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/output_missing_name/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/output_wrong_property/model.py mode change 100755 => 100644 qa/L0_model_config/autofill_noplatform_success/onnx/cpu_instance/config.pbtxt mode change 100755 => 100644 qa/L0_model_config/autofill_noplatform_success/openvino/partial_config/config.pbtxt mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_first_step/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_second_step/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/model.py mode change 100644 => 100755 
qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/model.py mode change 100755 => 100644 qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/config.pbtxt mode change 100644 => 100755 qa/L0_model_config/compare_status.py mode change 100644 => 100755 qa/L0_model_config/noautofill_test.py mode change 100644 => 100755 qa/L0_model_namespacing/python_addsub/__init__.py mode change 100644 => 100755 qa/L0_model_namespacing/python_subadd/__init__.py mode change 100644 => 100755 qa/L0_model_namespacing/test.py mode change 100644 => 100755 qa/L0_model_namespacing/test.sh mode change 100644 => 100755 qa/L0_model_namespacing/test_duplication/addsub_repo/composing_model/1/model.py mode change 100755 => 100644 qa/L0_model_namespacing/test_duplication/addsub_repo/simple_addsub/config.pbtxt mode change 100644 => 100755 qa/L0_model_namespacing/test_duplication/subadd_repo/composing_model/1/model.py mode change 100755 => 100644 qa/L0_model_namespacing/test_duplication/subadd_repo/simple_subadd/config.pbtxt mode change 100644 => 100755 qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/composing_model/1/model.py mode change 100755 => 100644 qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/simple_addsub/config.pbtxt mode change 100644 => 100755 qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/composing_model/1/model.py mode change 100755 => 100644 qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/simple_subadd/config.pbtxt mode change 100644 => 100755 qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/composing_addsub/1/model.py mode change 100755 => 100644 qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/simple_ensemble/config.pbtxt mode change 100644 => 100755 qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/composing_subadd/1/model.py mode change 100755 => 100644 qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/simple_ensemble/config.pbtxt mode change 100644 => 100755 qa/L0_model_namespacing/test_no_duplication/addsub_repo/composing_addsub/1/model.py mode change 100755 => 100644 qa/L0_model_namespacing/test_no_duplication/addsub_repo/simple_addsub/config.pbtxt mode change 100644 => 100755 qa/L0_model_namespacing/test_no_duplication/subadd_repo/composing_subadd/1/model.py mode change 100755 => 100644 qa/L0_model_namespacing/test_no_duplication/subadd_repo/simple_subadd/config.pbtxt mode change 100644 => 100755 qa/L0_model_queue/model_queue_test.py mode change 100644 => 100755 qa/L0_model_update/instance_update_test.py mode change 100644 => 100755 qa/L0_multi_server/test.sh mode change 100644 => 100755 qa/L0_nan_inf/models/nan_inf_output/1/model.py mode change 100644 => 100755 qa/L0_nan_inf/nan_inf_test.py mode change 100644 => 100755 qa/L0_nullchar_string/nullchar_string_client.py mode change 100644 => 100755 qa/L0_nullchar_string/test.sh mode change 100755 => 100644 qa/L0_optional_input/models/ensemble_identity_2_float32/config.pbtxt mode change 100755 => 100644 qa/L0_optional_input/models/identity_2_float32/config.pbtxt mode change 100755 => 100644 qa/L0_optional_input/models/pipeline_identity_2_float32/config.pbtxt mode change 100644 => 100755 qa/L0_optional_input/optional_input_test.py mode change 100644 => 100755 qa/L0_output_name/output_name_test.py mode change 100644 => 100755 qa/L0_output_name/test.sh mode change 100644 => 100755 qa/L0_output_validation/lt_op_val_client.py mode change 100644 => 100755 qa/L0_output_validation/test.sh mode change 100644 
=> 100755 qa/L0_parallel_copy/parallel_copy_test.py mode change 100644 => 100755 qa/L0_parameters/model_repository/parameter/1/model.py mode change 100644 => 100755 qa/L0_parameters/parameters_test.py mode change 100644 => 100755 qa/L0_parameters/test.sh mode change 100755 => 100644 qa/L0_passive_instance/models/distributed_int32_int32_int32/config.pbtxt mode change 100644 => 100755 qa/L0_passive_instance/passive_instance_test.py mode change 100644 => 100755 qa/L0_passive_instance/test.sh mode change 100644 => 100755 qa/L0_perf_analyzer_doc_links/test.sh mode change 100644 => 100755 qa/L0_perf_kaldi/create_data.sh mode change 100644 => 100755 qa/L0_perf_kaldi/test.sh mode change 100644 => 100755 qa/L0_perf_pyclients/simple_perf_client.py mode change 100644 => 100755 qa/L0_query/query_e2e.py mode change 100644 => 100755 qa/L0_query/test.sh mode change 100644 => 100755 qa/L0_rate_limiter/rate_limiter_test.py mode change 100644 => 100755 qa/L0_rate_limiter/test.sh mode change 100644 => 100755 qa/L0_register/test.sh mode change 100644 => 100755 qa/L0_repoagent_checksum/identity_test.py mode change 100644 => 100755 qa/L0_sagemaker/sagemaker_multi_model_test.py mode change 100644 => 100755 qa/L0_sagemaker/sagemaker_test.py mode change 100644 => 100755 qa/L0_savedmodel_shape/saved_model_shape_test.py mode change 100644 => 100755 qa/L0_savedmodel_shape/test.sh mode change 100644 => 100755 qa/L0_secure_grpc/test.sh mode change 100644 => 100755 qa/L0_sequence_batcher/sequence_batcher_test.py mode change 100644 => 100755 qa/L0_sequence_corrid_batcher/sequence_corrid_batcher_test.py mode change 100644 => 100755 qa/L0_sequence_stress/sequence_stress.py mode change 100644 => 100755 qa/L0_server_status/server_status_test.py mode change 100644 => 100755 qa/L0_shared_memory/shared_memory_test.py mode change 100644 => 100755 qa/L0_shared_memory/test.sh mode change 100644 => 100755 qa/L0_simple_ensemble/ensemble_test.py mode change 100644 => 100755 qa/L0_simple_nodejs_client/test.sh mode change 100644 => 100755 qa/L0_socket/test.sh mode change 100644 => 100755 qa/L0_storage_S3_local/mock_s3_service.py mode change 100644 => 100755 qa/L0_storage_swiftstack/infer_test.py mode change 100644 => 100755 qa/L0_string_io/string_client_test.py mode change 100644 => 100755 qa/L0_tf_gpu_io/tf_gpu_io_test.py mode change 100644 => 100755 qa/L0_tf_parameters/test.sh mode change 100644 => 100755 qa/L0_tf_parameters/tf_parameter_test.py mode change 100644 => 100755 qa/L0_tf_tag_sigdef/tf_tag_sigdef_test.py mode change 100644 => 100755 qa/L0_tf_unknown_rank/test.sh mode change 100644 => 100755 qa/L0_tf_unknown_rank/tf_unknown_rank_test.py mode change 100644 => 100755 qa/L0_tftrt_optimization/tftrt_optimization_test.py mode change 100644 => 100755 qa/L0_trace/trace_endpoint_test.py mode change 100644 => 100755 qa/L0_triton_repo_agent/test.sh mode change 100644 => 100755 qa/L0_trt_data_dependent_shape/trt_data_dependent_shape_test.py mode change 100644 => 100755 qa/L0_trt_dla/dla_test.py mode change 100644 => 100755 qa/L0_trt_dla/test.sh mode change 100644 => 100755 qa/L0_trt_dynamic_shape/trt_dynamic_shape_test.py mode change 100644 => 100755 qa/L0_trt_error_propagation/trt_error_propagation_test.py mode change 100644 => 100755 qa/L0_trt_plugin/test.sh mode change 100644 => 100755 qa/L0_trt_plugin/trt_plugin_test.py mode change 100644 => 100755 qa/L0_trt_reformat_free/trt_reformat_free_test.py mode change 100644 => 100755 qa/L0_trt_shape_tensors/test.sh mode change 100644 => 100755 
qa/L0_trt_shape_tensors/trt_shape_tensor_test.py mode change 100644 => 100755 qa/L0_vertex_ai/test.sh mode change 100644 => 100755 qa/L0_vertex_ai/vertex_ai_test.py mode change 100644 => 100755 qa/L0_warmup/decoupled/1/model.py mode change 100644 => 100755 qa/L0_warmup/failing_infer/1/model.py mode change 100644 => 100755 qa/L0_warmup/test.sh mode change 100644 => 100755 qa/common/gen_ensemble_model_utils.py mode change 100644 => 100755 qa/common/gen_qa_custom_ops_models.py mode change 100644 => 100755 qa/common/gen_qa_dyna_sequence_implicit_models.py mode change 100644 => 100755 qa/common/gen_qa_dyna_sequence_models.py mode change 100644 => 100755 qa/common/gen_qa_identity_models.py mode change 100644 => 100755 qa/common/gen_qa_implicit_models.py mode change 100644 => 100755 qa/common/gen_qa_models.py mode change 100644 => 100755 qa/common/gen_qa_noshape_models.py mode change 100644 => 100755 qa/common/gen_qa_ragged_models.py mode change 100644 => 100755 qa/common/gen_qa_reshape_models.py mode change 100644 => 100755 qa/common/gen_qa_sequence_models.py mode change 100644 => 100755 qa/common/gen_qa_tf_parameters.py mode change 100644 => 100755 qa/common/gen_qa_torchtrt_models.py mode change 100644 => 100755 qa/common/gen_qa_trt_data_dependent_shape.py mode change 100644 => 100755 qa/common/gen_qa_trt_format_models.py mode change 100644 => 100755 qa/common/gen_qa_trt_plugin_models.py mode change 100644 => 100755 qa/common/gen_tag_sigdef.py mode change 100644 => 100755 qa/common/infer_test.py mode change 100644 => 100755 qa/common/infer_util.py mode change 100644 => 100755 qa/common/inferentia_perf_analyzer_input_data_json/simple_model.py mode change 100644 => 100755 qa/common/libtorch_infer_client.py mode change 100644 => 100755 qa/common/nightly_email_helper.py mode change 100644 => 100755 qa/common/sequence_util.py mode change 100644 => 100755 qa/common/shm_util.py mode change 100644 => 100755 qa/common/test_util.py mode change 100755 => 100644 qa/custom_models/custom_zero_1_float32/config.pbtxt mode change 100644 => 100755 qa/python_models/add_sub/model.py mode change 100644 => 100755 qa/python_models/auto_complete/model.py mode change 100644 => 100755 qa/python_models/auto_complete_error/model.py mode change 100644 => 100755 qa/python_models/bls/model.py mode change 100644 => 100755 qa/python_models/bls_async/model.py mode change 100644 => 100755 qa/python_models/bls_finalize_error/model.py mode change 100644 => 100755 qa/python_models/bls_init_error/model.py mode change 100644 => 100755 qa/python_models/bls_memory/model.py mode change 100644 => 100755 qa/python_models/bls_memory_async/model.py mode change 100644 => 100755 qa/python_models/bls_model_loading/model.py mode change 100644 => 100755 qa/python_models/bls_onnx_warmup/config.pbtxt mode change 100644 => 100755 qa/python_models/bls_undefined/model.py mode change 100644 => 100755 qa/python_models/cuda_memory_consumer/1/model.py mode change 100644 => 100755 qa/python_models/custom_metrics/model.py mode change 100644 => 100755 qa/python_models/delayed_model/model.py mode change 100644 => 100755 qa/python_models/dlpack_add_sub/model.py mode change 100644 => 100755 qa/python_models/dlpack_empty_output/model.py mode change 100644 => 100755 qa/python_models/dlpack_identity/model.py mode change 100644 => 100755 qa/python_models/dlpack_io_identity/model.py mode change 100644 => 100755 qa/python_models/dlpack_io_identity_decoupled/model.py mode change 100644 => 100755 qa/python_models/dlpack_square/model.py mode change 100644 => 100755 
qa/python_models/dlpack_sub_add/model.py mode change 100644 => 100755 qa/python_models/dlpack_test/model.py mode change 100644 => 100755 qa/python_models/execute_error/model.py mode change 100644 => 100755 qa/python_models/execute_return_error/model.py mode change 100644 => 100755 qa/python_models/fini_error/model.py mode change 100644 => 100755 qa/python_models/ground_truth/model.py mode change 100644 => 100755 qa/python_models/identity_fp32/model.py mode change 100644 => 100755 qa/python_models/identity_fp32_logging/model.py mode change 100644 => 100755 qa/python_models/identity_fp32_timeout/model.py mode change 100644 => 100755 qa/python_models/init_args/model.py mode change 100644 => 100755 qa/python_models/init_error/model.py mode change 100644 => 100755 qa/python_models/init_exit/model.py mode change 100644 => 100755 qa/python_models/model_env/model.py mode change 100644 => 100755 qa/python_models/model_init_del/model.py mode change 100644 => 100755 qa/python_models/model_init_del/util.py mode change 100644 => 100755 qa/python_models/multi_file/file1.py mode change 100644 => 100755 qa/python_models/multi_file/file2.py mode change 100644 => 100755 qa/python_models/multi_file/model.py mode change 100644 => 100755 qa/python_models/non_contiguous/model.py mode change 100644 => 100755 qa/python_models/optional/model.py mode change 100644 => 100755 qa/python_models/python_version/model.py mode change 100644 => 100755 qa/python_models/pytorch_fp32_fp32/model.py mode change 100644 => 100755 qa/python_models/response_sender_error/model.py mode change 100644 => 100755 qa/python_models/sequence_int32/model.py mode change 100644 => 100755 qa/python_models/string/model.py mode change 100644 => 100755 qa/python_models/string_fixed/model.py mode change 100644 => 100755 qa/python_models/string_identity/model.py mode change 100644 => 100755 qa/python_models/sub_add/model.py mode change 100755 => 100644 qa/python_models/torchvision/resnet50/config.pbtxt mode change 100644 => 100755 qa/python_models/torchvision/resnet50/model.py mode change 100644 => 100755 qa/python_models/variable_gpu_output/model.py mode change 100644 => 100755 qa/python_models/wrong_model/model.py diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index a724718d46..4f3f98cc6f 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -63,12 +63,12 @@ jobs: # If you wish to specify custom queries, you can do so here or in a config file. # By default, queries listed here will override any specified in a config file. # Prefix the list here with "+" to use these queries and those in the config file. - + # Details on CodeQL's query packs refer to: # https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs queries: +security-and-quality - + # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild @@ -77,7 +77,7 @@ jobs: # Command-line programs to run using the OS shell. # See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun - # If the Autobuild fails above, remove it and uncomment the following three lines. + # If the Autobuild fails above, remove it and uncomment the following three lines. # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 
# - run: | diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6c03a4ad6c..1985278fd3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -64,7 +64,9 @@ repos: - id: check-merge-conflict - id: check-json - id: check-toml + # Do not check template yaml files in deploy directory - id: check-yaml + exclude: ^deploy(\/[^\/]+)*\/templates\/.*$ - id: check-shebang-scripts-are-executable - id: end-of-file-fixer types_or: [c, c++, cuda, proto, textproto, java, python] diff --git a/CMakeLists.txt b/CMakeLists.txt index a2031f1bdb..7ea6dbddf7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -130,7 +130,7 @@ if(EXISTS "/etc/os-release") set (LIB_DIR "lib64") endif() endif() - + set(TRITON_CORE_HEADERS_ONLY OFF) FetchContent_MakeAvailable(repo-third-party repo-core) diff --git a/Dockerfile.QA b/Dockerfile.QA index 0d3fb2a239..563194a7c8 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -70,7 +70,7 @@ RUN apt update && apt install -y gpg wget && \ echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | \ tee /etc/apt/sources.list.d/kitware.list >/dev/null && \ apt-get update && \ - apt-get install -y --no-install-recommends cmake cmake-data + apt-get install -y --no-install-recommends cmake cmake-data # Add inception_graphdef model to example repo WORKDIR /workspace/docs/examples/model_repository diff --git a/Dockerfile.sdk b/Dockerfile.sdk index cb64a5599a..5d7f409e8f 100644 --- a/Dockerfile.sdk +++ b/Dockerfile.sdk @@ -97,7 +97,7 @@ RUN apt update && apt install -y gpg wget && \ tee /etc/apt/sources.list.d/kitware.list >/dev/null && \ apt-get update && \ apt-get install -y --no-install-recommends cmake cmake-data && \ - cmake --version + cmake --version # Build expects "python" executable (not python3). RUN rm -f /usr/bin/python && \ @@ -197,8 +197,8 @@ RUN mkdir qa COPY qa/L0_sdk qa/L0_sdk COPY qa/L0_client_build_variants qa/L0_client_build_variants -# Create a directory for all the python client tests to enable unit testing -RUN mkdir -p qa/python_client_unit_tests/ +# Create a directory for all the python client tests to enable unit testing +RUN mkdir -p qa/python_client_unit_tests/ COPY --from=sdk_build /workspace/client/src/python/library/tests/* qa/python_client_unit_tests/ # Install an image needed by the quickstart and other documentation. diff --git a/Dockerfile.win10.min b/Dockerfile.win10.min index a0660c2f80..ee9393de80 100644 --- a/Dockerfile.win10.min +++ b/Dockerfile.win10.min @@ -130,7 +130,7 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%" -LABEL CUDA_VERSION="${CUDA_VERSION}" +LABEL CUDA_VERSION="${CUDA_VERSION}" # # Installing Tensorrt @@ -159,7 +159,7 @@ ARG CUDNN_SOURCE=${CUDNN_ZIP} ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP} -RUN unzip /tmp/%CUDNN_ZIP% +RUN unzip /tmp/%CUDNN_ZIP% RUN move cudnn-* cudnn RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\." RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\." diff --git a/README.md b/README.md index 2ea07a98a7..8d5f96c0a2 100644 --- a/README.md +++ b/README.md @@ -31,19 +31,19 @@ [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) **LATEST RELEASE: You are currently on the main branch which tracks -under-development progress towards the next release. The current release is +under-development progress towards the next release. 
The current release is version [2.35.0](https://github.com/triton-inference-server/server/tree/r23.06) -and corresponds to the 23.06 container release on +and corresponds to the 23.06 container release on [NVIDIA GPU Cloud (NGC)](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver).** ---- -Triton Inference Server is an open source inference serving software that -streamlines AI inferencing. Triton enables teams to deploy any AI model from -multiple deep learning and machine learning frameworks, including TensorRT, -TensorFlow, PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton -supports inference across cloud, data center,edge and embedded devices on NVIDIA -GPUs, x86 and ARM CPU, or AWS Inferentia. Triton delivers optimized performance -for many query types, including real time, batched, ensembles and audio/video +Triton Inference Server is an open source inference serving software that +streamlines AI inferencing. Triton enables teams to deploy any AI model from +multiple deep learning and machine learning frameworks, including TensorRT, +TensorFlow, PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton +supports inference across cloud, data center,edge and embedded devices on NVIDIA +GPUs, x86 and ARM CPU, or AWS Inferentia. Triton delivers optimized performance +for many query types, including real time, batched, ensembles and audio/video streaming. Major features include: @@ -55,7 +55,7 @@ Major features include: - [Concurrent model execution](docs/user_guide/architecture.md#concurrent-model-execution) - [Dynamic batching](docs/user_guide/model_configuration.md#dynamic-batcher) -- [Sequence batching](docs/user_guide/model_configuration.md#sequence-batcher) and +- [Sequence batching](docs/user_guide/model_configuration.md#sequence-batcher) and [implicit state management](docs/user_guide/architecture.md#implicit-state-management) for stateful models - Provides [Backend API](https://github.com/triton-inference-server/backend) that @@ -74,20 +74,20 @@ Major features include: - [Metrics](docs/user_guide/metrics.md) indicating GPU utilization, server throughput, server latency, and more -**New to Triton Inference Server?** Make use of +**New to Triton Inference Server?** Make use of [these tutorials](https://github.com/triton-inference-server/tutorials) -to begin your Triton journey! +to begin your Triton journey! -Join the [Triton and TensorRT community](https://www.nvidia.com/en-us/deep-learning-ai/triton-tensorrt-newsletter/) and -stay current on the latest product updates, bug fixes, content, best practices, -and more. Need enterprise support? NVIDIA global support is available for Triton -Inference Server with the -[NVIDIA AI Enterprise software suite](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/). +Join the [Triton and TensorRT community](https://www.nvidia.com/en-us/deep-learning-ai/triton-tensorrt-newsletter/) and +stay current on the latest product updates, bug fixes, content, best practices, +and more. Need enterprise support? NVIDIA global support is available for Triton +Inference Server with the +[NVIDIA AI Enterprise software suite](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/). 
## Serve a Model in 3 Easy Steps ```bash -# Step 1: Create the example model repository +# Step 1: Create the example model repository git clone -b r23.06 https://github.com/triton-inference-server/server.git cd server/docs/examples ./fetch_models.sh @@ -95,7 +95,7 @@ cd server/docs/examples # Step 2: Launch triton from the NGC Triton container docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:23.06-py3 tritonserver --model-repository=/models -# Step 3: Sending an Inference Request +# Step 3: Sending an Inference Request # In a separate console, launch the image_client example from the NGC Triton SDK container docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:23.06-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg @@ -115,13 +115,13 @@ Check out [NVIDIA LaunchPad](https://www.nvidia.com/en-us/data-center/products/a for free access to a set of hands-on labs with Triton Inference Server hosted on NVIDIA infrastructure. -Specific end-to-end examples for popular models, such as ResNet, BERT, and DLRM -are located in the +Specific end-to-end examples for popular models, such as ResNet, BERT, and DLRM +are located in the [NVIDIA Deep Learning Examples](https://github.com/NVIDIA/DeepLearningExamples) -page on GitHub. The -[NVIDIA Developer Zone](https://developer.nvidia.com/nvidia-triton-inference-server) +page on GitHub. The +[NVIDIA Developer Zone](https://developer.nvidia.com/nvidia-triton-inference-server) contains additional documentation, presentations, and examples. - + ## Documentation ### Build and Deploy @@ -134,7 +134,7 @@ images. - [Build a custom Triton Inference Server Docker container](docs/customization_guide/compose.md) - [Build Triton Inference Server from source](docs/customization_guide/build.md#building-on-unsupported-platforms) - [Build Triton Inference Server for Windows 10](docs/customization_guide/build.md#building-for-windows-10) -- Examples for deploying Triton Inference Server with Kubernetes and Helm on [GCP](deploy/gcp/README.md), +- Examples for deploying Triton Inference Server with Kubernetes and Helm on [GCP](deploy/gcp/README.md), [AWS](deploy/aws/README.md), and [NVIDIA FleetCommand](deploy/fleetcommand/README.md) ### Using Triton @@ -142,10 +142,10 @@ images. #### Preparing Models for Triton Inference Server The first step in using Triton to serve your models is to place one or -more models into a [model repository](docs/user_guide/model_repository.md). Depending on +more models into a [model repository](docs/user_guide/model_repository.md). Depending on the type of the model and on what Triton capabilities you want to enable for the model, you may need to create a [model -configuration](docs/user_guide/model_configuration.md) for the model. +configuration](docs/user_guide/model_configuration.md) for the model. - [Add custom operations to Triton if needed by your model](docs/user_guide/custom_operations.md) - Enable model pipelining with [Model Ensemble](docs/user_guide/architecture.md#ensemble-models) @@ -154,37 +154,37 @@ configuration](docs/user_guide/model_configuration.md) for the model. parameters and [model instances](docs/user_guide/model_configuration.md#instance-groups). 
- Use the [Model Analyzer tool](https://github.com/triton-inference-server/model_analyzer) to help optimize your model configuration with profiling -- Learn how to [explicitly manage what models are available by loading and +- Learn how to [explicitly manage what models are available by loading and unloading models](docs/user_guide/model_management.md) #### Configure and Use Triton Inference Server -- Read the [Quick Start Guide](docs/getting_started/quickstart.md) to run Triton Inference +- Read the [Quick Start Guide](docs/getting_started/quickstart.md) to run Triton Inference Server on both GPU and CPU -- Triton supports multiple execution engines, called - [backends](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton), including - [TensorRT](https://github.com/triton-inference-server/tensorrt_backend), - [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend), - [PyTorch](https://github.com/triton-inference-server/pytorch_backend), - [ONNX](https://github.com/triton-inference-server/onnxruntime_backend), - [OpenVINO](https://github.com/triton-inference-server/openvino_backend), +- Triton supports multiple execution engines, called + [backends](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton), including + [TensorRT](https://github.com/triton-inference-server/tensorrt_backend), + [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend), + [PyTorch](https://github.com/triton-inference-server/pytorch_backend), + [ONNX](https://github.com/triton-inference-server/onnxruntime_backend), + [OpenVINO](https://github.com/triton-inference-server/openvino_backend), [Python](https://github.com/triton-inference-server/python_backend), and more - Not all the above backends are supported on every platform supported by Triton. Look at the [Backend-Platform Support Matrix](https://github.com/triton-inference-server/backend/blob/main/docs/backend_platform_support_matrix.md) to learn which backends are supported on your target platform. -- Learn how to [optimize performance](docs/user_guide/optimization.md) using the +- Learn how to [optimize performance](docs/user_guide/optimization.md) using the [Performance Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md) and [Model Analyzer](https://github.com/triton-inference-server/model_analyzer) -- Learn how to [manage loading and unloading models](docs/user_guide/model_management.md) in +- Learn how to [manage loading and unloading models](docs/user_guide/model_management.md) in Triton - Send requests directly to Triton with the [HTTP/REST JSON-based or gRPC protocols](docs/customization_guide/inference_protocols.md#httprest-and-grpc-protocols) #### Client Support and Examples -A Triton *client* application sends inference and other requests to Triton. The +A Triton *client* application sends inference and other requests to Triton. The [Python and C++ client libraries](https://github.com/triton-inference-server/client) provide APIs to simplify this communication. @@ -194,25 +194,25 @@ provide APIs to simplify this communication. - Configure [HTTP](https://github.com/triton-inference-server/client#http-options) and [gRPC](https://github.com/triton-inference-server/client#grpc-options) client options -- Send input data (e.g. 
a jpeg image) directly to Triton in the [body of an HTTP +- Send input data (e.g. a jpeg image) directly to Triton in the [body of an HTTP request without any additional metadata](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_binary_data.md#raw-binary-request) ### Extend Triton -[Triton Inference Server's architecture](docs/user_guide/architecture.md) is specifically +[Triton Inference Server's architecture](docs/user_guide/architecture.md) is specifically designed for modularity and flexibility - [Customize Triton Inference Server container](docs/customization_guide/compose.md) for your use case - [Create custom backends](https://github.com/triton-inference-server/backend) in either [C/C++](https://github.com/triton-inference-server/backend/blob/main/README.md#triton-backend-api) or [Python](https://github.com/triton-inference-server/python_backend) -- Create [decouple backends and models](docs/user_guide/decoupled_models.md) that can send +- Create [decouple backends and models](docs/user_guide/decoupled_models.md) that can send multiple responses for a request or not send any responses for a request - Use a [Triton repository agent](docs/customization_guide/repository_agents.md) to add functionality - that operates when a model is loaded and unloaded, such as authentication, + that operates when a model is loaded and unloaded, such as authentication, decryption, or conversion - Deploy Triton on [Jetson and JetPack](docs/user_guide/jetson.md) -- [Use Triton on AWS +- [Use Triton on AWS Inferentia](https://github.com/triton-inference-server/python_backend/tree/main/inferentia) ### Additional Documentation @@ -227,7 +227,7 @@ Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html) ## Contributing Contributions to Triton Inference Server are more than welcome. To -contribute please review the [contribution +contribute please review the [contribution guidelines](CONTRIBUTING.md). If you have a backend, client, example or similar contribution that is not modifying the core of Triton, then you should file a PR in the [contrib @@ -235,7 +235,7 @@ repo](https://github.com/triton-inference-server/contrib). ## Reporting problems, asking questions -We appreciate any feedback, questions or bug reporting regarding this project. +We appreciate any feedback, questions or bug reporting regarding this project. When posting [issues in GitHub](https://github.com/triton-inference-server/server/issues), follow the process outlined in the [Stack Overflow document](https://stackoverflow.com/help/mcve). 
Ensure posted examples are: diff --git a/build.py b/build.py index d59bb56f9c..1339c5c6f9 100755 --- a/build.py +++ b/build.py @@ -2495,4 +2495,4 @@ def enable_all(): else: p = subprocess.Popen([f'./{script_name}'], cwd=FLAGS.build_dir) p.wait() - fail_if(p.returncode != 0, 'build failed') + fail_if(p.returncode != 0, 'build failed') \ No newline at end of file diff --git a/compose.py b/compose.py old mode 100644 new mode 100755 index 0a00883727..9f948c14fd --- a/compose.py +++ b/compose.py @@ -39,7 +39,7 @@ def log(msg, force=False): try: print(msg, file=sys.stderr) except Exception: - print('', file=sys.stderr) + print("", file=sys.stderr) def log_verbose(msg): @@ -48,7 +48,7 @@ def log_verbose(msg): def fail(msg): - print('error: {}'.format(msg), file=sys.stderr) + print("error: {}".format(msg), file=sys.stderr) sys.exit(1) @@ -58,8 +58,8 @@ def fail_if(p, msg): def start_dockerfile(ddir, images, argmap, dockerfile_name, backends): - # Set enviroment variables, set default user and install dependencies - df = ''' + # Set environment variables, set default user and install dependencies + df = """ # # Multistage build. # @@ -67,30 +67,38 @@ def start_dockerfile(ddir, images, argmap, dockerfile_name, backends): ARG TRITON_CONTAINER_VERSION={} FROM {} AS full -'''.format(argmap['TRITON_VERSION'], argmap['TRITON_CONTAINER_VERSION'], - images["full"]) +""".format( + argmap["TRITON_VERSION"], argmap["TRITON_CONTAINER_VERSION"], images["full"] + ) # PyTorch, TensorFlow 1 and TensorFlow 2 backends need extra CUDA and other # dependencies during runtime that are missing in the CPU-only base container. # These dependencies must be copied from the Triton Min image. - if not FLAGS.enable_gpu and (('pytorch' in backends) or - ('tensorflow1' in backends) or - ('tensorflow2' in backends)): - df += ''' + if not FLAGS.enable_gpu and ( + ("pytorch" in backends) + or ("tensorflow1" in backends) + or ("tensorflow2" in backends) + ): + df += """ FROM {} AS min_container -'''.format(images["gpu-min"]) +""".format( + images["gpu-min"] + ) - df += ''' + df += """ FROM {} -'''.format(images["min"]) +""".format( + images["min"] + ) import build - df += build.dockerfile_prepare_container_linux(argmap, backends, - FLAGS.enable_gpu, - platform.machine().lower()) + + df += build.dockerfile_prepare_container_linux( + argmap, backends, FLAGS.enable_gpu, platform.machine().lower() + ) # Copy over files - df += ''' + df += """ WORKDIR /opt/tritonserver COPY --chown=1000:1000 --from=full /opt/tritonserver/LICENSE . COPY --chown=1000:1000 --from=full /opt/tritonserver/TRITON_VERSION . 
@@ -98,7 +106,7 @@ def start_dockerfile(ddir, images, argmap, dockerfile_name, backends): COPY --chown=1000:1000 --from=full /opt/tritonserver/bin bin/ COPY --chown=1000:1000 --from=full /opt/tritonserver/lib lib/ COPY --chown=1000:1000 --from=full /opt/tritonserver/include include/ -''' +""" with open(os.path.join(ddir, dockerfile_name), "w") as dfile: dfile.write(df) @@ -106,13 +114,15 @@ def start_dockerfile(ddir, images, argmap, dockerfile_name, backends): def add_requested_backends(ddir, dockerfile_name, backends): df = "# Copying over backends \n" for backend in backends: - df += '''COPY --chown=1000:1000 --from=full /opt/tritonserver/backends/{} /opt/tritonserver/backends/{} -'''.format(backend, backend) + df += """COPY --chown=1000:1000 --from=full /opt/tritonserver/backends/{} /opt/tritonserver/backends/{} +""".format( + backend, backend + ) if len(backends) > 0: - df += ''' + df += """ # Top-level /opt/tritonserver/backends not copied so need to explicitly set permissions here RUN chown triton-server:triton-server /opt/tritonserver/backends -''' +""" with open(os.path.join(ddir, dockerfile_name), "a") as dfile: dfile.write(df) @@ -120,13 +130,15 @@ def add_requested_backends(ddir, dockerfile_name, backends): def add_requested_repoagents(ddir, dockerfile_name, repoagents): df = "# Copying over repoagents \n" for ra in repoagents: - df += '''COPY --chown=1000:1000 --from=full /opt/tritonserver/repoagents/{} /opt/tritonserver/repoagents/{} -'''.format(ra, ra) + df += """COPY --chown=1000:1000 --from=full /opt/tritonserver/repoagents/{} /opt/tritonserver/repoagents/{} +""".format( + ra, ra + ) if len(repoagents) > 0: - df += ''' + df += """ # Top-level /opt/tritonserver/repoagents not copied so need to explicitly set permissions here RUN chown triton-server:triton-server /opt/tritonserver/repoagents -''' +""" with open(os.path.join(ddir, dockerfile_name), "a") as dfile: dfile.write(df) @@ -134,13 +146,15 @@ def add_requested_repoagents(ddir, dockerfile_name, repoagents): def add_requested_caches(ddir, dockerfile_name, caches): df = "# Copying over caches \n" for cache in caches: - df += '''COPY --chown=1000:1000 --from=full /opt/tritonserver/caches/{} /opt/tritonserver/caches/{} -'''.format(cache, cache) + df += """COPY --chown=1000:1000 --from=full /opt/tritonserver/caches/{} /opt/tritonserver/caches/{} +""".format( + cache, cache + ) if len(caches) > 0: - df += ''' + df += """ # Top-level /opt/tritonserver/caches not copied so need to explicitly set permissions here RUN chown triton-server:triton-server /opt/tritonserver/caches -''' +""" with open(os.path.join(ddir, dockerfile_name), "a") as dfile: dfile.write(df) @@ -148,33 +162,44 @@ def add_requested_caches(ddir, dockerfile_name, caches): def end_dockerfile(ddir, dockerfile_name, argmap): # Install additional dependencies df = "" - if argmap['SAGEMAKER_ENDPOINT']: - df += ''' + if argmap["SAGEMAKER_ENDPOINT"]: + df += """ LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true COPY --chown=1000:1000 --from=full /usr/bin/serve /usr/bin/. 
-''' +""" with open(os.path.join(ddir, dockerfile_name), "a") as dfile: dfile.write(df) def build_docker_image(ddir, dockerfile_name, container_name): # Create container with docker build - p = subprocess.Popen(['docker', 'build', '-t', container_name, '-f', \ - os.path.join(ddir, dockerfile_name), '.']) + p = subprocess.Popen( + [ + "docker", + "build", + "-t", + container_name, + "-f", + os.path.join(ddir, dockerfile_name), + ".", + ] + ) p.wait() - fail_if(p.returncode != 0, 'docker build {} failed'.format(container_name)) + fail_if(p.returncode != 0, "docker build {} failed".format(container_name)) def get_container_version_if_not_specified(): if FLAGS.container_version is None: # Read from TRITON_VERSION file in server repo to determine version - with open('TRITON_VERSION', "r") as vfile: + with open("TRITON_VERSION", "r") as vfile: version = vfile.readline().strip() import build + _, FLAGS.container_version = build.container_versions( - version, None, FLAGS.container_version) - log('version {}'.format(version)) - log('using container version {}'.format(FLAGS.container_version)) + version, None, FLAGS.container_version + ) + log("version {}".format(version)) + log("using container version {}".format(FLAGS.container_version)) def create_argmap(images, skip_pull): @@ -183,210 +208,246 @@ def create_argmap(images, skip_pull): full_docker_image = images["full"] min_docker_image = images["min"] enable_gpu = FLAGS.enable_gpu - # Docker inspect enviroment variables - base_run_args = ['docker', 'inspect', '-f'] - import re # parse all PATH enviroment variables + # Docker inspect environment variables + base_run_args = ["docker", "inspect", "-f"] + import re # parse all PATH environment variables # first pull docker images if not skip_pull: log("pulling container:{}".format(full_docker_image)) - p = subprocess.run(['docker', 'pull', full_docker_image]) + p = subprocess.run(["docker", "pull", full_docker_image]) fail_if( - p.returncode != 0, 'docker pull container {} failed, {}'.format( - full_docker_image, p.stderr)) + p.returncode != 0, + "docker pull container {} failed, {}".format(full_docker_image, p.stderr), + ) if enable_gpu: if not skip_pull: - pm = subprocess.run(['docker', 'pull', min_docker_image]) + pm = subprocess.run(["docker", "pull", min_docker_image]) fail_if( pm.returncode != 0 and not skip_pull, - 'docker pull container {} failed, {}'.format( - min_docker_image, pm.stderr)) - pm_path = subprocess.run(base_run_args + [ - '{{range $index, $value := .Config.Env}}{{$value}} {{end}}', - min_docker_image - ], - capture_output=True, - text=True) + "docker pull container {} failed, {}".format( + min_docker_image, pm.stderr + ), + ) + pm_path = subprocess.run( + base_run_args + + [ + "{{range $index, $value := .Config.Env}}{{$value}} {{end}}", + min_docker_image, + ], + capture_output=True, + text=True, + ) fail_if( pm_path.returncode != 0, - 'docker inspect to find triton enviroment variables for min container failed, {}' - .format(pm_path.stderr)) + "docker inspect to find triton environment variables for min container failed, {}".format( + pm_path.stderr + ), + ) # min container needs to be GPU-support-enabled if the build is GPU build vars = pm_path.stdout e = re.search("CUDA_VERSION", vars) gpu_enabled = False if e is None else True fail_if( not gpu_enabled, - 'Composing container with gpu support enabled but min container provided does not have CUDA installed' + "Composing container with gpu support enabled but min container provided does not have CUDA installed", ) - # Check 
full container enviroment variables - p_path = subprocess.run(base_run_args + [ - '{{range $index, $value := .Config.Env}}{{$value}} {{end}}', - full_docker_image - ], - capture_output=True, - text=True) + # Check full container environment variables + p_path = subprocess.run( + base_run_args + + [ + "{{range $index, $value := .Config.Env}}{{$value}} {{end}}", + full_docker_image, + ], + capture_output=True, + text=True, + ) fail_if( p_path.returncode != 0, - 'docker inspect to find enviroment variables for full container failed, {}' - .format(p_path.stderr)) + "docker inspect to find environment variables for full container failed, {}".format( + p_path.stderr + ), + ) vars = p_path.stdout log_verbose("inspect args: {}".format(vars)) e0 = re.search("TRITON_SERVER_GPU_ENABLED=([\S]{1,}) ", vars) e1 = re.search("CUDA_VERSION", vars) gpu_enabled = False - if (e0 != None): + if e0 != None: gpu_enabled = e0.group(1) == "1" - elif (e1 != None): + elif e1 != None: gpu_enabled = True fail_if( gpu_enabled != enable_gpu, - 'Error: full container provided was build with ' - '\'TRITON_SERVER_GPU_ENABLED\' as {} and you are composing container' - 'with \'TRITON_SERVER_GPU_ENABLED\' as {}'.format( - gpu_enabled, enable_gpu)) + "Error: full container provided was build with " + "'TRITON_SERVER_GPU_ENABLED' as {} and you are composing container" + "with 'TRITON_SERVER_GPU_ENABLED' as {}".format(gpu_enabled, enable_gpu), + ) e = re.search("TRITON_SERVER_VERSION=([\S]{6,}) ", vars) version = "" if e is None else e.group(1) fail_if( len(version) == 0, - 'docker inspect to find triton server version failed, {}'.format( - p_path.stderr)) + "docker inspect to find triton server version failed, {}".format(p_path.stderr), + ) e = re.search("NVIDIA_TRITON_SERVER_VERSION=([\S]{5,}) ", vars) container_version = "" if e is None else e.group(1) fail_if( len(container_version) == 0, - 'docker inspect to find triton container version failed, {}'.format( - vars)) + "docker inspect to find triton container version failed, {}".format(vars), + ) dcgm_ver = re.search("DCGM_VERSION=([\S]{4,}) ", vars) dcgm_version = "" if dcgm_ver is None: dcgm_version = "2.2.3" - log("WARNING: DCGM version not found from image, installing the earlierst version {}" - .format(dcgm_version)) + log( + "WARNING: DCGM version not found from image, installing the earlierst version {}".format( + dcgm_version + ) + ) else: dcgm_version = dcgm_ver.group(1) fail_if( len(dcgm_version) == 0, - 'docker inspect to find DCGM version failed, {}'.format(vars)) + "docker inspect to find DCGM version failed, {}".format(vars), + ) p_sha = subprocess.run( - base_run_args + - ['{{ index .Config.Labels "com.nvidia.build.ref"}}', full_docker_image], + base_run_args + + ['{{ index .Config.Labels "com.nvidia.build.ref"}}', full_docker_image], capture_output=True, - text=True) + text=True, + ) fail_if( p_sha.returncode != 0, - 'docker inspect of upstream docker image build sha failed, {}'.format( - p_sha.stderr)) + "docker inspect of upstream docker image build sha failed, {}".format( + p_sha.stderr + ), + ) p_build = subprocess.run( - base_run_args + - ['{{ index .Config.Labels "com.nvidia.build.id"}}', full_docker_image], + base_run_args + + ['{{ index .Config.Labels "com.nvidia.build.id"}}', full_docker_image], capture_output=True, - text=True) + text=True, + ) fail_if( p_build.returncode != 0, - 'docker inspect of upstream docker image build sha failed, {}'.format( - p_build.stderr)) + "docker inspect of upstream docker image build sha failed, {}".format( + 
p_build.stderr + ), + ) p_find = subprocess.run( - ['docker', 'run', full_docker_image, 'bash', '-c', 'ls /usr/bin/'], + ["docker", "run", full_docker_image, "bash", "-c", "ls /usr/bin/"], capture_output=True, - text=True) + text=True, + ) f = re.search("serve", p_find.stdout) - fail_if(p_find.returncode != 0, - "Cannot search for 'serve' in /usr/bin, {}".format(p_find.stderr)) + fail_if( + p_find.returncode != 0, + "Cannot search for 'serve' in /usr/bin, {}".format(p_find.stderr), + ) argmap = { - 'NVIDIA_BUILD_REF': p_sha.stdout.rstrip(), - 'NVIDIA_BUILD_ID': p_build.stdout.rstrip(), - 'TRITON_VERSION': version, - 'TRITON_CONTAINER_VERSION': container_version, - 'DCGM_VERSION': dcgm_version, - 'SAGEMAKER_ENDPOINT': f is not None, + "NVIDIA_BUILD_REF": p_sha.stdout.rstrip(), + "NVIDIA_BUILD_ID": p_build.stdout.rstrip(), + "TRITON_VERSION": version, + "TRITON_CONTAINER_VERSION": container_version, + "DCGM_VERSION": dcgm_version, + "SAGEMAKER_ENDPOINT": f is not None, } return argmap -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() group_qv = parser.add_mutually_exclusive_group() - group_qv.add_argument('-q', - '--quiet', - action="store_true", - required=False, - help='Disable console output.') - group_qv.add_argument('-v', - '--verbose', - action="store_true", - required=False, - help='Enable verbose output.') + group_qv.add_argument( + "-q", + "--quiet", + action="store_true", + required=False, + help="Disable console output.", + ) + group_qv.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + help="Enable verbose output.", + ) parser.add_argument( - '--output-name', + "--output-name", type=str, required=False, - help='Name for the generated Docker image. Default is "tritonserver".') + help='Name for the generated Docker image. Default is "tritonserver".', + ) parser.add_argument( - '--work-dir', + "--work-dir", type=str, required=False, - help= - 'Generated dockerfiles are placed here. Default to current directory.') + help="Generated dockerfiles are placed here. Default to current directory.", + ) parser.add_argument( - '--container-version', + "--container-version", type=str, required=False, - help= - 'The version to use for the generated Docker image. If not specified ' - 'the container version will be chosen automatically based on the ' - 'repository branch.') + help="The version to use for the generated Docker image. If not specified " + "the container version will be chosen automatically based on the " + "repository branch.", + ) parser.add_argument( - '--image', - action='append', + "--image", + action="append", required=False, - help='Use specified Docker image to generate Docker image. Specified as ' + help="Use specified Docker image to generate Docker image. Specified as " ',. can be "min", "gpu-min" ' 'or "full". Both "min" and "full" need to be specified at the same time.' 'This will override "--container-version". 
"gpu-min" is needed for ' - 'CPU-only container to copy TensorFlow and PyTorch deps.') - parser.add_argument('--enable-gpu', - nargs='?', - type=lambda x: (str(x).lower() == 'true'), - const=True, - default=True, - required=False, - help=argparse.SUPPRESS) + "CPU-only container to copy TensorFlow and PyTorch deps.", + ) + parser.add_argument( + "--enable-gpu", + nargs="?", + type=lambda x: (str(x).lower() == "true"), + const=True, + default=True, + required=False, + help=argparse.SUPPRESS, + ) parser.add_argument( - '--backend', - action='append', + "--backend", + action="append", required=False, - help= - 'Include in the generated Docker image. The flag may be ' - 'specified multiple times.') + help="Include in the generated Docker image. The flag may be " + "specified multiple times.", + ) parser.add_argument( - '--repoagent', - action='append', + "--repoagent", + action="append", required=False, - help= - 'Include in the generated Docker image. The flag may ' - 'be specified multiple times.') + help="Include in the generated Docker image. The flag may " + "be specified multiple times.", + ) parser.add_argument( - '--cache', - action='append', + "--cache", + action="append", required=False, - help='Include in the generated Docker image. The flag may ' - 'be specified multiple times.') + help="Include in the generated Docker image. The flag may " + "be specified multiple times.", + ) parser.add_argument( - '--skip-pull', - action='store_true', + "--skip-pull", + action="store_true", required=False, - help='Do not pull the required docker images. The user is responsible ' - 'for pulling the upstream images needed to compose the image.') + help="Do not pull the required docker images. The user is responsible " + "for pulling the upstream images needed to compose the image.", + ) parser.add_argument( - '--dry-run', + "--dry-run", action="store_true", required=False, - help='Only creates Dockerfile.compose, does not build the Docker image.' 
+ help="Only creates Dockerfile.compose, does not build the Docker image.", ) FLAGS = parser.parse_args() @@ -396,7 +457,7 @@ def create_argmap(images, skip_pull): if FLAGS.output_name is None: FLAGS.output_name = "tritonserver" - dockerfile_name = 'Dockerfile.compose' + dockerfile_name = "Dockerfile.compose" if FLAGS.backend is None: FLAGS.backend = [] @@ -409,54 +470,56 @@ def create_argmap(images, skip_pull): images = {} if FLAGS.image: for img in FLAGS.image: - parts = img.split(',') + parts = img.split(",") fail_if( len(parts) != 2, - '--image must specific ,') + "--image must specific ,", + ) fail_if( - parts[0] not in ['min', 'full', 'gpu-min'], - 'unsupported image-name \'{}\' for --image'.format(parts[0])) + parts[0] not in ["min", "full", "gpu-min"], + "unsupported image-name '{}' for --image".format(parts[0]), + ) log('image "{}": "{}"'.format(parts[0], parts[1])) images[parts[0]] = parts[1] else: get_container_version_if_not_specified() if FLAGS.enable_gpu: images = { - "full": - "nvcr.io/nvidia/tritonserver:{}-py3".format( - FLAGS.container_version), - "min": - "nvcr.io/nvidia/tritonserver:{}-py3-min".format( - FLAGS.container_version) + "full": "nvcr.io/nvidia/tritonserver:{}-py3".format( + FLAGS.container_version + ), + "min": "nvcr.io/nvidia/tritonserver:{}-py3-min".format( + FLAGS.container_version + ), } else: images = { - "full": - "nvcr.io/nvidia/tritonserver:{}-cpu-only-py3".format( - FLAGS.container_version), - "min": - "ubuntu:22.04" + "full": "nvcr.io/nvidia/tritonserver:{}-cpu-only-py3".format( + FLAGS.container_version + ), + "min": "ubuntu:22.04", } - fail_if( - len(images) < 2, - "Need to specify both 'full' and 'min' images if at all") + fail_if(len(images) < 2, "Need to specify both 'full' and 'min' images if at all") # For CPU-only image we need to copy some cuda libraries and dependencies # since we are using PyTorch, TensorFlow 1, TensorFlow 2 containers that # are not CPU-only. - if (('pytorch' in FLAGS.backend) or ('tensorflow1' in FLAGS.backend) or - ('tensorflow2' in FLAGS.backend)) and ('gpu-min' not in images): + if ( + ("pytorch" in FLAGS.backend) + or ("tensorflow1" in FLAGS.backend) + or ("tensorflow2" in FLAGS.backend) + ) and ("gpu-min" not in images): images["gpu-min"] = "nvcr.io/nvidia/tritonserver:{}-py3-min".format( - FLAGS.container_version) + FLAGS.container_version + ) argmap = create_argmap(images, FLAGS.skip_pull) - start_dockerfile(FLAGS.work_dir, images, argmap, dockerfile_name, - FLAGS.backend) + start_dockerfile(FLAGS.work_dir, images, argmap, dockerfile_name, FLAGS.backend) add_requested_backends(FLAGS.work_dir, dockerfile_name, FLAGS.backend) add_requested_repoagents(FLAGS.work_dir, dockerfile_name, FLAGS.repoagent) add_requested_caches(FLAGS.work_dir, dockerfile_name, FLAGS.cache) end_dockerfile(FLAGS.work_dir, dockerfile_name, argmap) - if (not FLAGS.dry_run): + if not FLAGS.dry_run: build_docker_image(FLAGS.work_dir, dockerfile_name, FLAGS.output_name) diff --git a/deploy/alibaba-cloud/README.md b/deploy/alibaba-cloud/README.md index 1dea4ede11..0521eb704f 100644 --- a/deploy/alibaba-cloud/README.md +++ b/deploy/alibaba-cloud/README.md @@ -26,7 +26,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
--> -# Deploy Triton Inference Server on PAI-EAS +# Deploy Triton Inference Server on PAI-EAS * Table Of Contents - [Description](https://yuque.alibaba-inc.com/pai/blade/mtptqc#Description) - [Prerequisites](https://yuque.alibaba-inc.com/pai/blade/mtptqc#Prerequisites) @@ -57,11 +57,11 @@ Download the tensorflow inception model via [fetch_model.sh](https://github.com/ The following is the json we use when creating a Triton Server on EAS. ``` { - "name": "", + "name": "", "processor": "triton", "processor_params": [ - "--model-repository=oss://triton-model-repo/models", - "--allow-grpc=true", + "--model-repository=oss://triton-model-repo/models", + "--allow-grpc=true", "--allow-http=true" ], "metadata": { diff --git a/deploy/aws/README.md b/deploy/aws/README.md index 600f8c953f..cbde5610ce 100644 --- a/deploy/aws/README.md +++ b/deploy/aws/README.md @@ -39,10 +39,10 @@ This guide assumes you already have a functional Kubernetes cluster and helm installed (see below for instructions on installing helm). Note the following requirements: -* The helm chart deploys Prometheus and Grafana to collect and display Triton metrics. To use this helm chart you must install Prpmetheus and Grafana in your cluster as described below and your cluster must contain sufficient CPU resourses to support these services. +* The helm chart deploys Prometheus and Grafana to collect and display Triton metrics. To use this helm chart you must install Prpmetheus and Grafana in your cluster as described below and your cluster must contain sufficient CPU resources to support these services. * If you want Triton Server to use GPUs for inferencing, your cluster -must be configured to contain the desired number of GPU nodes (EC2 G4 instances recommended) +must be configured to contain the desired number of GPU nodes (EC2 G4 instances recommended) with support for the NVIDIA driver and CUDA version required by the version of the inference server you are using. @@ -67,7 +67,7 @@ please see the [official migration guide](https://helm.sh/docs/topics/v2_v3_migr > **NOTE**: Moving forward this chart will only be tested and maintained for Helm v3. -Below are example instructions for installing Helm v2. +Below are example instructions for installing Helm v2. ``` $ curl https://raw.githubusercontent.com/helm/helm/master/scripts/get | bash diff --git a/deploy/aws/templates/deployment.yaml b/deploy/aws/templates/deployment.yaml index 24f3f65380..48ef82160d 100644 --- a/deploy/aws/templates/deployment.yaml +++ b/deploy/aws/templates/deployment.yaml @@ -56,7 +56,7 @@ spec: limits: nvidia.com/gpu: {{ .Values.image.numGpus }} - args: ["tritonserver", "--model-store={{ .Values.image.modelRepositoryPath }}", + args: ["tritonserver", "--model-store={{ .Values.image.modelRepositoryPath }}", "--model-control-mode=poll", "--repository-poll-secs=5"] @@ -94,7 +94,7 @@ spec: httpGet: path: /v2/health/ready port: http - + securityContext: runAsUser: 1000 fsGroup: 1000 diff --git a/deploy/fleetcommand/README.md b/deploy/fleetcommand/README.md index 88a05af34b..996b7598cc 100644 --- a/deploy/fleetcommand/README.md +++ b/deploy/fleetcommand/README.md @@ -87,7 +87,7 @@ echo -n 'AWS_SESSION_TOKEN' | base64 Deploy the Triton Inference Server to your Location in Fleet Command by creating a Deployment. You can specify configuration parameters to override the default -[values.yaml](values.yaml) in the Application Configuration section. +[values.yaml](values.yaml) in the Application Configuration section. 
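For illustration only, the same kind of override can be expressed with Helm's `--set` syntax when installing a comparable Triton chart directly; in Fleet Command the equivalent keys are entered as YAML in the Application Configuration section instead. This is a sketch rather than the chart's documented interface: the key names below are borrowed from the Helm charts touched elsewhere in this patch, and the bucket path is a placeholder, so adjust both to match your deployment.

```bash
# Sketch: value overrides for a direct Helm install of a Triton chart.
# In Fleet Command, supply the same keys as YAML in Application Configuration.
helm install triton-inference-server . \
  --set image.modelRepositoryPath=s3://<your-bucket>/model_repository \
  --set serviceMonitor.enabled=true
```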
*Note:* You _must_ provide a `--model-repository` parameter with a path to your prepared model repository in your S3 bucket. Otherwise, the Triton will not @@ -114,7 +114,7 @@ for more info. If you have `prometheus-operator` deployed, you can enable the ServiceMonitor for the Triton Inference Server by setting `serviceMonitor.enabled: true` in Application Configuration. This will also deploy a Grafana dashboard for Triton -as a ConfigMap. +as a ConfigMap. Otherwise, metrics can be scraped by pointing an external Prometheus instance at the `metricsNodePort` in the values. diff --git a/deploy/gcp/README.md b/deploy/gcp/README.md index 0530df412e..b1ed1d2d91 100644 --- a/deploy/gcp/README.md +++ b/deploy/gcp/README.md @@ -72,7 +72,7 @@ please see the [official migration guide](https://helm.sh/docs/topics/v2_v3_migr > **NOTE**: Moving forward this chart will only be tested and maintained for Helm v3. -Below are example instructions for installing Helm v2. +Below are example instructions for installing Helm v2. ``` $ curl https://raw.githubusercontent.com/helm/helm/master/scripts/get | bash diff --git a/deploy/gke-marketplace-app/README.md b/deploy/gke-marketplace-app/README.md index 1d805c68d2..e99b9efbae 100644 --- a/deploy/gke-marketplace-app/README.md +++ b/deploy/gke-marketplace-app/README.md @@ -38,23 +38,23 @@ ## Description -This repository contains Google Kubernetes Engine(GKE) Marketplace Application for NVIDIA Triton Inference Server deployer. +This repository contains Google Kubernetes Engine(GKE) Marketplace Application for NVIDIA Triton Inference Server deployer. - Triton GKE deployer is a helm chart deployer recommended by GKE Marketplace - Triton GKE deployer deploys a GKE ingress which accepts public inference requests - Triton GKE deployer includes a horizontal pod autoscaler(HPA) which relies on [stack driver custom metrics adaptor](https://github.com/GoogleCloudPlatform/k8s-stackdriver/tree/master/custom-metrics-stackdriver-adapter) to monitor GPU duty cycle, and auto scale GPU nodes. - - This repo also contains a sample to generate BERT model with TensorRT and use Locust to experiment with GPU node autoscaling and monitor client latency/throughput. + - This repo also contains a sample to generate BERT model with TensorRT and use Locust to experiment with GPU node autoscaling and monitor client latency/throughput. ![Cloud Architecture Diagram](diagram.png) ## Prerequisites - - [Install Google Cloud SDK on your laptop/client workstation](https://cloud.google.com/sdk/docs/install), so that `gcloud` SDK cli interface could be run on the client and sign in with your GCP credentials. + - [Install Google Cloud SDK on your laptop/client workstation](https://cloud.google.com/sdk/docs/install), so that `gcloud` SDK cli interface could be run on the client and sign in with your GCP credentials. - In addition, user could leverage [Google Cloud shell](https://cloud.google.com/shell/docs/launching-cloud-shell). ## Demo Instruction -First, install this Triton GKE app to an existing GKE cluster with GPU node pool, Google Cloud Marketplace currently doesn't support auto creation of GPU clusters. User has to run following command to create a compatible cluster (gke version >=1.18.7) with GPU node pools, we recommend user to select T4 or A100(MIG) instances type and choose CPU ratio based on profiling of actual inference workflow. 
+First, install this Triton GKE app to an existing GKE cluster with GPU node pool, Google Cloud Marketplace currently doesn't support auto creation of GPU clusters. User has to run following command to create a compatible cluster (gke version >=1.18.7) with GPU node pools, we recommend user to select T4 or A100(MIG) instances type and choose CPU ratio based on profiling of actual inference workflow. Users need to follow these [instructions](https://cloud.google.com/kubernetes-engine/docs/how-to/kubernetes-service-accounts#creating_a_kubernetes_service_account) to create a kubernetes service account. In this example, we use `gke-test@k80-exploration.iam.gserviceaccount.com`. Make sure it has access to artifact registry and monitoring viewer. For example, to grant access to custom metrics which is required for HPA to work: ``` @@ -65,7 +65,7 @@ gcloud iam service-accounts add-iam-policy-binding --role \ kubectl annotate serviceaccount --namespace custom-metrics \ custom-metrics-stackdriver-adapter \ - iam.gke.io/gcp-service-account=@.iam.gserviceaccount.com + iam.gke.io/gcp-service-account=@.iam.gserviceaccount.com ``` Currently, GKE >= 1.18.7 only supported in GKE rapid channel, to find the latest version, please visit [GKE release notes](https://cloud.google.com/kubernetes-engine/docs/release-notes). @@ -104,10 +104,10 @@ gcloud container node-pools create accel \ --verbosity error # so that you can run kubectl locally to the cluster -gcloud container clusters get-credentials ${DEPLOYMENT_NAME} --project ${PROJECT_ID} --zone ${ZONE} +gcloud container clusters get-credentials ${DEPLOYMENT_NAME} --project ${PROJECT_ID} --zone ${ZONE} # deploy NVIDIA device plugin for GKE to prepare GPU nodes for driver install -kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml +kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml # make sure you can run kubectl locally to access the cluster kubectl create clusterrolebinding cluster-admin-binding --clusterrole cluster-admin --user "$(gcloud config get-value account)" @@ -119,7 +119,7 @@ kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/k8s-stack gcloud compute addresses create ingress-triton --global ``` -Creating a cluster and adding GPU nodes could take up-to 10 minutes. Please be patient after executing this command. GPU resources in GCP could be fully utilized, so please try a different zone in case compute resource cannot be allocated. After GKE cluster is running, run `kubectl get pods --all-namespaces` to make sure the client can access the cluster correctly: +Creating a cluster and adding GPU nodes could take up-to 10 minutes. Please be patient after executing this command. GPU resources in GCP could be fully utilized, so please try a different zone in case compute resource cannot be allocated. After GKE cluster is running, run `kubectl get pods --all-namespaces` to make sure the client can access the cluster correctly: If user would like to experiment with A100 MIG partitioned GPU in GKE, please create node pool with following command: ``` @@ -137,14 +137,14 @@ gcloud beta container node-pools create accel \ --verbosity error ``` -Please note that A100 MIG in GKE does not support GPU metrics yet, also Triton GPU Metrics is not compatiable with A100 MIG. 
Hence, please disable GPU metrics by unselect allowGPUMetrics while deploy Triton GKE app. Also for the same reason, this deployer doesn't support inference workfload auto-scaling on A100 MIG as well. +Please note that A100 MIG in GKE does not support GPU metrics yet, also Triton GPU Metrics is not compatible with A100 MIG. Hence, please disable GPU metrics by unselect allowGPUMetrics while deploy Triton GKE app. Also for the same reason, this deployer doesn't support inference workfload auto-scaling on A100 MIG as well. -Second, go to this [GKE Marketplace link](https://console.cloud.google.com/marketplace/details/nvidia-ngc-public/triton-inference-server) to deploy Triton application. +Second, go to this [GKE Marketplace link](https://console.cloud.google.com/marketplace/details/nvidia-ngc-public/triton-inference-server) to deploy Triton application. Users can leave everything as default if their models have already been tested/validated with Triton. They can provide a GCS path pointing to the model repository containing their models. By default, we provide a BERT large model optimized by TensorRT in a public demo GCS bucket that is compatible with the `xx.yy` release of Triton Server in `gs://triton_sample_models/xx_yy`. However, please take note of the following about this demo bucket: -- The TensorRT engine provided in the demo bucket is only compatible with Tesla T4 GPUs. +- The TensorRT engine provided in the demo bucket is only compatible with Tesla T4 GPUs. - This bucket is located in `us-central1`, so loading from this bucket into Triton in other regions may be affected. -- The first deployment of this Triton GKE application will be slower than consecutive runs because the image needs to be pulled into the GKE cluster. +- The first deployment of this Triton GKE application will be slower than consecutive runs because the image needs to be pulled into the GKE cluster. - You can find an example of how this model is generated and uploaded [here](trt-engine/README.md). Where is the version of NGC Triton container needed. @@ -167,7 +167,7 @@ If User selected deploy Triton to accept HTTP request, please launch [Locust](ht locust -f locustfile_bert.py -H http://${INGRESS_HOST}:${INGRESS_PORT} ``` -The client example push about ~650 QPS(Query per second) to Triton Server, and will trigger a auto scale of T4 GPU nodes (We recommend to use T4 and A100[MIG] for inference). From locust UI, we will observer a drop of latency mean and variance for the requests. At the end, after autoscaling, we see the latency stablized at ~200 ms, end to end from US client to europe server, which is excellent for a model that has 345 million parameters. Since for each node, we use 1T4 + n1-standard-4 instance, and it can handle ~450 QPS, with on-demand price, it is ($0.35+$0.19)=$0.54/hr, that translate to 3 million inference per dollar for BERT large model at batch size 1. Further more, with 3 year commitment price, hr rate is ($0.16+$0.08)=$0.24/hr, that translate to 6.75 million inference per dollar. +The client example push about ~650 QPS(Query per second) to Triton Server, and will trigger a auto scale of T4 GPU nodes (We recommend to use T4 and A100[MIG] for inference). From locust UI, we will observer a drop of latency mean and variance for the requests. At the end, after autoscaling, we see the latency stablized at ~200 ms, end to end from US client to europe server, which is excellent for a model that has 345 million parameters. 
Since for each node, we use 1T4 + n1-standard-4 instance, and it can handle ~450 QPS, with on-demand price, it is ($0.35+$0.19)=$0.54/hr, that translate to 3 million inference per dollar for BERT large model at batch size 1. Further more, with 3 year commitment price, hr rate is ($0.16+$0.08)=$0.24/hr, that translate to 6.75 million inference per dollar. ![Locust Client Chart](client.png) @@ -197,5 +197,5 @@ See the following resources to learn more about NVIDIA Triton Inference Server a ## Known Issues -- GKE one click cluster creation doesn't support GPU node pools at the moment, users have to mannually create a compatible (>=1.18.7) cluster and attach node pool (T4 and A100 MIG recommended) +- GKE one click cluster creation doesn't support GPU node pools at the moment, users have to manually create a compatible (>=1.18.7) cluster and attach node pool (T4 and A100 MIG recommended) - When Horizontal Pod Autoscaler(HPA) expand and all GPU node pool already utilized, GKE will request new GPU node and it can take between 4-7 minutes, it could be a long wait plus GPU driver install and image pulling. We recommend user to leverage multi-tier model serving and Triton's priority feature to create cushion for latency critical models, and allocate active standby GPU node for spike of requests. diff --git a/deploy/gke-marketplace-app/benchmark/README.md b/deploy/gke-marketplace-app/benchmark/README.md index c350b931dc..c9c502e1b0 100644 --- a/deploy/gke-marketplace-app/benchmark/README.md +++ b/deploy/gke-marketplace-app/benchmark/README.md @@ -49,30 +49,30 @@ We the place the model into a GCS with following structure, `config.pbtxt` was p ├── bert_base_trt_gpu_seqlen128 │ ├── 1 │ │ └── model.plan - │ └── config.pbtxt + │ └── config.pbtxt ├── bert_base_tf_gpu │ ├── 1 │ │ └── model.savedmodel - │ └── config.pbtxt + │ └── config.pbtxt ├── bert_base_tf_cpu │ ├── 1 │ │ └── model.savedmodel │ └── config.pbtxt - ├── bert_distill_tf_gpu + ├── bert_distill_tf_gpu │ ├── 1 │ │ └── model.savedmodel │ └── config.pbtxt └── bert_distill_tf_cpu ├── 1 │ └── model.savedmodel - └── config.pbtxt + └── config.pbtxt ``` -When deploy Triton GKE application, point the model repository to directory contains the structure above with actual models. +When deploy Triton GKE application, point the model repository to directory contains the structure above with actual models. ## Performance -We use perf analyzer of Triton to benchmark the performance of each model, the perf analyzer reside in another pod of the GKE cluster. +We use perf analyzer of Triton to benchmark the performance of each model, the perf analyzer reside in another pod of the GKE cluster. ```bash export INGRESS_HOST=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.status.loadBalancer.ingress[0].ip}') export INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.spec.ports[?(@.name=="http2")].port}') @@ -91,6 +91,5 @@ GPU TensorRT BERT BASE: latency: 50ms, throughput: 465 qps With n1-standard-96 priced at $4.56/hr and n1-standard-4 at $0.19/hr and T4 at $0.35/hr totaling $0.54/hr. While achieving a much lower latency, the TCO of BERT inference with TensorRT on T4 is over 163 times that of Distill BERT inference on n1-standard-96. 
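To reproduce the numbers above, a benchmarking run can be driven with `perf_analyzer` against the ingress exported earlier. The invocation below is a minimal sketch under those assumptions (model name taken from the repository layout shown above, default HTTP protocol, concurrency swept from 1 to 8); tune the batch size and concurrency range to your own latency budget.

```bash
# Sketch: benchmark the TensorRT BERT model through the GKE ingress.
# Assumes INGRESS_HOST/INGRESS_PORT are exported as shown earlier and that
# perf_analyzer is available (it ships in the tritonserver *-py3-sdk image).
perf_analyzer \
  -m bert_base_trt_gpu_seqlen128 \
  -u ${INGRESS_HOST}:${INGRESS_PORT} \
  -b 1 \
  --concurrency-range 1:8:2
```

As a rough sanity check on the cost figures quoted above, about 465 QPS on a $0.54/hr T4 node works out to 465 × 3600 / 0.54 ≈ 3.1 million inferences per dollar, consistent with the TCO comparison in this section.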
- - \ No newline at end of file + diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_gpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_gpu/config.pbtxt index b46aa21f5e..f369db917f 100644 --- a/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_gpu/config.pbtxt +++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_gpu/config.pbtxt @@ -24,7 +24,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -max_batch_size: 4 +max_batch_size: 4 dynamic_batching { preferred_batch_size: 4 max_queue_delay_microseconds: 200000 diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu/config.pbtxt index 9cc4dd4551..f3b83d5725 100644 --- a/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu/config.pbtxt +++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu/config.pbtxt @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. platform: "tensorrt_plan" -max_batch_size: 4 +max_batch_size: 4 dynamic_batching { preferred_batch_size: 4 max_queue_delay_microseconds: 200000 diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_cpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_cpu/config.pbtxt index 9b236c9092..3bfccb5c45 100644 --- a/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_cpu/config.pbtxt +++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_cpu/config.pbtxt @@ -24,7 +24,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -max_batch_size: 4 +max_batch_size: 4 dynamic_batching { preferred_batch_size: 1 max_queue_delay_microseconds: 2000000 diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_gpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_gpu/config.pbtxt index b46aa21f5e..f369db917f 100644 --- a/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_gpu/config.pbtxt +++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_gpu/config.pbtxt @@ -24,7 +24,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-max_batch_size: 4 +max_batch_size: 4 dynamic_batching { preferred_batch_size: 4 max_queue_delay_microseconds: 200000 diff --git a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/perf_query.sh b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/perf_query.sh old mode 100644 new mode 100755 diff --git a/deploy/gke-marketplace-app/client-sample/bert_request.json b/deploy/gke-marketplace-app/client-sample/bert_request.json index b918815147..ce4b956db6 100644 --- a/deploy/gke-marketplace-app/client-sample/bert_request.json +++ b/deploy/gke-marketplace-app/client-sample/bert_request.json @@ -4,19 +4,19 @@ "shape": [1, 128], "datatype": "INT32", "parameters": {}, - "data": [101, 2054, 2003, 23435, 5339, 1029, 102, 23435, 5339, 2003, 1037, 2152, 2836, 2784, 4083, 28937, 4132, 2008, 18058, 2659, 2397, 9407, 1998, 2152, 2083, 18780, 2005, 18726, 2107, 2004, 16755, 2545, 1010, 4613, 1998, 3746, 1013, 2678, 2006, 1050, 17258, 2401, 14246, 2271, 1012, 2009, 2950, 11968, 8043, 2015, 2000, 12324, 4275, 1010, 1998, 13354, 7076, 2000, 2490, 3117, 23092, 1998, 9014, 2077, 11243, 20600, 2015, 2005, 28937, 1012, 2651, 1050, 17258, 2401, 2003, 2330, 1011, 14768, 6129, 11968, 8043, 2015, 1998, 13354, 7076, 1999, 23435, 5339, 2061, 2008, 1996, 2784, 4083, 2451, 2064, 7661, 4697, 1998, 7949, 2122, 6177, 2000, 2202, 5056, 1997, 3928, 23435, 5339, 20600, 2015, 2005, 2115, 18726, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + "data": [101, 2054, 2003, 23435, 5339, 1029, 102, 23435, 5339, 2003, 1037, 2152, 2836, 2784, 4083, 28937, 4132, 2008, 18058, 2659, 2397, 9407, 1998, 2152, 2083, 18780, 2005, 18726, 2107, 2004, 16755, 2545, 1010, 4613, 1998, 3746, 1013, 2678, 2006, 1050, 17258, 2401, 14246, 2271, 1012, 2009, 2950, 11968, 8043, 2015, 2000, 12324, 4275, 1010, 1998, 13354, 7076, 2000, 2490, 3117, 23092, 1998, 9014, 2077, 11243, 20600, 2015, 2005, 28937, 1012, 2651, 1050, 17258, 2401, 2003, 2330, 1011, 14768, 6129, 11968, 8043, 2015, 1998, 13354, 7076, 1999, 23435, 5339, 2061, 2008, 1996, 2784, 4083, 2451, 2064, 7661, 4697, 1998, 7949, 2122, 6177, 2000, 2202, 5056, 1997, 3928, 23435, 5339, 20600, 2015, 2005, 2115, 18726, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] }, { "name": "input_mask", "shape": [1, 128], "datatype": "INT32", "parameters": {}, - "data": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + "data": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] }, { "name": "segment_ids", "shape": [1, 128], "datatype": "INT32", "parameters": {}, - "data": [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0] + "data": [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] }], "outputs": [{ "name": "cls_squad_logits", diff --git a/deploy/gke-marketplace-app/client-sample/locustfile_bert.py b/deploy/gke-marketplace-app/client-sample/locustfile_bert.py old mode 100644 new mode 100755 index 2e2ac2f721..aae8c69f43 --- a/deploy/gke-marketplace-app/client-sample/locustfile_bert.py +++ b/deploy/gke-marketplace-app/client-sample/locustfile_bert.py @@ -1,4 +1,6 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,18 +26,18 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from locust import HttpUser, task, between -from locust import LoadTestShape import json +from locust import HttpUser, LoadTestShape, between, task + class ProfileLoad(LoadTestShape): - ''' + """ This load profile starts at 0 and steps up by step_users increments every tick, up to target_users. After reaching target_user level, load will stay at target_user level until time_limit is reached. - ''' + """ target_users = 1000 step_users = 50 # ramp users each step @@ -63,8 +65,7 @@ def bert(self): response = self.client.post(self.url1, data=json.dumps(self.data)) def on_start(self): - with open('bert_request.json') as f: + with open("bert_request.json") as f: self.data = json.load(f) - self.url1 = '{}/v2/models/{}/infer'.format(self.environment.host, - 'bert') + self.url1 = "{}/v2/models/{}/infer".format(self.environment.host, "bert") diff --git a/deploy/gke-marketplace-app/client-sample/perf_analyzer_grpc.sh b/deploy/gke-marketplace-app/client-sample/perf_analyzer_grpc.sh old mode 100644 new mode 100755 diff --git a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh old mode 100644 new mode 100755 index 6cf5319b8a..64292409c8 --- a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh +++ b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh @@ -1,4 +1,5 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/bin/bash +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/application.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/application.yaml index 2f08cf07d7..5658aea801 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/application.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/application.yaml @@ -41,14 +41,14 @@ spec: type: Triton version: "{{ .Values.publishedVersion }}" description: |- - Triton Inference Server provides a cloud and edge inferencing solution - optimized for both CPUs and GPUs. Triton supports an HTTP/REST and GRPC - protocol that allows remote clients to request inferencing for any model + Triton Inference Server provides a cloud and edge inferencing solution + optimized for both CPUs and GPUs. Triton supports an HTTP/REST and GRPC + protocol that allows remote clients to request inferencing for any model being managed by the server. notes: |- - Send request to Triton server by using IP address "ingress-triton", + Send request to Triton server by using IP address "ingress-triton", send to IP:80/v2/models/{}/infer Links: diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/deployment.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/deployment.yaml index 6a0b77b4ea..8bf21d9684 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/deployment.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/deployment.yaml @@ -33,7 +33,7 @@ metadata: app: {{ template "triton-inference-server.name" . }} chart: {{ template "triton-inference-server.chart" . }} release: {{ .Release.Name }} - heritage: {{ .Release.Service }} + heritage: {{ .Release.Service }} spec: replicas: {{ .Values.initReplicaCount }} selector: diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/service.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/service.yaml index b919c55f1f..5562fa76b5 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/service.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/service.yaml @@ -35,11 +35,11 @@ metadata: app: {{ template "triton-inference-server.name" . }} chart: {{ template "triton-inference-server.chart" . 
}} release: {{ .Release.Name }} - heritage: {{ .Release.Service }} + heritage: {{ .Release.Service }} spec: type: {{ .Values.service.type }} ports: - - port: 8000 + - port: 8000 targetPort: http name: http-inference-server - port: 8001 diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml index 2413b17a82..6a7dc39772 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml @@ -49,7 +49,7 @@ image: allowGPUMetrics: True service: - type: NodePort + type: NodePort deployment: livenessProbe: diff --git a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml index 08086cc2d7..1a51f17a8f 100644 --- a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml @@ -66,26 +66,26 @@ properties: type: string x-google-marketplace: type: NAMESPACE - initReplicaCount: + initReplicaCount: title: Initial number of Triton pod instances to deploy. type: integer default: 1 - minReplicaCount: + minReplicaCount: title: Minimum number of Triton pod instances in the deployment for autoscaling. type: integer default: 1 - maxReplicaCount: + maxReplicaCount: title: Maximum number of Triton pod instances in the deployment for autoscaling. type: integer default: 3 - tritonProtocol: + tritonProtocol: title: Request protocol to send data to Triton, choose from gRPC and HTTP. type: string default: HTTP - HPATargetAverageValue: - title: HPA autoscaling target, GKE currently support Duty Cycle which is GPU utilization, when target is reached, Triton Server service will create another pod instance. We ask user to analyze model inference to associate appropriate GPU metric target based on latency requirement. We also recommend to leave some room to mitigate transient load effect. For user interested in customizing autoscaling metrics, we recommends GPU Power (Percentage of Power), Queue time or SLA measurements such as latency. + HPATargetAverageValue: + title: HPA autoscaling target, GKE currently support Duty Cycle which is GPU utilization, when target is reached, Triton Server service will create another pod instance. We ask user to analyze model inference to associate appropriate GPU metric target based on latency requirement. We also recommend to leave some room to mitigate transient load effect. For user interested in customizing autoscaling metrics, we recommends GPU Power (Percentage of Power), Queue time or SLA measurements such as latency. type: integer - default: 85 + default: 85 modelRepositoryPath: type: string title: Bucket where models are stored. Please make sure the user/service account to create the GKE app has permission to this GCS bucket. Read Triton documentation on configs and formatting details, supporting TensorRT, TensorFlow, Pytorch, Onnx ... etc. @@ -97,7 +97,7 @@ properties: image.logVerboseLevel: type: integer title: Set verbose logging level. Zero (0) disables verbose logging and values >= 1 enable verbose logging, this is helpful when user unsure if the model is compatible with Triton or for general debug. - default: 0 + default: 0 image.strictModelConfig: type: boolean title: Leave this unchecked by default. 
When strictModelConfig is not checked(False), Triton will try to infer the config file from model file, when checked(True), user need to provide config.pbtxt in model repository. @@ -105,14 +105,14 @@ properties: image.allowGPUMetrics: type: boolean title: Select by default. When use A100 MIG, unselect to disable GPU Memory metrics reported by Triton, as current GPU metrics not support on A100 MIG. - default: True + default: True istioEnabled: type: boolean x-google-marketplace: type: ISTIO_ENABLED default: True - + required: - name - namespace diff --git a/deploy/gke-marketplace-app/server-deployer/schema.yaml b/deploy/gke-marketplace-app/server-deployer/schema.yaml index 4d9c95c2da..4da79a389a 100644 --- a/deploy/gke-marketplace-app/server-deployer/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/schema.yaml @@ -66,26 +66,26 @@ properties: type: string x-google-marketplace: type: NAMESPACE - initReplicaCount: + initReplicaCount: title: Initial number of Triton pod instances to deploy. type: integer default: 1 - minReplicaCount: + minReplicaCount: title: Minimum number of Triton pod instances in the deployment for autoscaling. type: integer default: 1 - maxReplicaCount: + maxReplicaCount: title: Maximum number of Triton pod instances in the deployment for autoscaling. type: integer default: 3 - tritonProtocol: + tritonProtocol: title: Request protocol to send data to Triton, choose from gRPC and HTTP. type: string default: HTTP - HPATargetAverageValue: - title: HPA autoscaling target, GKE currently support Duty Cycle which is GPU utilization, when target is reached, Triton Server service will create another pod instance. We ask user to analyze model inference to associate appropriate GPU metric target based on latency requirement. We also recommend to leave some room to mitigate transient load effect. For user interested in customizing autoscaling metrics, we recommends GPU Power (Percentage of Power), Queue time or SLA measurements such as latency. + HPATargetAverageValue: + title: HPA autoscaling target, GKE currently support Duty Cycle which is GPU utilization, when target is reached, Triton Server service will create another pod instance. We ask user to analyze model inference to associate appropriate GPU metric target based on latency requirement. We also recommend to leave some room to mitigate transient load effect. For user interested in customizing autoscaling metrics, we recommends GPU Power (Percentage of Power), Queue time or SLA measurements such as latency. type: integer - default: 85 + default: 85 modelRepositoryPath: type: string title: Bucket where models are stored. Please make sure the user/service account to create the GKE app has permission to this GCS bucket. Read Triton documentation on configs and formatting details, supporting TensorRT, TensorFlow, Pytorch, Onnx ... etc. @@ -97,7 +97,7 @@ properties: image.logVerboseLevel: type: integer title: Set verbose logging level. Zero (0) disables verbose logging and values >= 1 enable verbose logging, this is helpful when user unsure if the model is compatible with Triton or for general debug. - default: 0 + default: 0 image.strictModelConfig: type: boolean title: Leave this unchecked by default. When strictModelConfig is not checked(False), Triton will try to infer the config file from model file, when checked(True), user need to provide config.pbtxt in model repository. @@ -105,14 +105,14 @@ properties: image.allowGPUMetrics: type: boolean title: Select by default. 
When use A100 MIG, unselect to disable GPU Memory metrics reported by Triton, as current GPU metrics not support on A100 MIG. - default: True + default: True istioEnabled: type: boolean x-google-marketplace: type: ISTIO_ENABLED default: True - + required: - name - namespace diff --git a/deploy/gke-marketplace-app/trt-engine/README.md b/deploy/gke-marketplace-app/trt-engine/README.md index b4bade1e6b..2a67879f51 100644 --- a/deploy/gke-marketplace-app/trt-engine/README.md +++ b/deploy/gke-marketplace-app/trt-engine/README.md @@ -33,13 +33,13 @@ ``` docker run --gpus all -it --network host \ --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ - -v ~:/scripts nvcr.io/nvidia/tensorrt:23.06-py3 + -v ~:/scripts nvcr.io/nvidia/tensorrt:23.06-py3 -pip install onnx six torch tf2onnx tensorflow +pip install onnx six torch tf2onnx tensorflow git clone -b main https://github.com/NVIDIA/TensorRT.git cd TensorRT -git submodule update --init --recursive +git submodule update --init --recursive export TRT_OSSPATH=/workspace/TensorRT export TRT_LIBPATH=/lib/x86_64-linux-gnu @@ -49,15 +49,15 @@ pushd /usr/local/bin && wget https://ngc.nvidia.com/downloads/ngccli_cat_linux.z popd cd /workspace/TensorRT/demo/BERT -bash ./scripts/download_squad.sh +bash ./scripts/download_squad.sh bash ./scripts/download_model.sh large 128 # bash ./scripts/download_model.sh large 384 mkdir -p engines -python3 builder.py -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt -o engines/bert_large_int8_bs1_s128.engine -b 1 -s 128 -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt --int8 --fp16 --strict --calib-num 1 -iln -imh +python3 builder.py -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt -o engines/bert_large_int8_bs1_s128.engine -b 1 -s 128 -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt --int8 --fp16 --strict --calib-num 1 -iln -imh gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/23_02/bert/1/model.plan ``` -For each Triton upgrade, container version used to genrate the model, and the model path in GCS `gs://triton_sample_models/23_02/` should be updated accordingly with the correct version. +For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/23_02/` should be updated accordingly with the correct version. diff --git a/deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/config.pbtxt b/deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/config.pbtxt old mode 100755 new mode 100644 diff --git a/deploy/mlflow-triton-plugin/mlflow_triton/__init__.py b/deploy/mlflow-triton-plugin/mlflow_triton/__init__.py old mode 100644 new mode 100755 index 6eff4167d0..0b73b537d4 --- a/deploy/mlflow-triton-plugin/mlflow_triton/__init__.py +++ b/deploy/mlflow-triton-plugin/mlflow_triton/__init__.py @@ -1,4 +1,6 @@ -# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -22,4 +24,4 @@ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/deploy/mlflow-triton-plugin/mlflow_triton/config.py b/deploy/mlflow-triton-plugin/mlflow_triton/config.py old mode 100644 new mode 100755 index 484b026227..0a381fd407 --- a/deploy/mlflow-triton-plugin/mlflow_triton/config.py +++ b/deploy/mlflow-triton-plugin/mlflow_triton/config.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,38 +28,40 @@ import os import re from collections import namedtuple + from mlflow.exceptions import MlflowException class Config(dict): - def __init__(self): super().__init__() - self['triton_url'] = os.environ.get('TRITON_URL') - self['triton_model_repo'] = os.environ.get('TRITON_MODEL_REPO') + self["triton_url"] = os.environ.get("TRITON_URL") + self["triton_model_repo"] = os.environ.get("TRITON_MODEL_REPO") - if self['triton_model_repo'].startswith('s3://'): + if self["triton_model_repo"].startswith("s3://"): self.s3_regex = re.compile( - 's3://(http://|https://|)([0-9a-zA-Z\\-.]+):([0-9]+)/' - '([0-9a-z.\\-]+)(((/[0-9a-zA-Z.\\-_]+)*)?)') + "s3://(http://|https://|)([0-9a-zA-Z\\-.]+):([0-9]+)/" + "([0-9a-z.\\-]+)(((/[0-9a-zA-Z.\\-_]+)*)?)" + ) - uri = self.parse_path(self['triton_model_repo']) + uri = self.parse_path(self["triton_model_repo"]) if uri.protocol == "https://": protocol = "https://" else: protocol = "http://" endpoint_url = None if uri.host_name != "" and uri.host_port != "": - endpoint_url = '{}{}:{}'.format(protocol, uri.host_name, - uri.host_port) + endpoint_url = "{}{}:{}".format(protocol, uri.host_name, uri.host_port) import boto3 + # boto3 handles AWS credentials - self['s3'] = boto3.client('s3', endpoint_url=endpoint_url) - self['s3_bucket'] = uri.bucket - self['s3_prefix'] = uri.prefix - self['triton_model_repo'] = 's3://{}'.format( - os.path.join(uri.bucket, uri.prefix)) + self["s3"] = boto3.client("s3", endpoint_url=endpoint_url) + self["s3_bucket"] = uri.bucket + self["s3_prefix"] = uri.prefix + self["triton_model_repo"] = "s3://{}".format( + os.path.join(uri.bucket, uri.prefix) + ) def parse_path(self, path): # Cleanup extra slashes @@ -66,10 +70,11 @@ def parse_path(self, path): # Get the bucket name and the object path. 
Return error if path is malformed match = self.s3_regex.fullmatch(clean_path) S3URI = namedtuple( - "S3URI", ["protocol", "host_name", "host_port", "bucket", "prefix"]) + "S3URI", ["protocol", "host_name", "host_port", "bucket", "prefix"] + ) if match: uri = S3URI(*match.group(1, 2, 3, 4, 5)) - if uri.prefix and uri.prefix[0] == '/': + if uri.prefix and uri.prefix[0] == "/": uri = uri._replace(prefix=uri.prefix[1:]) else: bucket_start = clean_path.find("s3://") + len("s3://") @@ -78,7 +83,7 @@ def parse_path(self, path): # If there isn't a slash, the address has only the bucket if bucket_end > bucket_start: bucket = clean_path[bucket_start:bucket_end] - prefix = clean_path[bucket_end + 1:] + prefix = clean_path[bucket_end + 1 :] else: bucket = clean_path[bucket_start:] prefix = "" @@ -94,8 +99,8 @@ def clean_path(self, s3_path): start = s3_path.find("s3://") path = "" if start != -1: - path = s3_path[start + len("s3://"):] - clean_path = ("s3://") + path = s3_path[start + len("s3://") :] + clean_path = "s3://" else: path = s3_path clean_path = "" @@ -103,29 +108,29 @@ def clean_path(self, s3_path): # Must handle paths with https:// or http:// prefix https_start = path.find("https://") if https_start != -1: - path = path[https_start + len("https://"):] + path = path[https_start + len("https://") :] clean_path += "https://" else: http_start = path.find("http://") if http_start != -1: - path = path[http_start + len("http://"):] + path = path[http_start + len("http://") :] clean_path += "http://" # Remove trailing slashes - rtrim_length = len(path.rstrip('/')) + rtrim_length = len(path.rstrip("/")) if rtrim_length == 0: raise MlflowException("Invalid bucket name: '" + path + "'") # Remove leading slashes - ltrim_length = len(path) - len(path.lstrip('/')) + ltrim_length = len(path) - len(path.lstrip("/")) if ltrim_length == len(path): raise MlflowException("Invalid bucket name: '" + path + "'") # Remove extra internal slashes - true_path = path[ltrim_length:rtrim_length + 1] + true_path = path[ltrim_length : rtrim_length + 1] previous_slash = False for i in range(len(true_path)): - if true_path[i] == '/': + if true_path[i] == "/": if not previous_slash: clean_path += true_path[i] previous_slash = True diff --git a/deploy/mlflow-triton-plugin/mlflow_triton/deployments.py b/deploy/mlflow-triton-plugin/mlflow_triton/deployments.py old mode 100644 new mode 100755 index 0a22ba6c88..fb8e72c286 --- a/deploy/mlflow-triton-plugin/mlflow_triton/deployments.py +++ b/deploy/mlflow-triton-plugin/mlflow_triton/deployments.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -23,25 +25,27 @@ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
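Not part of the patch: a small standalone demonstration of what the S3 regular expression reformatted in `mlflow_triton/config.py` above is meant to extract from a `TRITON_MODEL_REPO` value. The endpoint, port, bucket, and prefix below are made-up examples; in the plugin itself, `clean_path()` first normalizes extra slashes and the leading `/` is later stripped from the prefix.

```python
# Demonstration only: the same pattern and namedtuple used by
# mlflow_triton.config.Config.parse_path(), applied to a made-up URI.
import re
from collections import namedtuple

S3_REGEX = re.compile(
    "s3://(http://|https://|)([0-9a-zA-Z\\-.]+):([0-9]+)/"
    "([0-9a-z.\\-]+)(((/[0-9a-zA-Z.\\-_]+)*)?)"
)
S3URI = namedtuple("S3URI", ["protocol", "host_name", "host_port", "bucket", "prefix"])

match = S3_REGEX.fullmatch("s3://https://minio.example.com:9000/triton-models/bert")
uri = S3URI(*match.group(1, 2, 3, 4, 5))
print(uri)
# S3URI(protocol='https://', host_name='minio.example.com',
#       host_port='9000', bucket='triton-models', prefix='/bert')
```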
-import os import ast -import shutil +import glob +import json import logging +import os +import shutil from pathlib import Path -from mlflow_triton.config import Config +import numpy as np +import pandas as pd import tritonclient.http as tritonhttpclient -from tritonclient.utils import InferenceServerException, np_to_triton_dtype, triton_to_np_dtype - from mlflow.deployments import BaseDeploymentClient -from mlflow.tracking.artifact_utils import _download_artifact_from_uri from mlflow.exceptions import MlflowException from mlflow.models import Model - -import glob -import json -import pandas as pd -import numpy as np +from mlflow.tracking.artifact_utils import _download_artifact_from_uri +from mlflow_triton.config import Config +from tritonclient.utils import ( + InferenceServerException, + np_to_triton_dtype, + triton_to_np_dtype, +) logger = logging.getLogger(__name__) @@ -49,7 +53,6 @@ class TritonPlugin(BaseDeploymentClient): - def __init__(self, uri): """ Initializes the deployment plugin, sets the triton model repo @@ -58,16 +61,17 @@ def __init__(self, uri): self.server_config = Config() triton_url, self.triton_model_repo = self._get_triton_server_config() # need to add other flavors - self.supported_flavors = ['triton', 'onnx'] + self.supported_flavors = ["triton", "onnx"] # URL cleaning for constructing Triton client ssl = False if triton_url.startswith("http://"): - triton_url = triton_url[len("http://"):] + triton_url = triton_url[len("http://") :] elif triton_url.startswith("https://"): - triton_url = triton_url[len("https://"):] + triton_url = triton_url[len("https://") :] ssl = True self.triton_client = tritonhttpclient.InferenceServerClient( - url=triton_url, ssl=ssl) + url=triton_url, ssl=ssl + ) def _get_triton_server_config(self): triton_url = "localhost:8000" @@ -76,8 +80,7 @@ def _get_triton_server_config(self): logger.info("Triton url = {}".format(triton_url)) if not self.server_config["triton_model_repo"]: - raise Exception( - "Check that environment variable TRITON_MODEL_REPO is set") + raise Exception("Check that environment variable TRITON_MODEL_REPO is set") triton_model_repo = self.server_config["triton_model_repo"] logger.info("Triton model repo = {}".format(triton_model_repo)) @@ -100,7 +103,8 @@ def create_deployment(self, name, model_uri, flavor=None, config=None): if self._model_exists(name): raise Exception( "Unable to create deployment for name %s because it already exists." - % (name)) + % (name) + ) # Get the path of the artifact path = Path(_download_artifact_from_uri(model_uri)) @@ -126,7 +130,8 @@ def delete_deployment(self, name): if not self._model_exists(name): raise Exception( "Unable to delete deployment for name %s because it does not exist." - % (name)) + % (name) + ) try: self.triton_client.unload_model(name) @@ -156,7 +161,8 @@ def update_deployment(self, name, model_uri=None, flavor=None, config=None): if not self._model_exists(name): raise Exception( "Unable to update deployment for name %s because it does not exist." 
- % (name)) + % (name) + ) self.get_deployment(name) @@ -183,25 +189,32 @@ def list_deployments(self): resp = self.triton_client.get_model_repository_index() actives = [] for d in resp: - if 'state' in d and d['state'] == 'READY': - mlflow_meta_path = os.path.join(self.triton_model_repo, - d['name'], - _MLFLOW_META_FILENAME) - if 's3' in self.server_config: + if "state" in d and d["state"] == "READY": + mlflow_meta_path = os.path.join( + self.triton_model_repo, d["name"], _MLFLOW_META_FILENAME + ) + if "s3" in self.server_config: meta_dict = ast.literal_eval( - self.server_config['s3'].get_object( - Bucket=self.server_config['s3_bucket'], - Key=os.path.join(self.server_config['s3_prefix'], - d['name'], _MLFLOW_META_FILENAME), - )['Body'].read().decode('utf-8')) + self.server_config["s3"] + .get_object( + Bucket=self.server_config["s3_bucket"], + Key=os.path.join( + self.server_config["s3_prefix"], + d["name"], + _MLFLOW_META_FILENAME, + ), + )["Body"] + .read() + .decode("utf-8") + ) elif os.path.isfile(mlflow_meta_path): - meta_dict = self._get_mlflow_meta_dict(d['name']) + meta_dict = self._get_mlflow_meta_dict(d["name"]) else: continue - d['triton_model_path'] = meta_dict['triton_model_path'] - d['mlflow_model_uri'] = meta_dict['mlflow_model_uri'] - d['flavor'] = meta_dict['flavor'] + d["triton_model_path"] = meta_dict["triton_model_path"] + d["mlflow_model_uri"] = meta_dict["mlflow_model_uri"] + d["flavor"] = meta_dict["flavor"] actives.append(d) return actives @@ -217,9 +230,9 @@ def get_deployment(self, name): """ deployments = self.list_deployments() for d in deployments: - if d['name'] == name: + if d["name"] == name: return d - raise ValueError(f'Unable to get deployment with name {name}') + raise ValueError(f"Unable to get deployment with name {name}") def predict(self, deployment_name, df): single_input_np = None @@ -231,16 +244,13 @@ def predict(self, deployment_name, df): raise MlflowException("Unnamed input is not currently supported") else: if isinstance(df, pd.DataFrame): - model_metadata = self.triton_client.get_model_metadata( - deployment_name) + model_metadata = self.triton_client.get_model_metadata(deployment_name) input_dtype = {} for input in model_metadata["inputs"]: - input_dtype[input["name"]] = triton_to_np_dtype( - input["datatype"]) + input_dtype[input["name"]] = triton_to_np_dtype(input["datatype"]) # Sanity check if len(df.columns) != 1: - raise MlflowException( - "Expect Pandas DataFrame has only 1 column") + raise MlflowException("Expect Pandas DataFrame has only 1 column") col = df.columns[0] for row in df.index: val = df[col][row] @@ -249,21 +259,24 @@ def predict(self, deployment_name, df): val = np.array(val, dtype=input_dtype[row]) inputs.append( tritonhttpclient.InferInput( - row, val.shape, np_to_triton_dtype(val.dtype))) + row, val.shape, np_to_triton_dtype(val.dtype) + ) + ) inputs[-1].set_data_from_numpy(val) else: for key, val in df.items(): inputs.append( tritonhttpclient.InferInput( - key, val.shape, np_to_triton_dtype(val.dtype))) + key, val.shape, np_to_triton_dtype(val.dtype) + ) + ) inputs[-1].set_data_from_numpy(val) try: - resp = self.triton_client.infer(model_name=deployment_name, - inputs=inputs) + resp = self.triton_client.infer(model_name=deployment_name, inputs=inputs) res = {} - for output in resp.get_response()['outputs']: - res[output['name']] = resp.as_numpy(output['name']) + for output in resp.get_response()["outputs"]: + res[output["name"]] = resp.as_numpy(output["name"]) return pd.DataFrame.from_dict({"outputs": res}) except 
InferenceServerException as ex: raise MlflowException(str(ex)) @@ -271,99 +284,105 @@ def predict(self, deployment_name, df): def _generate_mlflow_meta_file(self, name, flavor, model_uri): triton_deployment_dir = os.path.join(self.triton_model_repo, name) meta_dict = { - 'name': name, - 'triton_model_path': triton_deployment_dir, - 'mlflow_model_uri': model_uri, - 'flavor': flavor + "name": name, + "triton_model_path": triton_deployment_dir, + "mlflow_model_uri": model_uri, + "flavor": flavor, } - if 's3' in self.server_config: - self.server_config['s3'].put_object( - Body=json.dumps(meta_dict, indent=4).encode('utf-8'), + if "s3" in self.server_config: + self.server_config["s3"].put_object( + Body=json.dumps(meta_dict, indent=4).encode("utf-8"), Bucket=self.server_config["s3_bucket"], - Key=os.path.join(self.server_config['s3_prefix'], name, - _MLFLOW_META_FILENAME), + Key=os.path.join( + self.server_config["s3_prefix"], name, _MLFLOW_META_FILENAME + ), ) else: with open( - os.path.join(triton_deployment_dir, _MLFLOW_META_FILENAME), - "w") as outfile: + os.path.join(triton_deployment_dir, _MLFLOW_META_FILENAME), "w" + ) as outfile: json.dump(meta_dict, outfile, indent=4) print("Saved", _MLFLOW_META_FILENAME, "to", triton_deployment_dir) def _get_mlflow_meta_dict(self, name): - mlflow_meta_path = os.path.join(self.triton_model_repo, name, - _MLFLOW_META_FILENAME) + mlflow_meta_path = os.path.join( + self.triton_model_repo, name, _MLFLOW_META_FILENAME + ) - if 's3' in self.server_config: + if "s3" in self.server_config: mlflow_meta_dict = ast.literal_eval( - self.server_config['s3'].get_object( - Bucket=self.server_config['s3_bucket'], - Key=os.path.join(self.server_config['s3_prefix'], name, - _MLFLOW_META_FILENAME), - )['Body'].read().decode('utf-8')) + self.server_config["s3"] + .get_object( + Bucket=self.server_config["s3_bucket"], + Key=os.path.join( + self.server_config["s3_prefix"], name, _MLFLOW_META_FILENAME + ), + )["Body"] + .read() + .decode("utf-8") + ) else: - with open(mlflow_meta_path, 'r') as metafile: + with open(mlflow_meta_path, "r") as metafile: mlflow_meta_dict = json.load(metafile) return mlflow_meta_dict def _get_copy_paths(self, artifact_path, name, flavor): copy_paths = {} - copy_paths['model_path'] = {} + copy_paths["model_path"] = {} triton_deployment_dir = os.path.join(self.triton_model_repo, name) if flavor == "triton": # When flavor is 'triton', the model is assumed to be preconfigured # with proper model versions and version strategy, which may differ from # the versioning in MLFlow for file in artifact_path.iterdir(): - if file.name not in ['MLmodel', 'conda.yaml']: - copy_paths['model_path']['from'] = file - copy_paths['model_path']['to'] = triton_deployment_dir + if file.name not in ["MLmodel", "conda.yaml"]: + copy_paths["model_path"]["from"] = file + copy_paths["model_path"]["to"] = triton_deployment_dir elif flavor == "onnx": # Look for model file via MLModel metadata or iterating dir model_file = None config_file = None for file in artifact_path.iterdir(): - if file.name == 'MLmodel': + if file.name == "MLmodel": mlmodel = Model.load(file) onnx_meta_data = mlmodel.flavors.get("onnx", None) if onnx_meta_data is not None: - model_file = onnx_meta_data.get('data', None) - elif file.name == 'config.pbtxt': + model_file = onnx_meta_data.get("data", None) + elif file.name == "config.pbtxt": config_file = file.name - copy_paths['config_path'] = {} - elif file.suffix == '.txt' and file.stem != 'requirements': - copy_paths[file.stem] = { - 'from': file, - 'to': 
triton_deployment_dir - } + copy_paths["config_path"] = {} + elif file.suffix == ".txt" and file.stem != "requirements": + copy_paths[file.stem] = {"from": file, "to": triton_deployment_dir} if model_file is None: for file in artifact_path.iterdir(): - if file.suffix == '.onnx': + if file.suffix == ".onnx": model_file = file.name break - copy_paths['model_path']['from'] = os.path.join( - artifact_path, model_file) - copy_paths['model_path']['to'] = os.path.join( - triton_deployment_dir, "1") + copy_paths["model_path"]["from"] = os.path.join(artifact_path, model_file) + copy_paths["model_path"]["to"] = os.path.join(triton_deployment_dir, "1") if config_file is not None: - copy_paths['config_path']['from'] = os.path.join( - artifact_path, config_file) - copy_paths['config_path']['to'] = triton_deployment_dir + copy_paths["config_path"]["from"] = os.path.join( + artifact_path, config_file + ) + copy_paths["config_path"]["to"] = triton_deployment_dir else: # Make sure the directory has been created for config.pbtxt os.makedirs(triton_deployment_dir, exist_ok=True) # Provide a minimum config file so Triton knows what backend # should be performing the auto-completion - config = ''' + config = """ backend: "onnxruntime" default_model_filename: "{}" -'''.format(model_file) - with open(os.path.join(triton_deployment_dir, "config.pbtxt"), - "w") as cfile: +""".format( + model_file + ) + with open( + os.path.join(triton_deployment_dir, "config.pbtxt"), "w" + ) as cfile: cfile.write(config) return copy_paths @@ -379,52 +398,51 @@ def _walk(self, path): elif os.path.isdir(path): return list(os.walk(path)) else: - raise Exception( - f'path: {path} is not a valid path to a file or dir.') + raise Exception(f"path: {path} is not a valid path to a file or dir.") def _copy_files_to_triton_repo(self, artifact_path, name, flavor): copy_paths = self._get_copy_paths(artifact_path, name, flavor) for key in copy_paths: - if 's3' in self.server_config: + if "s3" in self.server_config: # copy model dir to s3 recursively - for root, dirs, files in self._walk(copy_paths[key]['from']): + for root, dirs, files in self._walk(copy_paths[key]["from"]): for filename in files: local_path = os.path.join(root, filename) if flavor == "onnx": s3_path = os.path.join( - self.server_config['s3_prefix'], - copy_paths[key]['to'].replace( - self.server_config['triton_model_repo'], - '').strip('/'), + self.server_config["s3_prefix"], + copy_paths[key]["to"] + .replace(self.server_config["triton_model_repo"], "") + .strip("/"), filename, ) elif flavor == "triton": rel_path = os.path.relpath( local_path, - copy_paths[key]['from'], + copy_paths[key]["from"], ) s3_path = os.path.join( - self.server_config['s3_prefix'], name, rel_path) + self.server_config["s3_prefix"], name, rel_path + ) - self.server_config['s3'].upload_file( + self.server_config["s3"].upload_file( local_path, - self.server_config['s3_bucket'], + self.server_config["s3_bucket"], s3_path, ) else: - if os.path.isdir(copy_paths[key]['from']): - if os.path.isdir(copy_paths[key]['to']): - shutil.rmtree(copy_paths[key]['to']) - shutil.copytree(copy_paths[key]['from'], - copy_paths[key]['to']) + if os.path.isdir(copy_paths[key]["from"]): + if os.path.isdir(copy_paths[key]["to"]): + shutil.rmtree(copy_paths[key]["to"]) + shutil.copytree(copy_paths[key]["from"], copy_paths[key]["to"]) else: - if not os.path.isdir(copy_paths[key]['to']): - os.makedirs(copy_paths[key]['to']) - shutil.copy(copy_paths[key]['from'], copy_paths[key]['to']) + if not os.path.isdir(copy_paths[key]["to"]): 
+ os.makedirs(copy_paths[key]["to"]) + shutil.copy(copy_paths[key]["from"], copy_paths[key]["to"]) - if 's3' not in self.server_config: + if "s3" not in self.server_config: triton_deployment_dir = os.path.join(self.triton_model_repo, name) version_folder = os.path.join(triton_deployment_dir, "1") os.makedirs(version_folder, exist_ok=True) @@ -432,40 +450,41 @@ def _copy_files_to_triton_repo(self, artifact_path, name, flavor): return copy_paths def _delete_mlflow_meta(self, filepath): - if 's3' in self.server_config: - self.server_config['s3'].delete_object( - Bucket=self.server_config['s3_bucket'], + if "s3" in self.server_config: + self.server_config["s3"].delete_object( + Bucket=self.server_config["s3_bucket"], Key=filepath, ) elif os.path.isfile(filepath): os.remove(filepath) def _delete_deployment_files(self, name): - triton_deployment_dir = os.path.join(self.triton_model_repo, name) - if 's3' in self.server_config: - objs = self.server_config['s3'].list_objects( - Bucket=self.server_config['s3_bucket'], - Prefix=os.path.join(self.server_config['s3_prefix'], name), + if "s3" in self.server_config: + objs = self.server_config["s3"].list_objects( + Bucket=self.server_config["s3_bucket"], + Prefix=os.path.join(self.server_config["s3_prefix"], name), ) - for key in objs['Contents']: - key = key['Key'] + for key in objs["Contents"]: + key = key["Key"] try: - self.server_config['s3'].delete_object( - Bucket=self.server_config['s3_bucket'], + self.server_config["s3"].delete_object( + Bucket=self.server_config["s3_bucket"], Key=key, ) except Exception as e: - raise Exception(f'Could not delete {key}: {e}') + raise Exception(f"Could not delete {key}: {e}") else: # Check if the deployment directory exists if not os.path.isdir(triton_deployment_dir): raise Exception( - "A deployment does not exist for this model in directory {} for model name {}" - .format(triton_deployment_dir, name)) + "A deployment does not exist for this model in directory {} for model name {}".format( + triton_deployment_dir, name + ) + ) model_file = glob.glob("{}/model*".format(triton_deployment_dir)) for file in model_file: @@ -474,28 +493,30 @@ def _delete_deployment_files(self, name): print("Model directory removed: {}".format(file)) # Delete mlflow meta file - mlflow_meta_path = os.path.join(self.triton_model_repo, name, - _MLFLOW_META_FILENAME) + mlflow_meta_path = os.path.join( + self.triton_model_repo, name, _MLFLOW_META_FILENAME + ) self._delete_mlflow_meta(mlflow_meta_path) def _validate_config_args(self, config): - if not config['version']: + if not config["version"]: raise Exception("Please provide the version as a config argument") - if not config['version'].isdigit(): + if not config["version"].isdigit(): raise ValueError( "Please make sure version is a number. version = {}".format( - config['version'])) + config["version"] + ) + ) def _validate_flavor(self, flavor): if flavor not in self.supported_flavors: - raise Exception( - "{} model flavor not supported by Triton".format(flavor)) + raise Exception("{} model flavor not supported by Triton".format(flavor)) def _model_exists(self, name): deploys = self.list_deployments() exists = False for d in deploys: - if d['name'] == name: + if d["name"] == name: exists = True return exists @@ -508,7 +529,7 @@ def target_help(): help_msg = ( "\nmlflow-triton plugin integrates the Triton Inference Server to the mlflow deployment pipeline. 
\n\n " "Example command: \n\n" - " mlflow deployments create -t triton --name mymodel --flavor onnx -m models:/mymodel/Production -C \"version=1\" \n\n" + ' mlflow deployments create -t triton --name mymodel --flavor onnx -m models:/mymodel/Production -C "version=1" \n\n' "The environment variable TRITON_MODEL_REPO must be set to the location that the Triton" "Inference Server is storing its models\n\n" "export TRITON_MODEL_REPO = /path/to/triton/model/repo\n\n" diff --git a/deploy/mlflow-triton-plugin/scripts/publish_model_to_mlflow.py b/deploy/mlflow-triton-plugin/scripts/publish_model_to_mlflow.py old mode 100644 new mode 100755 index 5343e0da63..779d393020 --- a/deploy/mlflow-triton-plugin/scripts/publish_model_to_mlflow.py +++ b/deploy/mlflow-triton-plugin/scripts/publish_model_to_mlflow.py @@ -1,4 +1,6 @@ -# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -23,10 +25,10 @@ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import mlflow import os -import click +import click +import mlflow import triton_flavor @@ -35,18 +37,20 @@ "--model_name", help="Model name", ) -@click.option("--model_directory", - type=click.Path(exists=True, readable=True), - required=True, - help="Model filepath") +@click.option( + "--model_directory", + type=click.Path(exists=True, readable=True), + required=True, + help="Model filepath", +) @click.option( "--flavor", - type=click.Choice(['triton'], case_sensitive=True), + type=click.Choice(["triton"], case_sensitive=True), required=True, help="Model flavor", ) def publish_to_mlflow(model_name, model_directory, flavor): - mlflow_tracking_uri = os.environ['MLFLOW_TRACKING_URI'] + mlflow_tracking_uri = os.environ["MLFLOW_TRACKING_URI"] artifact_path = "triton" mlflow.set_tracking_uri(uri=mlflow_tracking_uri) diff --git a/deploy/mlflow-triton-plugin/scripts/triton_flavor.py b/deploy/mlflow-triton-plugin/scripts/triton_flavor.py old mode 100644 new mode 100755 index eaafdea7c7..7b0f61630d --- a/deploy/mlflow-triton-plugin/scripts/triton_flavor.py +++ b/deploy/mlflow-triton-plugin/scripts/triton_flavor.py @@ -1,4 +1,6 @@ -# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,7 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ The ``triton`` module provides APIs for logging and loading Triton-recognized -models in the MLflow Model format. This module exports MLflow Models with the following +models in the MLflow Model format. 
This module exports MLflow Models with the following flavors: Triton format @@ -36,12 +38,12 @@ import shutil import sys +from mlflow.exceptions import MlflowException from mlflow.models import Model from mlflow.models.model import MLMODEL_FILE_NAME -from mlflow.exceptions import MlflowException from mlflow.protos.databricks_pb2 import RESOURCE_ALREADY_EXISTS -from mlflow.utils.annotations import experimental from mlflow.tracking._model_registry import DEFAULT_AWAIT_MAX_SLEEP_SECONDS +from mlflow.utils.annotations import experimental FLAVOR_NAME = "triton" @@ -63,8 +65,10 @@ def save_model( path = os.path.abspath(path) if os.path.exists(path): - raise MlflowException(message="Path '{}' already exists".format(path), - error_code=RESOURCE_ALREADY_EXISTS) + raise MlflowException( + message="Path '{}' already exists".format(path), + error_code=RESOURCE_ALREADY_EXISTS, + ) os.makedirs(path) triton_model_path = os.path.normpath(triton_model_path) model_data_subpath = os.path.basename(triton_model_path) diff --git a/deploy/mlflow-triton-plugin/setup.py b/deploy/mlflow-triton-plugin/setup.py old mode 100644 new mode 100755 index 6e5c2baa53..65b8e0df1e --- a/deploy/mlflow-triton-plugin/setup.py +++ b/deploy/mlflow-triton-plugin/setup.py @@ -1,4 +1,6 @@ -# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -23,7 +25,7 @@ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from setuptools import setup, find_packages +from setuptools import find_packages, setup setup( name="mlflow-triton", diff --git a/docker/cpu_only/entrypoint.d/12-banner.sh b/docker/cpu_only/entrypoint.d/12-banner.sh old mode 100644 new mode 100755 diff --git a/docker/cpu_only/entrypoint.d/50-gpu-driver-check2.sh b/docker/cpu_only/entrypoint.d/50-gpu-driver-check2.sh old mode 100644 new mode 100755 diff --git a/docker/entrypoint.d/50-gpu-driver-check2.sh b/docker/entrypoint.d/50-gpu-driver-check2.sh old mode 100644 new mode 100755 diff --git a/docker/entrypoint.d/56-network-driver-version-check.sh b/docker/entrypoint.d/56-network-driver-version-check.sh old mode 100644 new mode 100755 index 8b13789179..a9bf588e2f --- a/docker/entrypoint.d/56-network-driver-version-check.sh +++ b/docker/entrypoint.d/56-network-driver-version-check.sh @@ -1 +1 @@ - +#!/bin/bash diff --git a/docker/entrypoint.d/70-shm-check.sh b/docker/entrypoint.d/70-shm-check.sh old mode 100644 new mode 100755 index 8b13789179..a9bf588e2f --- a/docker/entrypoint.d/70-shm-check.sh +++ b/docker/entrypoint.d/70-shm-check.sh @@ -1 +1 @@ - +#!/bin/bash diff --git a/docker/entrypoint.d/99-check-run-aip-mode.sh b/docker/entrypoint.d/99-check-run-aip-mode.sh old mode 100644 new mode 100755 diff --git a/docker/sagemaker/serve b/docker/sagemaker/serve index 8f98010c95..268f1f0f68 100755 --- a/docker/sagemaker/serve +++ b/docker/sagemaker/serve @@ -32,12 +32,12 @@ SAGEMAKER_SINGLE_MODEL_REPO=/opt/ml/model/ if [ -n "$SAGEMAKER_TRITON_OVERRIDE_PING_MODE" ]; then SAGEMAKER_TRITON_PING_MODE=${SAGEMAKER_TRITON_OVERRIDE_PING_MODE} else - SAGEMAKER_TRITON_PING_MODE="ready" + SAGEMAKER_TRITON_PING_MODE="ready" fi # Note: in Triton on SageMaker, each model url is registered as a separate repository # e.g., /opt/ml/models//model. Specifying MME model repo path as /opt/ml/models causes Triton -# to treat it as an additional empty repository and changes +# to treat it as an additional empty repository and changes # the state of all models to be UNAVAILABLE in the model repository # https://github.com/triton-inference-server/core/blob/main/src/model_repository_manager.cc#L914,L922 # On Triton, this path will be a dummy path as it's mandatory to specify a model repo when starting triton @@ -53,10 +53,10 @@ if [ -n "$SAGEMAKER_MULTI_MODEL" ]; then if [ -n "$SAGEMAKER_TRITON_OVERRIDE_PING_MODE" ]; then SAGEMAKER_TRITON_PING_MODE=${SAGEMAKER_TRITON_OVERRIDE_PING_MODE} else - SAGEMAKER_TRITON_PING_MODE="live" + SAGEMAKER_TRITON_PING_MODE="live" fi is_mme_mode=true - echo -e "Triton is running in SageMaker MME mode. Using Triton ping mode: \"${SAGEMAKER_TRITON_PING_MODE}\"" + echo -e "Triton is running in SageMaker MME mode. Using Triton ping mode: \"${SAGEMAKER_TRITON_PING_MODE}\"" fi fi diff --git a/docs/Makefile b/docs/Makefile index 9a2abe880c..98271dfb29 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -49,5 +49,5 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
-%: +%: @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md index ed4751c188..f6117c8168 100644 --- a/docs/README.md +++ b/docs/README.md @@ -29,11 +29,11 @@ # **Triton Inference Server Documentation** | [Installation](README.md#installation) | [Getting Started](README.md#getting-started) | [User Guide](README.md#user-guide) | [API Guide](protocol/README.md) | [Additional Resources](README.md#resources) | [Customization Guide](README.md#customization-guide) | -| ------------ | --------------- | --------------- | ------------ | --------------- | --------------- | +| ------------ | --------------- | --------------- | ------------ | --------------- | --------------- | -**New to Triton Inference Server?** Make use of +**New to Triton Inference Server?** Make use of [these tutorials](https://github.com/triton-inference-server/tutorials) - to begin your Triton journey! + to begin your Triton journey! ## **Installation** Before you can use the Triton Docker image you must install @@ -58,14 +58,14 @@ This guide covers the simplest possible workflow for deploying a model using a T - [Launch Triton](getting_started/quickstart.md#launch-triton) - [Send an Inference Request](getting_started/quickstart.md#send-an-inference-request) -Triton Inference Server has a considerable list versatile and powerful features. All new users are recommended to explore the [User Guide](README.md#user-guide) and the [additional resources](README.md#resources) sections for features most relevant to their use case. +Triton Inference Server has a considerable list versatile and powerful features. All new users are recommended to explore the [User Guide](README.md#user-guide) and the [additional resources](README.md#resources) sections for features most relevant to their use case. ## **User Guide** The User Guide describes how to configure Triton, organize and configure your models, use the C++ and Python clients, etc. This guide includes the following: * Creating a Model Repository [[Overview](README.md#model-repository) || [Details](user_guide/model_repository.md)] * Writing a Model Configuration [[Overview](README.md#model-configuration) || [Details](user_guide/model_configuration.md)] * Buillding a Model Pipeline [[Overview](README.md#model-pipeline)] -* Managing Model Availablity [[Overview](README.md#model-management) || [Details](user_guide/model_management.md)] +* Managing Model Availability [[Overview](README.md#model-management) || [Details](user_guide/model_management.md)] * Collecting Server Metrics [[Overview](README.md#metrics) || [Details](user_guide/metrics.md)] * Supporting Custom Ops/layers [[Overview](README.md#framework-custom-operations) || [Details](user_guide/custom_operations.md)] * Using the Client API [[Overview](README.md#client-libraries-and-examples) || [Details](https://github.com/triton-inference-server/client)] @@ -73,14 +73,14 @@ The User Guide describes how to configure Triton, organize and configure your mo * Deploying on edge (Jetson) [[Overview](README.md#jetson-and-jetpack)] -### Model Repository +### Model Repository [Model Repositories](user_guide/model_repository.md) are the organizational hub for using Triton. All models, configuration files, and additional resources needed to serve the models are housed inside a model repository. 
- [Cloud Storage](user_guide/model_repository.md#model-repository-locations) - [File Organization](user_guide/model_repository.md#model-files) - [Model Versioning](user_guide/model_repository.md#model-versions) ### Model Configuration -A [Model Configuration](user_guide/model_configuration.md) file is where you set the model-level options, such as output tensor reshaping and dynamic batch sizing. +A [Model Configuration](user_guide/model_configuration.md) file is where you set the model-level options, such as output tensor reshaping and dynamic batch sizing. #### Required Model Configuration @@ -112,7 +112,7 @@ The Model Configuration ModelOptimizationPolicy property is used to specify opti #### Scheduling and Batching -Triton supports batching individual inference requests to improve compute resource utilization. This is extremely important as individual requests typically will not saturate GPU resources thus not leveraging the parallelism provided by GPUs to its extent. Learn more about Triton's [Batcher and Scheduler](user_guide/model_configuration.md#scheduling-and-batching). +Triton supports batching individual inference requests to improve compute resource utilization. This is extremely important as individual requests typically will not saturate GPU resources thus not leveraging the parallelism provided by GPUs to its extent. Learn more about Triton's [Batcher and Scheduler](user_guide/model_configuration.md#scheduling-and-batching). - [Default Scheduler - Non-Batching](user_guide/model_configuration.md#default-scheduler) - [Dynamic Batcher](user_guide/model_configuration.md#dynamic-batcher) - [How to Configure Dynamic Batcher](user_guide/model_configuration.md#recommended-configuration-process) @@ -134,21 +134,21 @@ Triton supports batching individual inference requests to improve compute resour Rate limiter manages the rate at which requests are scheduled on model instances by Triton. The rate limiter operates across all models loaded in Triton to allow cross-model prioritization. [Learn more](user_guide/rate_limiter.md). #### Model Warmup -For a few of the Backends (check [Additional Resources](README.md#resources)) some or all of intialization is deffered till the first inference request is received, the benefit is resource conservation but comes with the downside of the initial requests getting processed slower than expected. Users can pre-"warm up" the model by instructing Triton to intialize the model. [Learn more](user_guide/model_configuration.md#model-warmup). +For a few of the Backends (check [Additional Resources](README.md#resources)) some or all of initialization is deferred until the first inference request is received, the benefit is resource conservation but comes with the downside of the initial requests getting processed slower than expected. Users can pre-"warm up" the model by instructing Triton to initialize the model. [Learn more](user_guide/model_configuration.md#model-warmup). #### Inference Request/Response Cache Triton has a feature which allows inference responses to get cached. [Learn More](user_guide/response_cache.md). ### Model Pipeline -Building ensembles is as easy as adding an addition configuration file which outlines the specific flow of tensors from one model to another. Any additional changes required by the model ensemble can be made in existing (individual) model configurations. +Building ensembles is as easy as adding an addition configuration file which outlines the specific flow of tensors from one model to another. 
Any additional changes required by the model ensemble can be made in existing (individual) model configurations. - [Model Ensemble](user_guide/architecture.md#ensemble-models) - [Business Logic Scripting (BLS)](https://github.com/triton-inference-server/python_backend#business-logic-scripting) ### Model Management -Users can specify policies in the model configuration for loading and unloading of models. This [section](user_guide/model_management.md) covers user selectable policy details. +Users can specify policies in the model configuration for loading and unloading of models. This [section](user_guide/model_management.md) covers user selectable policy details. - [Explicit Model Loading and Unloading](user_guide/model_management.md#model-control-mode-explicit) - [Modifying the Model Repository](user_guide/model_management.md#modifying-the-model-repository) ### Metrics -Triton provides Prometheus metrics like GPU Utilization, Memory Usage, Latency and more. Learn about [availble metrics](user_guide/metrics.md). +Triton provides Prometheus metrics like GPU Utilization, Memory Usage, Latency and more. Learn about [available metrics](user_guide/metrics.md). ### Framework Custom Operations Some frameworks provide the option of building custom layers/operations. These can be added to specific Triton Backends for the those frameworks. [Learn more](user_guide/custom_operations.md) - [TensorRT](user_guide/custom_operations.md#tensorrt) @@ -164,9 +164,9 @@ Use the [Triton Client](https://github.com/triton-inference-server/client) API t - [go](https://github.com/triton-inference-server/client/tree/main/src/grpc_generated/go) - [Java/Scala](https://github.com/triton-inference-server/client/tree/main/src/grpc_generated/java) - [Javascript](https://github.com/triton-inference-server/client/tree/main/src/grpc_generated/javascript) -- [Shared Memory Extention](protocol/extension_shared_memory.md) +- [Shared Memory Extension](protocol/extension_shared_memory.md) ### Performance Analysis -Understanding Inference perfomance is key to better resource utilization. Use Triton's Tools to costomize your deployment. +Understanding Inference performance is key to better resource utilization. Use Triton's Tools to costomize your deployment. - [Performance Tuning Guide](user_guide/performance_tuning.md) - [Optimization](user_guide/optimization.md) - [Model Analyzer](user_guide/model_analyzer.md) @@ -189,7 +189,7 @@ The following resources are recommended to explore the full suite of Triton Infe - [Model Navigator](https://github.com/triton-inference-server/model_navigator): The Triton Model Navigator is a tool that provides the ability to automate the process of moving model from source to optimal format and configuration for deployment on Triton Inference Server. The tool supports export model from source to all possible formats and applies the Triton Inference Server backend optimizations. -- **Backends**: Triton has suports a wide varity of frameworks used to run models. Users can extend this functionality by creating custom backends. +- **Backends**: Triton has supports a wide variety of frameworks used to run models. Users can extend this functionality by creating custom backends. 
- [PyTorch](https://github.com/triton-inference-server/pytorch_backend): Widely used Open Source DL Framework - [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend): Widely used Open Source DL Framework - [TensorRT](https://github.com/triton-inference-server/tensorrt_backend): NVIDIA [TensorRT](https://developer.nvidia.com/tensorrt) is an inference acceleration SDK that provide a with range of graph optimizations, kernel optimization, use of lower precision, and more. diff --git a/docs/_static/custom.css b/docs/_static/custom.css index 798df3d541..a8c37ced01 100644 --- a/docs/_static/custom.css +++ b/docs/_static/custom.css @@ -114,14 +114,14 @@ font-family: NVIDIA Sans, Helvetica, Arial, Sans-serif; font-size: 0.85em; } -/* colors +/* colors nv green 118,185,0 black 0, 0, 0 light gray 205, 205, 205 medium gray 140, 140, 140 dark gray 94, 94, 94 -emerald 0, 133, 100 +emerald 0, 133, 100 emerald #008564 amethyst 92, 22, 130 amethyst #5C1682 diff --git a/docs/conf.py b/docs/conf.py old mode 100644 new mode 100755 index 98000f2227..9378329752 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -46,9 +48,9 @@ # -- Project information ----------------------------------------------------- -project = 'NVIDIA Triton Inference Server' -copyright = '2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved' -author = 'NVIDIA' +project = "NVIDIA Triton Inference Server" +copyright = "2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved" +author = "NVIDIA" # The full version, including alpha/beta/rc tags # Env only set during riva-release process, otherwise keep as dev for all internal builds @@ -69,7 +71,7 @@ "sphinx_copybutton", "sphinx_design", "sphinx-prompt", - #"sphinxcontrib.bibtex", + # "sphinxcontrib.bibtex", "sphinx_tabs.tabs", "sphinx_sitemap", ] @@ -79,7 +81,9 @@ numfig = True # final location of docs for seo/sitemap -html_baseurl = 'https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/' +html_baseurl = ( + "https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/" +) myst_enable_extensions = [ "dollarmath", @@ -96,7 +100,7 @@ myst_heading_anchors = 5 # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -121,7 +125,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". 
-html_static_path = ['_static'] +html_static_path = ["_static"] html_css_files = ["custom.css"] html_theme_options = { @@ -150,11 +154,10 @@ deploy_ngc_org = "nvidia" deploy_ngc_team = "triton" myst_substitutions = { - "VersionNum": - version_short, - "deploy_ngc_org_team": - f"{deploy_ngc_org}/{deploy_ngc_team}" - if deploy_ngc_team else deploy_ngc_org, + "VersionNum": version_short, + "deploy_ngc_org_team": f"{deploy_ngc_org}/{deploy_ngc_team}" + if deploy_ngc_team + else deploy_ngc_org, } @@ -167,31 +170,31 @@ def ultimateReplace(app, docname, source): # this is a necessary hack to allow us to fill in variables that exist in code blocks ultimate_replacements = { - "{VersionNum}": - version_short, - "{SamplesVersionNum}": - version_short, - "{NgcOrgTeam}": - f"{deploy_ngc_org}/{deploy_ngc_team}" - if deploy_ngc_team else deploy_ngc_org, + "{VersionNum}": version_short, + "{SamplesVersionNum}": version_short, + "{NgcOrgTeam}": f"{deploy_ngc_org}/{deploy_ngc_team}" + if deploy_ngc_team + else deploy_ngc_org, } -#bibtex_bibfiles = ["references.bib"] +# bibtex_bibfiles = ["references.bib"] # To test that style looks good with common bibtex config -#bibtex_reference_style = "author_year" -#bibtex_default_style = "plain" +# bibtex_reference_style = "author_year" +# bibtex_default_style = "plain" -### We currrently use Myst: https://myst-nb.readthedocs.io/en/latest/use/execute.html +### We currently use Myst: https://myst-nb.readthedocs.io/en/latest/use/execute.html jupyter_execute_notebooks = "off" # Global execution disable # execution_excludepatterns = ['tutorials/tts-python-basics.ipynb'] # Individual notebook disable def setup(app): - app.add_config_value('ultimate_replacements', {}, True) - app.connect('source-read', ultimateReplace) + app.add_config_value("ultimate_replacements", {}, True) + app.connect("source-read", ultimateReplace) app.add_js_file("https://js.hcaptcha.com/1/api.js") - visitor_script = "//assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js" + visitor_script = ( + "//assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js" + ) if visitor_script: app.add_js_file(visitor_script) @@ -213,8 +216,9 @@ def setup(app): # Patch for sphinx.search stemming short terms (i.e. 
tts -> tt) # https://github.com/sphinx-doc/sphinx/blob/4.5.x/sphinx/search/__init__.py#L380 -def sphinxSearchIndexFeed(self, docname: str, filename: str, title: str, - doctree: nodes.document): +def sphinxSearchIndexFeed( + self, docname: str, filename: str, title: str, doctree: nodes.document +): """Feed a doctree to the index.""" self._titles[docname] = title self._filenames[docname] = filename @@ -242,11 +246,9 @@ def stem(word: str) -> str: for word in visitor.found_words: stemmed_word = stem(word) # again, stemmer must not remove words from search index - if len(stemmed_word) <= 3 or not _filter(stemmed_word) and _filter( - word): + if len(stemmed_word) <= 3 or not _filter(stemmed_word) and _filter(word): stemmed_word = word.lower() - already_indexed = docname in self._title_mapping.get( - stemmed_word, set()) + already_indexed = docname in self._title_mapping.get(stemmed_word, set()) if _filter(stemmed_word) and not already_indexed: self._mapping.setdefault(stemmed_word, set()).add(docname) diff --git a/docs/customization_guide/build.md b/docs/customization_guide/build.md index ddfc931c19..fca514ab64 100644 --- a/docs/customization_guide/build.md +++ b/docs/customization_guide/build.md @@ -47,9 +47,9 @@ The Triton source is distributed across multiple GitHub repositories that together can be built and installed to create a complete Triton installation. Triton server is built using CMake and (optionally) Docker. To simplify the build process, Triton provides a -[build.py](https://github.com/triton-inference-server/server/blob/main/build.py) script. -The build.py script will generate the CMake and Docker build steps required to -build Triton, and will optionally invoke those steps or leave the invocation to +[build.py](https://github.com/triton-inference-server/server/blob/main/build.py) script. +The build.py script will generate the CMake and Docker build steps required to +build Triton, and will optionally invoke those steps or leave the invocation to you, as described below. The build.py script currently supports building Triton for the @@ -197,9 +197,9 @@ To include the TensorFlow2 backend in your CPU-only build, you must provide this additional flag to build.py: `--extra-backend-cmake-arg=tensorflow2:TRITON_TENSORFLOW_INSTALL_EXTRA_DEPS=ON`. -CPU-only builds of the TensorFlow and PyTorch backends require some CUDA stubs -and runtime dependencies that are not present in the CPU-only base container. -These are retrieved from a GPU base container, which can be changed with the +CPU-only builds of the TensorFlow and PyTorch backends require some CUDA stubs +and runtime dependencies that are not present in the CPU-only base container. +These are retrieved from a GPU base container, which can be changed with the `--image=gpu-base,nvcr.io/nvidia/tritonserver:-py3-min` flag. ### Building Without Docker diff --git a/docs/customization_guide/inference_protocols.md b/docs/customization_guide/inference_protocols.md index 97a505d720..6110b739cd 100644 --- a/docs/customization_guide/inference_protocols.md +++ b/docs/customization_guide/inference_protocols.md @@ -149,7 +149,7 @@ protocol types mentioned above: request to the protocol is received. The completed header will be in the form of `triton-grpc-protocol-` -* `restricted-value` : The value of the header to be matched in order to preceed +* `restricted-value` : The value of the header to be matched in order to proceed in the process of the specified protocols. #### Example @@ -177,8 +177,8 @@ tritonserver.dll. 
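Not part of the patch: the restricted-protocol paragraphs above describe the `triton-grpc-protocol-` header prefix and the `restricted-value` that must accompany it, but the example section itself is not included in this hunk. The sketch below shows how a Python gRPC client could pass such a header, assuming a purely hypothetical restricted key `admin-key` with value `admin-value`; substitute whatever key and value the server was actually configured with.

```python
# Hypothetical sketch: calling a restricted gRPC protocol by supplying the
# matching header. The key/value pair here is made up for illustration.
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

# Header name is the triton-grpc-protocol- prefix plus the restricted key.
auth_headers = {"triton-grpc-protocol-admin-key": "admin-value"}

# Without the header, a restricted health/metadata call would be rejected.
print(client.is_server_live(headers=auth_headers))
```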
In the Triton Docker image the shared library is found in /opt/tritonserver/lib. The header file that defines and documents the Server API is [tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h). -[Java bindings for In-Process Triton Server API](#java-bindings-for-in-process-triton-server-api) -are built on top of `tritonserver.h` and can be used for Java applications that +[Java bindings for In-Process Triton Server API](#java-bindings-for-in-process-triton-server-api) +are built on top of `tritonserver.h` and can be used for Java applications that need to use Tritonserver in-process. All capabilities of Triton server are encapsulated in the shared @@ -206,7 +206,7 @@ When you link the Triton shared library into your application you are *not* spawning a separate Triton process, instead, you are including the Triton core logic directly in your application. The Triton HTTP/REST or GRPC protocols are not used to communicate with this -Triton core logic, instead all communication between your appliation +Triton core logic, instead all communication between your application and the Triton core logic must take place via the [Server API](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h). @@ -384,7 +384,7 @@ of Triton. The primary source files for the endpoints are The Triton Inference Server uses [Java CPP](https://github.com/bytedeco/javacpp) to create bindings around Tritonserver to create Java API. -The API is documented in +The API is documented in [tritonserver.java](https://github.com/bytedeco/javacpp-presets/blob/master/tritonserver/src/gen/java/org/bytedeco/tritonserver/global/tritonserver.java). Alternatively, the user can refer to the web version [API docs](http://bytedeco.org/javacpp-presets/tritonserver/apidocs/) generated from `tritonserver.java`. @@ -393,8 +393,8 @@ and the bindings for `C-API Wrapper`. More information about the [developer_tool A simple example using the Java API can be found in [Samples folder](https://github.com/bytedeco/javacpp-presets/tree/master/tritonserver/samples) -which includes `Simple.java` which is similar to -[`simple.cc`](https://github.com/triton-inference-server/server/blob/main/src/simple.cc). +which includes `Simple.java` which is similar to +[`simple.cc`](https://github.com/triton-inference-server/server/blob/main/src/simple.cc). Please refer to [sample usage documentation](https://github.com/bytedeco/javacpp-presets/tree/master/tritonserver#sample-usage) to learn about how to build and run `Simple.java`. @@ -438,7 +438,7 @@ After ensuring that Tritonserver and dependencies are installed, you can run you Java program with the Java bindings with the following steps: 1. Place Java bindings into your environment. You can do this by either: - + a. Building Java API bindings with provided build script: ```bash # Clone Triton client repo. Recommended client repo tag is: main @@ -451,7 +451,7 @@ Java program with the Java bindings with the following steps: $ source clientrepo/src/java-api-bindings/scripts/install_dependencies_and_build.sh --enable-developer-tools-server` ``` This will install the Java bindings to `/workspace/install/java-api-bindings/tritonserver-java-bindings.jar` - + *or* b. 
Copying "Uber Jar" from Triton SDK container to your environment @@ -459,7 +459,7 @@ Java program with the Java bindings with the following steps: $ id=$(docker run -dit nvcr.io/nvidia/tritonserver:-py3-sdk bash) $ docker cp ${id}:/workspace/install/java-api-bindings/tritonserver-java-bindings.jar /tritonserver-java-bindings.jar $ docker stop ${id} - ``` + ``` **Note:** `tritonserver-java-bindings.jar` only includes the `In-Process Java Bindings`. To use the `C-API Wrapper Java Bindings`, please use the build script. 2. Use the built "Uber Jar" that contains the Java bindings ```bash @@ -474,7 +474,7 @@ bindings Jar](#run-java-program-with-java-bindings-jar) to also build the jar yourself without any modifications to the Tritonserver bindings in JavaCPP-presets. You can do this using the following steps: -1. Create the JNI binaries in your local repository (`/root/.m2/repository`) +1. Create the JNI binaries in your local repository (`/root/.m2/repository`) with [`javacpp-presets/tritonserver`](https://github.com/bytedeco/javacpp-presets/tree/master/tritonserver) ```bash $ git clone https://github.com/bytedeco/javacpp-presets.git @@ -482,8 +482,8 @@ JavaCPP-presets. You can do this using the following steps: $ mvn clean install --projects .,tritonserver $ mvn clean install -f platform --projects ../tritonserver/platform -Djavacpp.platform=linux-x86_64 ``` -2. Create your custom `*.pom` file for Maven. Please refer to - [samples/simple/pom.xml](https://github.com/bytedeco/javacpp-presets/blob/master/tritonserver/samples/simple/pom.xml) as +2. Create your custom `*.pom` file for Maven. Please refer to + [samples/simple/pom.xml](https://github.com/bytedeco/javacpp-presets/blob/master/tritonserver/samples/simple/pom.xml) as reference for how to create your pom file. 3. After creating your `pom.xml` file you can build your application with: ```bash diff --git a/docs/examples/README.md b/docs/examples/README.md index 085e1ee803..3261bc6a9d 100644 --- a/docs/examples/README.md +++ b/docs/examples/README.md @@ -28,7 +28,7 @@ # Triton Examples -**New to Triton Inference Server?** Make use of [these tutorials](https://github.com/triton-inference-server/tutorials) to begin your Triton journey! +**New to Triton Inference Server?** Make use of [these tutorials](https://github.com/triton-inference-server/tutorials) to begin your Triton journey! This folder contains the following: * jetson: This covers deploying Triton Inference Server on Jetson devices. 
diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/Makefile b/docs/examples/jetson/concurrency_and_dynamic_batching/Makefile index 5b22a63e06..6dcf0d0dc4 100644 --- a/docs/examples/jetson/concurrency_and_dynamic_batching/Makefile +++ b/docs/examples/jetson/concurrency_and_dynamic_batching/Makefile @@ -32,14 +32,14 @@ GCC_PARMS+=-I${HOME}/tritonserver/include/tritonserver -D TRITON_ENABLE_GPU=ON - GCC_LIBS=-L${HOME}/tritonserver/lib -L/usr/lib -L/usr/local/cuda/targets/aarch64-linux/lib GCC_LIBS+=-lpthread -ltritonserver -lopencv_core -lopencv_highgui -lopencv_imgproc -lopencv_imgcodecs -lopencv_dnn -lcudart -all: $(TARGET) +all: $(TARGET) %.o: %.cc $(GCC) $(GCC_PARMS) -c -g -o $@ $^ $(TARGET): $(TARGET).o - $(GCC) $^ $(GCC_LIBS) -o $@ + $(GCC) $^ $(GCC_LIBS) -o $@ clean: rm -f $(TARGET).o $(TARGET) diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/README.md b/docs/examples/jetson/concurrency_and_dynamic_batching/README.md index ad3c473dfb..30cfe196f1 100644 --- a/docs/examples/jetson/concurrency_and_dynamic_batching/README.md +++ b/docs/examples/jetson/concurrency_and_dynamic_batching/README.md @@ -48,7 +48,7 @@ ngc registry model download-version "nvidia/tao/peoplenet:pruned_v2.1" For latter you need to setup the [NGC CLI](https://ngc.nvidia.com/setup). -Having downloaded the model from the NGC, unzip the archive `peoplenet_pruned_v2.1.zip` into `concurrency_and_dynamic_batching/tao/models/peoplenet`. +Having downloaded the model from the NGC, unzip the archive `peoplenet_pruned_v2.1.zip` into `concurrency_and_dynamic_batching/tao/models/peoplenet`. If you have the zip archive in the `concurrency_and_dynamic_batching` directory, the following will automatically place the model to the correct location: @@ -78,10 +78,10 @@ The `tao-converter` tool is available as a compiled release file for different p After you have downloaded `tao-converter`, you might need to execute ```shell -chmod 777 tao-converter -``` +chmod 777 tao-converter +``` -in the directory with the tool. +in the directory with the tool. We provide a conversion script `tao/convert_peoplenet.sh` which expects the model to be present at the location. @@ -139,13 +139,13 @@ To execute from the terminal, run from the `concurrency_and_dynamic_batching` di LD_LIBRARY_PATH=$HOME/tritonserver/lib ./people_detection -m system -v -r $(pwd)/trtis_model_repo_sample_1 -t 6 -s false -p $HOME/tritonserver ``` -The parameter `-t` controlls the number of concurrent inference calls we want to execute. We will be executing the same model on the same sample image with the purpose of demonstrating how setting different concurency options affects the performance. +The parameter `-t` controls the number of concurrent inference calls we want to execute. We will be executing the same model on the same sample image with the purpose of demonstrating how setting different concurrency options affects the performance. You can enable saving detected bounding boxes in the project directory in form of overlays over the original image for each execution thread. You can turn the visualization on by setting the parameter `-s` to `true` upon execution (`-s` is set to `false` by default). ### Expected output -Upon execution, in the terminal log you will see _Model 'peoplenet' Stats_ in json format reflecting the inference performance. We also output _TOTAL INFERENCE TIME_ which simply reflects the elapsed time requred to run the application including data loading, pre-processing and post-processing. 
+Upon execution, in the terminal log you will see _Model 'peoplenet' Stats_ in json format reflecting the inference performance. We also output _TOTAL INFERENCE TIME_ which simply reflects the elapsed time required to run the application including data loading, pre-processing and post-processing. A typical output in the log for _Model 'peoplenet' Stats_ looks as follows: @@ -210,7 +210,7 @@ TOTAL INFERENCE TIME: 174ms To learn about different statistics check out the [documentation](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_statistics.md#statistics-extension). -To see how setting different values for concurrency affects total execution time and its componets reflected in the model stats, you need to modify a single parameter in the model config file. +To see how setting different values for concurrency affects total execution time and its components reflected in the model stats, you need to modify a single parameter in the model config file. To enable concurrent model execution support for a model, corresponding model config file `trtis_model_repo_sample_1/peoplenet/config.pbtxt` includes the following: @@ -223,17 +223,17 @@ instance_group [ ] ``` -You can change the count of allowed inferences for the same model instance and observe how it affects performance in _Model 'peoplenet' Stats_ and _TOTAL INFERENCE TIME_. Note that on Jetson we dont recommend setting values too high: for instance, on a device like a Jetson Xavier AGX we don't recommend setting the number larger than 6. The values in the range 1-3 are optimal. +You can change the count of allowed inferences for the same model instance and observe how it affects performance in _Model 'peoplenet' Stats_ and _TOTAL INFERENCE TIME_. Note that on Jetson we dont recommend setting values too high: for instance, on a device like a Jetson Xavier AGX we don't recommend setting the number larger than 6. The values in the range 1-3 are optimal. While trying out different values, note how it affects total inference time as well as some inference statistics (like queue and compute times) ## Demonstration case 2: Dynamic batching -For models that support batching, Triton implements multiple scheduling and batching algorithms that combine individual inference requests together to improve inference throughput. In this example, we want to demonstrate how enbling automatic dynamic batching affects inference performance. +For models that support batching, Triton implements multiple scheduling and batching algorithms that combine individual inference requests together to improve inference throughput. In this example, we want to demonstrate how enbling automatic dynamic batching affects inference performance. ### Running the sample -To observe the effect of dynamic batching, from the `concurrency_and_dynamic_batching` directory execute: +To observe the effect of dynamic batching, from the `concurrency_and_dynamic_batching` directory execute: ```shell LD_LIBRARY_PATH=$HOME/tritonserver/lib ./people_detection -m system -v -r $(pwd)/trtis_model_repo_sample_2 -t 6 -s false -p $HOME/tritonserver @@ -326,6 +326,6 @@ dynamic_batching { } ``` -To try further options of dynamic batcher see the [documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#dynamic-batcher). +To try further options of dynamic batcher see the [documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#dynamic-batcher). 
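As a quick, hedged illustration of experimenting with these settings, you can rerun the sample above with a different number of concurrent calls (the `-t 2` value below is only an example; the paths match the earlier commands):

```bash
# Rerun the dynamic batching sample with fewer concurrent inference calls
# and compare the queue/compute times reported in the model stats.
LD_LIBRARY_PATH=$HOME/tritonserver/lib ./people_detection -m system -v \
    -r $(pwd)/trtis_model_repo_sample_2 -t 2 -s false -p $HOME/tritonserver
```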
You can also try enabling both concurrent model execution and dynamic batching. \ No newline at end of file diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/tao/convert_peoplenet.sh b/docs/examples/jetson/concurrency_and_dynamic_batching/tao/convert_peoplenet.sh old mode 100644 new mode 100755 diff --git a/docs/examples/model_repository/simple_identity/config.pbtxt b/docs/examples/model_repository/simple_identity/config.pbtxt old mode 100755 new mode 100644 diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 316d33c24c..fa1a8ec690 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -28,10 +28,10 @@ # Quickstart -**New to Triton Inference Server and want do just deploy your model quickly?** -Make use of +**New to Triton Inference Server and want do just deploy your model quickly?** +Make use of [these tutorials](https://github.com/triton-inference-server/tutorials#quick-deploy) - to begin your Triton journey! + to begin your Triton journey! The Triton Inference Server is available as [buildable source code](../customization_guide/build.md), but the easiest way to install and run Triton is to @@ -49,7 +49,7 @@ Launching and maintaining Triton Inference Server revolves around the use of bui The [model repository](../user_guide/model_repository.md) is the directory where you place the models that you want Triton to serve. An example model repository is included in the -[docs/examples/model_repository](../examples/model_repository). +[docs/examples/model_repository](../examples/model_repository). Before using the repository, you must fetch any missing model definition files from their public model zoos via the provided script. diff --git a/docs/index.md b/docs/index.md index e3fcb91338..7ae2b22173 100644 --- a/docs/index.md +++ b/docs/index.md @@ -71,7 +71,7 @@ Major features include: - [Concurrent model execution](user_guide/architecture.md#concurrent-model-execution) - [Dynamic batching](user_guide/model_configuration.md#dynamic-batcher) -- [Sequence batching](user_guide/model_configuration.md#sequence-batcher) and +- [Sequence batching](user_guide/model_configuration.md#sequence-batcher) and [implicit state management](user_guide/architecture.md#implicit-state-management) for stateful models - Provides [Backend API](https://github.com/triton-inference-server/backend) that @@ -90,8 +90,8 @@ Major features include: - [Metrics](user_guide/metrics.md) indicating GPU utilization, server throughput, server latency, and more -Join the [Triton and TensorRT community](https://www.nvidia.com/en-us/deep-learning-ai/triton-tensorrt-newsletter/) and stay current on the latest product updates, bug fixes, content, best -practices, and more. Need enterprise support? NVIDIA global support is available +Join the [Triton and TensorRT community](https://www.nvidia.com/en-us/deep-learning-ai/triton-tensorrt-newsletter/) and stay current on the latest product updates, bug fixes, content, best +practices, and more. Need enterprise support? NVIDIA global support is available for Triton Inference Server with the [NVIDIA AI Enterprise software suite](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/). -See the [Lastest Release Notes](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-23-05.html#rel-23-05) for updates on the newest features and bug fixes. 
+See the [Latest Release Notes](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-23-05.html#rel-23-05) for updates on the newest features and bug fixes. diff --git a/docs/protocol/extension_logging.md b/docs/protocol/extension_logging.md index 2b31863f0f..d5b770d5d4 100644 --- a/docs/protocol/extension_logging.md +++ b/docs/protocol/extension_logging.md @@ -29,7 +29,7 @@ # Logging Extension This document describes Triton's logging extension. The logging extension enables -the client to configure log settings during a Triton run. Triton reports "logging" +the client to configure log settings during a Triton run. Triton reports "logging" in the extensions field of its Server Metadata. ## HTTP/REST @@ -41,7 +41,7 @@ indicates an optional JSON field. Triton exposes the logging endpoint at the following URL. The client may use HTTP GET request to retrieve the current log settings. A HTTP POST request will modify the log settings, and the endpoint will return the updated log -settings on success or an error in the case of failure. +settings on success or an error in the case of failure. ``` GET v2/logging @@ -65,22 +65,22 @@ $log_setting = $string : $string | $boolean | $number ``` Each `$log_setting` JSON describes a “name”/”value” pair, where the “name” is -the `$string` representation of the log setting and the “value” is a `$string`, -`$bool`, or `$number` representation of the setting value. Currently, the +the `$string` representation of the log setting and the “value” is a `$string`, +`$bool`, or `$number` representation of the setting value. Currently, the following log settings are defined: - "log_file" : a `$string` parameter defining the file where the log outputs will be saved. If an empty string is specified, log outputs will stream to the console. -- "log_info" : a `$boolean` parameter that controls whether the Triton server logs INFO level messages. +- "log_info" : a `$boolean` parameter that controls whether the Triton server logs INFO level messages. -- "log_warning" : a `$boolean` parameter that controls whether the Triton server logs WARNING level messages. +- "log_warning" : a `$boolean` parameter that controls whether the Triton server logs WARNING level messages. -- "log_error" : a `$boolean` parameter that controls whether the Triton server logs ERROR level messages. +- "log_error" : a `$boolean` parameter that controls whether the Triton server logs ERROR level messages. - "log_verbose_level" : a `$number` parameter that controls whether the Triton server outputs verbose messages -of varying degrees. This value can be any integer >= 0. If "log_verbose_level" is 0, verbose logging will be disabled, and +of varying degrees. This value can be any integer >= 0. If "log_verbose_level" is 0, verbose logging will be disabled, and no verbose messages will be output by the Triton server. If "log_verbose_level" is 1, level 1 verbose messages will be output -by the Triton server. If "log_verbose_level" is 2, the Triton server will output all verbose messages of +by the Triton server. If "log_verbose_level" is 2, the Triton server will output all verbose messages of level <= 2, etc. Attempting to set "log_verbose_level" to a number < 0 will result in an error. - "log_format" : a `$string` parameter that controls the format of Triton server log messages. There are currently @@ -121,7 +121,7 @@ When a `$log_setting` JSON is received (defined above), only the specified settings will be updated. 
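Before modifying anything, you can also read back the current configuration with a plain GET request; a minimal sketch, assuming a Triton server at `localhost:8000` as in the example below:

```bash
# Retrieve the current log settings as a $log_setting_response JSON object.
curl -s -w '\n%{http_code}\n' -X GET localhost:8000/v2/logging
```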
### Example Usage -The logging protocol extension can be invoked using the curl library in the following manner (assuming +The logging protocol extension can be invoked using the curl library in the following manner (assuming a Triton server is running at `localhost:8000`): ``` curl -s -w '\n%{http_code}\n' -d '{"log_verbose_level":1}' -X POST localhost:8000/v2/logging @@ -131,7 +131,7 @@ This command should return a `$log_setting_response` JSON object with the follow {"log_file":"","log_info":true,"log_warnings":true,"log_errors":true,"log_verbose_level":1,"log_format":"default"} 200 ``` -Note that the current values for all parameter fields are returned even though `log_verbose_level` +Note that the current values for all parameter fields are returned even though `log_verbose_level` was the only parameter that was modified. ## GRPC diff --git a/docs/protocol/extension_model_configuration.md b/docs/protocol/extension_model_configuration.md index 07ecc63e94..a9baaa58d7 100644 --- a/docs/protocol/extension_model_configuration.md +++ b/docs/protocol/extension_model_configuration.md @@ -39,7 +39,7 @@ In all JSON schemas shown in this document `$number`, `$string`, `$boolean`, `$object` and `$array` refer to the fundamental JSON types. #optional indicates an optional JSON field. -Triton exposes the model configuation endpoint at the following +Triton exposes the model configuration endpoint at the following URL. The versions portion of the URL is optional; if not provided Triton will return model configuration for the highest-numbered version of the model. diff --git a/docs/protocol/extension_parameters.md b/docs/protocol/extension_parameters.md index f75f069862..4cdb60cf38 100644 --- a/docs/protocol/extension_parameters.md +++ b/docs/protocol/extension_parameters.md @@ -89,12 +89,12 @@ ModelInferRequest message can be used to send custom parameters. ## Forwarding HTTP/GRPC Headers as Parameters -Triton can forward HTTP/GRPC headers as inference request parameters. By +Triton can forward HTTP/GRPC headers as inference request parameters. By specifying a regular expression in `--http-header-forward-pattern` and `--grpc-header-forward-pattern`, Triton will add the headers that match with the regular expression as request parameters. All the forwarded headers will be added as a parameter with string -value. For example to forward all the headers that start with 'PREFIX_' from +value. For example to forward all the headers that start with 'PREFIX_' from both HTTP and GRPC, you should add `--http-header-forward-pattern PREFIX_.* --grpc-header-forward-pattern PREFIX_.*` to your `tritonserver` command. diff --git a/docs/protocol/extension_schedule_policy.md b/docs/protocol/extension_schedule_policy.md index 25c63e2d1b..c3c57a63c7 100644 --- a/docs/protocol/extension_schedule_policy.md +++ b/docs/protocol/extension_schedule_policy.md @@ -36,7 +36,7 @@ request. 
Because this extension is supported, Triton reports Note the policies are specific to [dynamic batcher](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#dynamic-batcher) and only experimental support to [sequence -batcher](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#sequence-batcher) +batcher](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#sequence-batcher) with the [direct](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#direct) scheduling strategy. diff --git a/docs/protocol/extension_sequence.md b/docs/protocol/extension_sequence.md index f7ebdf9c7d..51c99fc3cf 100644 --- a/docs/protocol/extension_sequence.md +++ b/docs/protocol/extension_sequence.md @@ -50,13 +50,13 @@ if the "sequence_id" parameter supports string types. - "sequence_start" : boolean value if set to true in a request indicates that the request is the first in a sequence. If not set, or set to false the request is not the first in a sequence. If set - the "sequence_id" parameter must be set to a non-zero or non-empty string + the "sequence_id" parameter must be set to a non-zero or non-empty string value. - "sequence_end" : boolean value if set to true in a request indicates that the request is the last in a sequence. If not set, or set to false the request is not the last in a sequence. If set the - "sequence_id" parameter must be set to a non-zero or non-empty string + "sequence_id" parameter must be set to a non-zero or non-empty string value. ## HTTP/REST diff --git a/docs/protocol/extension_statistics.md b/docs/protocol/extension_statistics.md index 6e82e971ba..46e1a92322 100644 --- a/docs/protocol/extension_statistics.md +++ b/docs/protocol/extension_statistics.md @@ -227,7 +227,7 @@ $duration_stat = - “ns” : The total duration for the statistic in nanoseconds. ``` -$memory_usage = +$memory_usage = { "type" : $string, "id" : $number, @@ -375,7 +375,7 @@ message InferStatistics StatisticDuration fail = 2; // The count and cumulative duration that inference requests wait in - // scheduling or other queues. The "queue" count and cumulative + // scheduling or other queues. The "queue" count and cumulative // duration includes cache hits. StatisticDuration queue = 3; @@ -405,7 +405,7 @@ message InferStatistics // and extract output tensor data from the Response Cache on a cache // hit. For example, this duration should include the time to copy // output tensor data from the Response Cache to the response object. - // On cache hits, triton does not need to go to the model/backend + // On cache hits, triton does not need to go to the model/backend // for the output tensor data, so the "compute_input", "compute_infer", // and "compute_output" fields are not updated. Assuming the response // cache is enabled for a given model, a cache hit occurs for a @@ -419,7 +419,7 @@ message InferStatistics // The count of response cache misses and cumulative duration to lookup // and insert output tensor data from the computed response to the cache // For example, this duration should include the time to copy - // output tensor data from the resposne object to the Response Cache. + // output tensor data from the response object to the Response Cache. 
// Assuming the response cache is enabled for a given model, a cache // miss occurs for a request to that model when the request metadata // does NOT hash to an existing entry in the cache. See the response @@ -452,7 +452,7 @@ message InferBatchStatistics } // Memory usage. -message MemoryUsage +message MemoryUsage { // The type of memory, the value can be "CPU", "CPU_PINNED", "GPU". string type = 1; diff --git a/docs/protocol/extension_trace.md b/docs/protocol/extension_trace.md index 35905b6bef..6472e1db24 100644 --- a/docs/protocol/extension_trace.md +++ b/docs/protocol/extension_trace.md @@ -78,7 +78,7 @@ see trace setting "log_frequency" below for detail. - "trace_level" : the trace level. "OFF" to disable tracing, "TIMESTAMPS" to trace timestamps, "TENSORS" to trace tensors. This value is an array of string where user may specify multiple levels to -trace multiple informations. +trace multiple information. - "trace_rate" : the trace sampling rate. The value represents how many requests will one trace be sampled from. For example, if the trace rate is "1000", 1 trace will be sampled for every 1000 requests. diff --git a/docs/user_guide/architecture.md b/docs/user_guide/architecture.md index 973cb98f9d..b343842014 100644 --- a/docs/user_guide/architecture.md +++ b/docs/user_guide/architecture.md @@ -312,7 +312,7 @@ description of the model contains variable-sized dimensions, Triton will use *1* for every variable-sized dimension for the starting request. For other non-starting requests in the sequence, the input state is the output state of the previous request in the sequence. For an example ONNX model that uses -implicit state you can refer to this onnx model generated from the +implicit state you can refer to this onnx model generated from the `create_onnx_modelfile_wo_initial_state()` [from this generation script](https://github.com/triton-inference-server/server/blob/main/qa/common/gen_qa_implicit_models.py). This is a simple accumulator model that stores the partial sum of the requests @@ -321,8 +321,8 @@ request is starting, the model sets the "OUTPUT\_STATE" to be equal to the "INPUT" tensor. For non-starting requests, it sets the "OUTPUT\_STATE" tensor to the sum of "INPUT" and "INPUT\_STATE" tensors. -In addition to the default state initilization discussed above, Triton provides -two other mechanisms for initilizing state. +In addition to the default state initialization discussed above, Triton provides +two other mechanisms for initializing state. ###### Initializing State from Zero. @@ -354,7 +354,7 @@ converted to fixed size dimensions. For initializing state from file, you need to create a directory named "initial\_state" under the model directory. The file that contains the initial -state under this directory needs to be provided in the *data_file* field. +state under this directory needs to be provided in the *data_file* field. The data stored in this file will be used in row-major order as the initial state. Below is an example state description initializing state from file. @@ -522,7 +522,7 @@ model. Over time the following happens: the sequence scheduler sees them both available in their respective batch slots. The scheduler immediately schedules the model instance to perform a batch-size 2 inference and uses START and READY to show - that both slots have an inference request avaiable but that only + that both slots have an inference request available but that only slot1 is the start of a new sequence. 
* The processing continues in a similar manner for the other inference @@ -799,7 +799,7 @@ scheduler will: #### Additional Resources You can find additional end-to-end ensemble examples in the links below: -* [This guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_5-Model_Ensembles) +* [This guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_5-Model_Ensembles) explores the concept of ensembles with a running example. * [Preprocessing in Python Backend Using Ensemble](https://github.com/triton-inference-server/python_backend#preprocessing) diff --git a/docs/user_guide/custom_operations.md b/docs/user_guide/custom_operations.md index 34e0288b8d..c3748647c4 100644 --- a/docs/user_guide/custom_operations.md +++ b/docs/user_guide/custom_operations.md @@ -72,7 +72,7 @@ container. TensorFlow allows users to [add custom operations](https://www.tensorflow.org/guide/create_op) which can then be used in TensorFlow models. You can load custom TensorFlow operations -into Triton in two ways: +into Triton in two ways: * At model load time, by listing them in the model configuration. * At server launch time, by using LD_PRELOAD. @@ -181,7 +181,7 @@ example](https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/test/s from the [microsoft/onnxruntime](https://github.com/microsoft/onnxruntime) repository and your ONNXRuntime custom operations are compiled into -libonnxcustom.so, adding the following to the model configuraion of +libonnxcustom.so, adding the following to the model configuration of your model makes those operations available to that specific ONNX model. diff --git a/docs/user_guide/decoupled_models.md b/docs/user_guide/decoupled_models.md index 4f5c70d3e2..fbe6f4c298 100644 --- a/docs/user_guide/decoupled_models.md +++ b/docs/user_guide/decoupled_models.md @@ -56,7 +56,7 @@ TRITONBACKEND_ModelInstanceExecute until that instance is ready to handle another set of requests. If not designed properly the backend can be easily over-subscribed. This can also cause under-utilization of features like [Dynamic Batching](model_configuration.md#dynamic-batcher) -as it leads to eager batching. +as it leads to eager batching. ### Python model using Python Backend @@ -91,20 +91,20 @@ for more details. The [decoupled_test.py](../../qa/L0_decoupled/decoupled_test.p how the gRPC streaming can be used to infer decoupled models. If using [Triton's in-process C API](../customization_guide/inference_protocols.md#in-process-triton-server-api), -your application should be cognizant that the callback function you registered with +your application should be cognizant that the callback function you registered with `TRITONSERVER_InferenceRequestSetResponseCallback` can be invoked any number of times, each time with a new response. You can take a look at [grpc_server.cc](https://github.com/triton-inference-server/server/blob/main/src/grpc/grpc_server.cc) ### Knowing When a Decoupled Inference Request is Complete An inference request is considered complete when a response containing the -`TRITONSERVER_RESPONSE_COMPLETE_FINAL` flag is received from a model/backend. +`TRITONSERVER_RESPONSE_COMPLETE_FINAL` flag is received from a model/backend. 1. Client applications using streaming GRPC can access this information by checking the response parameters for the `"triton_final_response"` parameter. Decoupled models may not send a response for each request depending on how the model/backend is designed. 
In these cases where no response is sent by - the backend, the streaming GRPC client can opt-in to receive an empty final + the backend, the streaming GRPC client can opt-in to receive an empty final response for each request. By default, empty final responses are not sent to save on network traffic. diff --git a/docs/user_guide/faq.md b/docs/user_guide/faq.md index 455692dbb3..518f2cc161 100644 --- a/docs/user_guide/faq.md +++ b/docs/user_guide/faq.md @@ -70,7 +70,7 @@ documentation and using [grpc_service.proto](https://github.com/triton-inference-server/common/blob/main/protobuf/grpc_service.proto) you can generate language bindings for all the languages supported by GRPC. We provide three examples of this for -[Go](https://github.com/triton-inference-server/client/blob/main/src/grpc_generated/go), +[Go](https://github.com/triton-inference-server/client/blob/main/src/grpc_generated/go), [Python](https://github.com/triton-inference-server/client/blob/main/src/python/examples/grpc_client.py) and [Java](https://github.com/triton-inference-server/client/blob/main/src/grpc_generated/java). @@ -154,7 +154,7 @@ available Triton instances. ## If the server segfaults, how can I debug it? -The NGC build is a Release build and does not contain Debug symbols. +The NGC build is a Release build and does not contain Debug symbols. The build.py as well defaults to a Release build. Refer to the instructions in [build.md](../customization_guide/build.md#building-with-debug-symbols) to create a Debug build of Triton. This will help find the cause of the segmentation fault when diff --git a/docs/user_guide/jetson.md b/docs/user_guide/jetson.md index b5e1dcf46d..79e97f5166 100644 --- a/docs/user_guide/jetson.md +++ b/docs/user_guide/jetson.md @@ -144,7 +144,7 @@ apt update && apt install -y gpg wget && \ echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | \ tee /etc/apt/sources.list.d/kitware.list >/dev/null && \ apt-get update && \ - apt-get install -y --no-install-recommends cmake cmake-data + apt-get install -y --no-install-recommends cmake cmake-data ``` ### Runtime Dependencies for Triton @@ -178,7 +178,7 @@ pip3 install --upgrade wheel setuptools && \ pip3 install --upgrade grpcio-tools numpy attrdict pillow ``` -The PyTorch runtime depenencies are the same as the build dependencies listed above. +The PyTorch runtime dependencies are the same as the build dependencies listed above. ### Usage diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md index 7f84e30706..6e4f01bcd2 100644 --- a/docs/user_guide/metrics.md +++ b/docs/user_guide/metrics.md @@ -45,13 +45,13 @@ all metric reporting, while the `--allow-gpu-metrics=false` and metrics respectively. The `--metrics-port` option can be used to select a different port. By default, -Triton reuses the `--http-address` option for the metrics endpoint and binds the +Triton reuses the `--http-address` option for the metrics endpoint and binds the http and metrics endpoints to the same specific address when http service is enabled. If http service is not enabled, the metric address will bind to `0.0.0.0` by default. To uniquely specify the metric endpoint, `--metrics-address` option can be used. See the `tritonserver --help` output for more info on these CLI options. -To change the interval at whichs metrics are polled/updated, see the `--metrics-interval-ms` flag. Metrics that are updated "Per Request" are unaffected by this interval setting. 
This interval only applies to metrics that are designated as "Per Interval" in the tables of each section below: +To change the interval at which's metrics are polled/updated, see the `--metrics-interval-ms` flag. Metrics that are updated "Per Request" are unaffected by this interval setting. This interval only applies to metrics that are designated as "Per Interval" in the tables of each section below: - [Inference Request Metrics](#inference-request-metrics) - [GPU Metrics](#gpu-metrics) @@ -105,7 +105,7 @@ that are published through the `--metrics-config` CLI options. #### Counters -By default, the following +By default, the following [Counter](https://prometheus.io/docs/concepts/metric_types/#counter) metrics are used for latencies: @@ -129,7 +129,7 @@ To disable these metrics specifically, you can set `--metrics-config counter_lat To get configurable quantiles over a sliding time window, Triton supports a set a [Summary](https://prometheus.io/docs/concepts/metric_types/#summary) metrics for latencies as well. These metrics are disabled by default, but can -be enabled by setting `--metrics-config summary_latencies=true`. +be enabled by setting `--metrics-config summary_latencies=true`. For more information on how the quantiles are calculated, see [this explanation](https://grafana.com/blog/2022/03/01/how-summary-metrics-work-in-prometheus/). @@ -146,7 +146,7 @@ The following summary metrics are available: Each summary above is actually composed of several sub-metrics. For each metric, there is a set of `quantile` metrics tracking the latency for each -quantile. Additionaly, there are `_count` and `_sum` metrics that aggregate +quantile. Additionally, there are `_count` and `_sum` metrics that aggregate the count and observed values for each. For example, see the following information exposed by the Inference Queue Summary metrics: ``` @@ -187,8 +187,8 @@ To better understand the setting of error values for computing each quantile, se ## GPU Metrics -GPU metrics are collected through the use of [DCGM](https://developer.nvidia.com/dcgm). -Collection of GPU metrics can be toggled with the `--allow-gpu-metrics` CLI flag. +GPU metrics are collected through the use of [DCGM](https://developer.nvidia.com/dcgm). +Collection of GPU metrics can be toggled with the `--allow-gpu-metrics` CLI flag. If building Triton locally, the `TRITON_ENABLE_METRICS_GPU` CMake build flag can be used to toggle building the relevant code entirely. |Category |Metric |Metric Name |Description |Granularity|Frequency | @@ -203,7 +203,7 @@ If building Triton locally, the `TRITON_ENABLE_METRICS_GPU` CMake build flag can ## CPU Metrics -Collection of CPU metrics can be toggled with the `--allow-cpu-metrics` CLI flag. +Collection of CPU metrics can be toggled with the `--allow-cpu-metrics` CLI flag. If building Triton locally, the `TRITON_ENABLE_METRICS_CPU` CMake build flag can be used to toggle building the relevant code entirely. > **Note** @@ -225,15 +225,15 @@ Cache metrics can be reported in two ways: by Triton directly, such as the cache hit/miss counts and durations described below. -2. As of 23.03, additional cache metrics may be reported depending on the -[cache implementation](response_cache.md#cache-implementations) +2. As of 23.03, additional cache metrics may be reported depending on the +[cache implementation](response_cache.md#cache-implementations) being used through Triton's [Metrics API](#custom-metrics). 
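Putting the flags from this section together, here is a hedged sketch of enabling summary latency metrics, disabling GPU metrics, and scraping the endpoint (the default metrics port of `8002` and the `nv_inference` metric-name prefix are assumptions here; adjust with `--metrics-address`/`--metrics-port` as needed):

```bash
# Start Triton with summary latencies enabled and GPU metrics disabled.
tritonserver --model-repository=/models \
    --metrics-config summary_latencies=true \
    --allow-gpu-metrics=false &

# Once the server is up, scrape the Prometheus text endpoint and keep
# the inference-related metrics.
curl -s localhost:8002/metrics | grep "^nv_inference"
```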
### Triton-reported Response Cache Metrics -Compute latency metrics in the -[Inference Request Metrics table](#inference-request-metrics) above are -calculated for the time spent in model inference backends. If the response +Compute latency metrics in the +[Inference Request Metrics table](#inference-request-metrics) above are +calculated for the time spent in model inference backends. If the response cache is enabled for a given model (see [Response Cache](response_cache.md) docs for more info), total inference times may be affected by response cache lookup times. @@ -243,7 +243,7 @@ response, and "Compute Input Time" / "Compute Time" / "Compute Output Time" are not recorded. On cache misses, "Cache Miss Time" indicates the time spent looking up -the request hash and inserting the computed output tensor data into the cache. +the request hash and inserting the computed output tensor data into the cache. Otherwise, "Compute Input Time" / "Compute Time" / "Compute Output Time" will be recorded as usual. @@ -271,7 +271,7 @@ custom metrics with the existing Triton metrics endpoint. The user takes the ownership of the custom metrics created through the APIs and must manage their lifetime following the API documentation. -The +The [identity_backend](https://github.com/triton-inference-server/identity_backend/blob/main/README.md#custom-metric-example) demonstrates a practical example of adding a custom metric to a backend. diff --git a/docs/user_guide/model_analyzer.md b/docs/user_guide/model_analyzer.md index bc6c67fc8b..663a8a277a 100644 --- a/docs/user_guide/model_analyzer.md +++ b/docs/user_guide/model_analyzer.md @@ -36,7 +36,7 @@ utilization. The Model Analyzer is specifically useful for characterizing the GPU memory requirements for your model under different batching and model instance configurations. Once you have this GPU memory usage information you can more intelligently decide on how to combine multiple models on the same GPU -while remaining within the memory capacity of the GPU. +while remaining within the memory capacity of the GPU. For more detailed examples and explanations of using Model Analyzer, see: - [Model Analyzer Conceptual Guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_3-optimizing_triton_configuration) diff --git a/docs/user_guide/model_configuration.md b/docs/user_guide/model_configuration.md index 8e4f53844e..9e8ba6e5a0 100644 --- a/docs/user_guide/model_configuration.md +++ b/docs/user_guide/model_configuration.md @@ -28,10 +28,10 @@ # Model Configuration -**Is this your first time writing a config file?** Check out +**Is this your first time writing a config file?** Check out [this guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_1-model_deployment#model-configuration) - or this -[example](https://github.com/triton-inference-server/tutorials/tree/main/HuggingFace#examples)! + or this +[example](https://github.com/triton-inference-server/tutorials/tree/main/HuggingFace#examples)! Each model in a [model repository](model_repository.md) must include a model configuration that provides required and optional information @@ -39,7 +39,7 @@ about the model. Typically, this configuration is provided in a config.pbtxt file specified as [ModelConfig protobuf](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto). 
In some cases, discussed in [Auto-Generated Model -Configuraton](#auto-generated-model-configuration), the model +Configuration](#auto-generated-model-configuration), the model configuration can be generated automatically by Triton and so does not need to be provided explicitly. @@ -135,7 +135,7 @@ expected by the model. #### Special Conventions for PyTorch Backend -**Naming Convention:** +**Naming Convention:** Due to the absence of sufficient metadata for inputs/outputs in TorchScript model files, the "name" attribute of inputs/outputs in the configuration must @@ -147,7 +147,7 @@ the forward function in the model's definition. For example, if the forward function for the Torchscript model was defined as `forward(self, input0, input1)`, the first and second inputs should be named -"input0" and "input1" respectively. +"input0" and "input1" respectively. 2. `__`: Where \ can be any string and \ is an integer index that refers to the position of the corresponding input/output. @@ -158,9 +158,9 @@ can be named "OUTPUT__0" and "OUTPUT__1" respectively. 3. If all inputs (or outputs) do not follow the same naming convention, then we enforce strict ordering from the model configuration i.e. we assume the order of -inputs (or outputs) in the configuartion is the true ordering of these inputs. +inputs (or outputs) in the configuration is the true ordering of these inputs. -***Dictionary of Tensors as Input:*** +***Dictionary of Tensors as Input:*** The PyTorch backend supports passing of inputs to the model in the form of a Dictionary of Tensors. This is only supported when there is a *single* input to @@ -290,7 +290,7 @@ function can be implemented in Python backend to provide and [`output`](#inputs-and-outputs) properties using `set_max_batch_size`, `add_input`, and `add_output` functions. These properties will allow Triton to load the Python model with [Minimal Model Configuration](#minimal-model-configuration) -in absence of a configuration file. +in absence of a configuration file. All other model types *must* provide a model configuration file. When developing a custom backend, you can populate required settings @@ -298,7 +298,7 @@ in the configuration and call `TRITONBACKEND_ModelSetConfig` API to update completed configuration with Triton core. You can take a look at [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend) and [Onnxruntime](https://github.com/triton-inference-server/onnxruntime_backend) -backends as examples of how to acheive this. Currently, only +backends as examples of how to achieve this. Currently, only [inputs, outputs](#inputs-and-outputs), [max_batch_size](#maximum-batch-size) and [dynamic batching](#dynamic-batcher) settings can be populated by backend. For custom backends, your config.pbtxt file must @@ -323,25 +323,25 @@ config.pbtxt file. ### Default Max Batch Size and Dynamic Batcher -When a model is using the auto-complete feature, a default maximum -batch size may be set by using the `--backend-config=default-max-batch-size=` +When a model is using the auto-complete feature, a default maximum +batch size may be set by using the `--backend-config=default-max-batch-size=` command line argument. This allows all models which are capable of batching and which make use of [Auto Generated Model Configuration](#auto-generated-model-configuration) -to have a default maximum batch size. This value is set to 4 by +to have a default maximum batch size. This value is set to 4 by default. 
Backend developers may make use of this default-max-batch-size by obtaining it from the TRITONBACKEND_BackendConfig api. Currently, the -following backends which utilize these default batch values and turn on +following backends which utilize these default batch values and turn on dynamic batching in their generated model configurations are: 1. [TensorFlow backend](https://github.com/triton-inference-server/tensorflow_backend) 2. [Onnxruntime backend](https://github.com/triton-inference-server/onnxruntime_backend) 3. [TensorRT backend](https://github.com/triton-inference-server/tensorrt_backend) 1. TensorRT models store the maximum batch size explicitly and do not make use - of the default-max-batch-size parameter. However, if max_batch_size > 1 + of the default-max-batch-size parameter. However, if max_batch_size > 1 and no [scheduler](model_configuration.md#scheduling-and-batching) is provided, the dynamic batch scheduler will be enabled. - -If a value greater than 1 for the maximum batch size is set for the + +If a value greater than 1 for the maximum batch size is set for the model, the [dynamic_batching](#dynamic-batcher) config will be set if no scheduler is provided in the configuration file. @@ -731,21 +731,21 @@ requirements and run on the same device as them. [Ensemble models](architecture.md#ensemble-models) are an abstraction Triton uses to execute a user-defined pipeline of models. -Since there is no physical instance associated with an ensemble model, the +Since there is no physical instance associated with an ensemble model, the `instance_group` field can not be specified for it. -However, each composing model that makes up an ensemble can specify +However, each composing model that makes up an ensemble can specify `instance_group` in its config file and individually support parallel execution as described above when the ensemble receives multiple requests. ## CUDA Compute Capability -Similar to the `default_model_filename` field, you can optionally specify the +Similar to the `default_model_filename` field, you can optionally specify the `cc_model_filenames` field to map the GPU's -[CUDA Compute Capability](https://developer.nvidia.com/cuda-gpus) -to a correspoding model filename at model load time. This is particularly -useful for TensorRT models, since they are generally tied to a specific -compute capability. +[CUDA Compute Capability](https://developer.nvidia.com/cuda-gpus) +to a corresponding model filename at model load time. This is particularly +useful for TensorRT models, since they are generally tied to a specific +compute capability. ``` cc_model_filenames [ @@ -798,7 +798,7 @@ configuration. These settings control the preferred size(s) of the dynamically created batches, the maximum time that requests can be delayed in the scheduler to allow other requests to join the dynamic batch, and queue properties such a queue size, priorities, and -time-outs. Refer to +time-outs. Refer to [this guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_2-improving_resource_utilization#what-is-dynamic-batching) for a more detailed example of dynamic batching. @@ -849,7 +849,7 @@ dynamic batcher should attempt to create. For most models, [Recommended Configuration Process](#recommended-configuration-process). An exception is TensorRT models that specify multiple optimization profiles for different batch -sizes. In this case, bacause some optimization profiles may give +sizes. 
In this case, because some optimization profiles may give significant performance improvement compared to others, it may make sense to use *preferred_batch_size* for the batch sizes supported by those higher-performance optimization profiles. @@ -942,10 +942,10 @@ timeout. #### Custom Batching You can set custom batching rules that work _in addition to_ the specified behavior of the dynamic batcher. -To do so, you would implement five functions in [tritonbackend.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonbackend.h) +To do so, you would implement five functions in [tritonbackend.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonbackend.h) and create a shared library. These functions are described below. -| Function | Description| +| Function | Description| | :-- | :-- | | TRITONBACKEND_ModelBatchIncludeRequest | Determines whether a request should be included in the current batch | | TRITONBACKEND_ModelBatchInitialize | Initializes a record-keeping data structure for a new batch | @@ -953,10 +953,10 @@ and create a shared library. These functions are described below. | TRITONBACKEND_ModelBatcherInitialize | Initializes a read-only data structure for use with all batches | | TRITONBACKEND_ModelBatcherFinalize | Deallocates the read-only data structure after the model is unloaded | -The path to the shared library can be passed into the model configuration via the parameter -`TRITON_BATCH_STRATEGY_PATH`. If not provided, the dynamic batcher will look for a custom -batching strategy named batchstrategy.so in the model version, model, and backend directories, -in that order. If found, it will load it. This lets you easily share a custom batching strategy +The path to the shared library can be passed into the model configuration via the parameter +`TRITON_BATCH_STRATEGY_PATH`. If not provided, the dynamic batcher will look for a custom +batching strategy named batchstrategy.so in the model version, model, and backend directories, +in that order. If found, it will load it. This lets you easily share a custom batching strategy among all models using the same backend. For a tutorial of how to create and use a custom batching library, please see the @@ -1036,7 +1036,7 @@ for examples on specifying different variants of warmup samples. ## Response Cache The model configuration `response_cache` section has an `enable` boolean used to -enable the Response Cache for this model. +enable the Response Cache for this model. ``` response_cache { @@ -1045,6 +1045,6 @@ response_cache { ``` In addition to enabling the cache in the model config, a `--cache-config` must -be specified when starting the server to enable caching on the server-side. See +be specified when starting the server to enable caching on the server-side. See the [Response Cache](response_cache.md) doc for more details on enabling server-side caching. diff --git a/docs/user_guide/model_management.md b/docs/user_guide/model_management.md index ae1c24da20..dc323a087c 100644 --- a/docs/user_guide/model_management.md +++ b/docs/user_guide/model_management.md @@ -55,8 +55,8 @@ Repository](#modifying-the-model-repository). ## Model Control Mode EXPLICIT At startup, Triton loads only those models specified explicitly with the -`--load-model` command-line option. To load ALL models at startup, specify -`--load-model=*` as the ONLY `--load-model` argument. Specifying +`--load-model` command-line option. 
To load ALL models at startup, specify +`--load-model=*` as the ONLY `--load-model` argument. Specifying `--load-model=*` in conjunction with another `--load-model` argument will result in error. If `--load-model` is not specified then no models are loaded at startup. Models that Triton is not able to load will be marked as @@ -226,7 +226,7 @@ configuration, so its presence in the model directory may be detected as a new f and cause the model to fully reload when only an update is expected. * If a sequence model is updated with in-flight sequence(s), Triton does not -guarentee any remaining request(s) from the in-flight sequence(s) will be routed +guarantee any remaining request(s) from the in-flight sequence(s) will be routed to the same model instance for processing. It is currently the responsibility of the user to ensure any in-flight sequence(s) is complete before updating a sequence model. @@ -239,7 +239,7 @@ performance requirements, the optimal amount of resources dedicated to loading models may differ. Triton exposes a `--model-load-thread-count` option to configure the number of threads dedicated to loading models, which defaults to 4. -To set this parameter with the C API, refer to -`TRITONSERVER_ServerOptionsSetModelLoadThreadCount` in +To set this parameter with the C API, refer to +`TRITONSERVER_ServerOptionsSetModelLoadThreadCount` in [tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h). diff --git a/docs/user_guide/model_repository.md b/docs/user_guide/model_repository.md index a16633b75e..a96a1fb768 100644 --- a/docs/user_guide/model_repository.md +++ b/docs/user_guide/model_repository.md @@ -28,9 +28,9 @@ # Model Repository -**Is this your first time setting up a model repository?** Check out +**Is this your first time setting up a model repository?** Check out [these tutorials](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_1-model_deployment#setting-up-the-model-repository) - to begin your Triton journey! + to begin your Triton journey! The Triton Inference Server serves models from one or more model repositories that are specified when the server is started. While @@ -80,7 +80,7 @@ corresponding model. The config.pbtxt file describes the [model configuration](model_configuration.md) for the model. For some models, config.pbtxt is required while for others it is optional. See [Auto-Generated Model -Configuration](model_configuration.md#auto-generated-model-configuration) +Configuration](model_configuration.md#auto-generated-model-configuration) for more information. Each directory must have at least one numeric @@ -126,7 +126,7 @@ environment variable should be set and contains the location of a credential JSON file. If no credential is provided, Triton will use credentials from the [attached service account](https://cloud.google.com/docs/authentication/application-default-credentials#attached-sa) providing a value for the -[Authorization HTTP header](https://googleapis.dev/cpp/google-cloud-storage/1.42.0/classgoogle_1_1cloud_1_1storage_1_1oauth2_1_1ComputeEngineCredentials.html#a8c3a5d405366523e2f4df06554f0a676) +[Authorization HTTP header](https://googleapis.dev/cpp/google-cloud-storage/1.42.0/classgoogle_1_1cloud_1_1storage_1_1oauth2_1_1ComputeEngineCredentials.html#a8c3a5d405366523e2f4df06554f0a676) can be obtained. If not obtainable, anonymous credential will be used. 
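For example, a minimal sketch of the credential flow described above (the bucket name and key path are placeholders, and the `gs://` repository path is assumed to follow the same pattern as the other cloud-storage paths in this document):

```bash
# Point Triton at a GCS-hosted model repository using an explicit
# service-account key file.
export GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
tritonserver --model-repository=gs://my-bucket/path/to/model/repository
```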
To access buckets with anonymous credential (also known as public bucket), the @@ -159,9 +159,9 @@ subsequently the bucket path. $ tritonserver --model-repository=s3://host:port/bucket/path/to/model/repository ... ``` -By default, Triton uses HTTP to communicate with your instance of S3. If +By default, Triton uses HTTP to communicate with your instance of S3. If your instance of S3 supports HTTPS and you wish for Triton to use the HTTPS -protocol to communicate with it, you can specify the same in the model +protocol to communicate with it, you can specify the same in the model repository path by prefixing the host name with https://. ```bash @@ -201,8 +201,8 @@ $ export AZURE_STORAGE_KEY=$(az storage account keys list -n $AZURE_STORAGE_ACCO *This feature is currently in beta and may be subject to change.* -To group the credentials into a single file for Triton, you may set the -`TRITON_CLOUD_CREDENTIAL_PATH` environment variable to a path pointing to a +To group the credentials into a single file for Triton, you may set the +`TRITON_CLOUD_CREDENTIAL_PATH` environment variable to a path pointing to a JSON file of the following format, residing in the local file system. ``` @@ -254,7 +254,7 @@ This feature is intended for use-cases which multiple credentials are needed for each cloud storage provider. Be sure to replace any credential paths/keys with the actual paths/keys from the example above. -If the `TRITON_CLOUD_CREDENTIAL_PATH` environment variable is not set, the +If the `TRITON_CLOUD_CREDENTIAL_PATH` environment variable is not set, the [Cloud Storage with Environment variables](#cloud-storage-with-environment-variables) will be used. diff --git a/docs/user_guide/optimization.md b/docs/user_guide/optimization.md index 7d2d9c61aa..f842198a90 100644 --- a/docs/user_guide/optimization.md +++ b/docs/user_guide/optimization.md @@ -81,11 +81,11 @@ latency. For most models, the Triton feature that provides the largest performance improvement is [dynamic -batching](model_configuration.md#dynamic-batcher). +batching](model_configuration.md#dynamic-batcher). [This example](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_2-improving_resource_utilization#dynamic-batching--concurrent-model-execution) sheds more light on conceptual details. If your model does not support batching then you can skip ahead to [Model -Instances](#model-instances). +Instances](#model-instances). ### Dynamic Batcher @@ -131,8 +131,8 @@ typically applies when perf_analyzer is running on the same system as Triton. The first rule is that for minimum latency set the request concurrency to 1 and disable the dynamic batcher and use only 1 [model instance](#model-instances). The second rule is that for maximum -throughput set the request concurrency to be -`2 * * `. We will discuss model +throughput set the request concurrency to be +`2 * * `. We will discuss model instances [below](#model-instances), for now we are working with one model instance. So for maximum-batch-size 4 we want to run perf_analyzer with request concurrency of `2 * 4 * 1 = 8`. @@ -219,7 +219,7 @@ settings that best satisfy your throughput and latency requirements. Triton has several optimization settings that apply to only a subset of the supported model frameworks. These optimization settings are controlled by the model configuration [optimization -policy](model_configuration.md#optimization-policy). Visit +policy](model_configuration.md#optimization-policy). 
Visit [this guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_4-inference_acceleration) for an end to end discussion. diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md index 9764efcc23..877e4ecfa0 100644 --- a/docs/user_guide/performance_tuning.md +++ b/docs/user_guide/performance_tuning.md @@ -37,7 +37,7 @@ for most use cases. For those who wish to jump right in, skip to the [end-to-end example](#end-to-end-example). -For additional material, see the +For additional material, see the [Triton Conceptual Guide tutorial](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_4-inference_acceleration). ## Overview @@ -187,7 +187,7 @@ other frameworks. mkdir -p ./models/densenet_onnx/1 # Download model and place it in model repository -wget -O models/densenet_onnx/1/model.onnx +wget -O models/densenet_onnx/1/model.onnx https://contentmamluswest001.blob.core.windows.net/content/14b2744cf8d6418c87ffddc3f3127242/9502630827244d60a1214f250e3bbca7/08aed7327d694b8dbaee2c97b8d0fcba/densenet121-1.2.onnx ``` @@ -318,7 +318,7 @@ SERVER_PID=`ps | grep tritonserver | awk '{ printf $1 }'` kill ${SERVER_PID} # Install model analyzer -pip install --upgrade pip +pip install --upgrade pip pip install triton-model-analyzer wkhtmltopdf # Profile the model using local (default) mode @@ -369,7 +369,7 @@ your models for your use case. 6. Extract optimal config from Model Analyzer results -In our example above, `densenet_onnx_config_3` was the optimal configuration. +In our example above, `densenet_onnx_config_3` was the optimal configuration. So let's extract that `config.pbtxt` and put it back in our model repository for future use. ```bash diff --git a/docs/user_guide/rate_limiter.md b/docs/user_guide/rate_limiter.md index 2e38327042..69b94fd8b8 100644 --- a/docs/user_guide/rate_limiter.md +++ b/docs/user_guide/rate_limiter.md @@ -42,9 +42,9 @@ frameworks dynamically allocate memory. Running all such models simultaneously may lead to system going out-of-memory. Rate limiter allows to postpone the inference execution on some -model instances such that not all of them runs simultaneously. +model instances such that not all of them runs simultaneously. The model priorities are used to decide which model instance -to schedule next. +to schedule next. ## Using Rate Limiter diff --git a/docs/user_guide/response_cache.md b/docs/user_guide/response_cache.md index b526a3c84e..fbc1233f3b 100644 --- a/docs/user_guide/response_cache.md +++ b/docs/user_guide/response_cache.md @@ -47,18 +47,18 @@ used for the request. When this happens there is no need for Triton to execute the model to produce the inference result. If the hash is not found in the cache, Triton executes the model to produce the inference result, and then records that result in the cache so that subsequent inference requests can -(re)use those results. +(re)use those results. ## Usage In order for caching to be used on a given model, it must be enabled -on both the server-side, and in the model's +on both the server-side, and in the model's [model config](model_configuration.md#response-cache). See the following sections below for more details. ### Enable Caching on Server-side -The response cache is enabled on the server-side by specifying a +The response cache is enabled on the server-side by specifying a `` and corresponding configuration when starting the Triton server. 
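For illustration, here is a minimal client-side sketch of exercising the response cache once it is enabled. It assumes a server started with `--cache-config local,size=1048576`, an `identity_fp32` model whose config.pbtxt contains `response_cache { enable: true }`, and `INPUT0`/`OUTPUT0` tensor names; the model name, cache size, input shape, and tensor names are illustrative assumptions.

```python
import numpy as np
import tritonclient.http as httpclient

# Connect to a locally running tritonserver (HTTP endpoint).
client = httpclient.InferenceServerClient("localhost:8000")

# Use a fixed input so both requests hash to the same cache key.
input_data = np.ones([1, 16], dtype=np.float32)
inputs = [httpclient.InferInput("INPUT0", input_data.shape, "FP32")]
inputs[0].set_data_from_numpy(input_data)

# The first call executes the model and records the result in the cache;
# the second, identical call can then be served from the cache without
# re-executing the model.
for _ in range(2):
    result = client.infer("identity_fp32", inputs)
    print(result.as_numpy("OUTPUT0"))
```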
@@ -75,10 +75,10 @@ This allows users to enable/disable caching globally on server startup. ### Enable Caching for a Model -**By default, no model uses response caching even if the response cache -is enabled globally with the `--cache-config` flag.** +**By default, no model uses response caching even if the response cache +is enabled globally with the `--cache-config` flag.** -For a given model to use response caching, the model must also have +For a given model to use response caching, the model must also have response caching enabled in its model configuration: ``` # config.pbtxt @@ -90,7 +90,7 @@ response_cache { This allows users to enable/disable caching for specific models. -For more information on enabling the response cache for each model, see the +For more information on enabling the response cache for each model, see the [model configuration docs](model_configuration.md#response-cache). ### Cache Implementations @@ -100,7 +100,7 @@ Starting in the 23.03 release, Triton has a set of that are used to communicate with a cache implementation of the user's choice. A cache implementation is a shared library that implements the required -TRITONCACHE APIs and is dynamically loaded on server startup, if enabled. +TRITONCACHE APIs and is dynamically loaded on server startup, if enabled. Triton's most recent [tritonserver release containers](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver) @@ -108,7 +108,7 @@ come with the following cache implementations out of the box: - [local](https://github.com/triton-inference-server/local_cache): `/opt/tritonserver/caches/local/libtritoncache_local.so` - [redis](https://github.com/triton-inference-server/redis_cache): `/opt/tritonserver/caches/redis/libtritoncache_redis.so` -With these TRITONCACHE APIs, `tritonserver` exposes a new `--cache-config` +With these TRITONCACHE APIs, `tritonserver` exposes a new `--cache-config` CLI flag that gives the user flexible customization of which cache implementation to use, and how to configure it. Similar to the `--backend-config` flag, the expected format is `--cache-config ,=` and may @@ -122,31 +122,31 @@ internally before the 23.03 release. For more implementation specific details, see the [local cache implementation](https://github.com/triton-inference-server/local_cache). -When `--cache-config local,size=SIZE` is specified with a non-zero `SIZE`, +When `--cache-config local,size=SIZE` is specified with a non-zero `SIZE`, Triton allocates the requested size in CPU memory and **shares the -cache across all inference requests and across all models**. +cache across all inference requests and across all models**. #### Redis Cache The `redis` cache implementation exposes the ability for Triton to communicate with a Redis server for caching. The `redis_cache` implementation is essentially -a Redis client that acts as an intermediary between Triton and Redis. +a Redis client that acts as an intermediary between Triton and Redis. To list a few benefits of the `redis` cache compared to the `local` cache in the context of Triton: -- The Redis server can be hosted remotely as long as it is accesible by Triton, - so it is not tied directly to the Triton process lifetime. +- The Redis server can be hosted remotely as long as it is accessible by Triton, + so it is not tied directly to the Triton process lifetime. - This means Triton can be restarted and still have access to previously cached entries. 
- This also means that Triton doesn't have to compete with the cache for memory/resource usage. - Multiple Triton instances can share a cache by configuring each Triton instance to communicate with the same Redis server. - The Redis server can be updated/restarted independently of Triton, and - Triton will fallback to operating as it would with no cache access during + Triton will fallback to operating as it would with no cache access during any Redis server downtime, and log appropriate errors. -In general, the Redis server can be configured/deployed as needed for your use -case, and Triton's `redis` cache will simply act as a client of your Redis -deployment. The [Redis docs](https://redis.io/docs/) should be consulted for +In general, the Redis server can be configured/deployed as needed for your use +case, and Triton's `redis` cache will simply act as a client of your Redis +deployment. The [Redis docs](https://redis.io/docs/) should be consulted for questions and details about configuring the Redis server. For Triton-specific `redis` cache implementation details/configuration, see the @@ -157,7 +157,7 @@ For Triton-specific `redis` cache implementation details/configuration, see the With the TRITONCACHE API interface, it is now possible for users to implement their own cache to suit any use-case specific needs. To see the required interface that must be implemented by a cache -developer, see the +developer, see the [TRITONCACHE API header](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritoncache.h). The `local` or `redis` cache implementations may be used as reference. @@ -165,22 +165,22 @@ Upon successfully developing and building a custom cache, the resulting shared library (ex: `libtritoncache_.so`) must be placed in the cache directory similar to where the `local` and `redis` cache implementations live. By default, this directory is `/opt/tritonserver/caches`, but a custom directory may be -specified with `--cache-dir` as needed. +specified with `--cache-dir` as needed. To put this example together, if the custom cache were named "custom" -(this name is arbitrary), by default Triton would expect to find the +(this name is arbitrary), by default Triton would expect to find the cache implementation at `/opt/tritonserver/caches/custom/libtritoncache_custom.so`. ## Deprecation Notes > **Note** > Prior to 23.03, enabling the `local` cache used to be done through setting a non-zero size -> (in bytes) when Triton was launched using the `--response-cache-byte-size` flag. +> (in bytes) when Triton was launched using the `--response-cache-byte-size` flag. > -> Starting in 23.03, the `--response-cache-byte-size` flag is now deprecated and -> `--cache-config` should be used instead. For backwards compatibility, -> `--response-cache-byte-size` will continue to function under the hood by being -> converted to the corresponding `--cache-config` argument, but it will default +> Starting in 23.03, the `--response-cache-byte-size` flag is now deprecated and +> `--cache-config` should be used instead. For backwards compatibility, +> `--response-cache-byte-size` will continue to function under the hood by being +> converted to the corresponding `--cache-config` argument, but it will default > to using the `local` cache implementation. It is not possible to choose other > cache implementations using the `--response-cache-byte-size` flag. 
> @@ -190,10 +190,10 @@ cache implementation at `/opt/tritonserver/caches/custom/libtritoncache_custom.s > **Warning** > -> The `local` cache implementation may fail to initialize for very small values -> of `--cache-config local,size=` or `--response-cache-byte-size` -> (ex: less than 1024 bytes) due to internal memory management requirements. -> If you encounter an initialization error for a relatively small cache size, +> The `local` cache implementation may fail to initialize for very small values +> of `--cache-config local,size=` or `--response-cache-byte-size` +> (ex: less than 1024 bytes) due to internal memory management requirements. +> If you encounter an initialization error for a relatively small cache size, > try increasing it. > > Similarly, the size is upper bounded by the available RAM on the system. @@ -202,14 +202,14 @@ cache implementation at `/opt/tritonserver/caches/custom/libtritoncache_custom.s ## Performance -The response cache is intended to be used for use cases where a significant -number of duplicate requests (cache hits) are expected and therefore would +The response cache is intended to be used for use cases where a significant +number of duplicate requests (cache hits) are expected and therefore would benefit from caching. The term "significant" here is subjective to the use case, but a simple interpretation would be to consider the proportion of expected cache hits/misses, as well as the average time spend computing -a response. +a response. -For cases where cache hits are common and computation is expensive, +For cases where cache hits are common and computation is expensive, the cache can significantly improve overall performance. For cases where most requests are unique (cache misses) or the compute is diff --git a/qa/L0_async_work_queue/test.sh b/qa/L0_async_work_queue/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_backend_config/test.sh b/qa/L0_backend_config/test.sh old mode 100644 new mode 100755 index 2bca7fa529..b898735798 --- a/qa/L0_backend_config/test.sh +++ b/qa/L0_backend_config/test.sh @@ -66,7 +66,7 @@ POSITIVE_TEST_ARGS=("--backend-config=tensorflow,default-max-batch-size=5 $COMMO "--backend-config=default-max-batch-size=7 --backend-config=tensorflow,default-max-batch-size=8 $COMMON_ARGS" \ ) -# These integers correspond to the expected default-max-batch-size which gets set +# These integers correspond to the expected default-max-batch-size which gets set # in the POSITIVE_TEST_ARGS POSITIVE_TEST_ANSWERS=(5 6 8) @@ -86,12 +86,12 @@ else RESULT_LOG_LINE=$(grep -a "Adding default backend config setting:" $SERVER_LOG) if [ "$RESULT_LOG_LINE" != "" ]; then - + # Pick out the logged value of the default-max-batch-size which gets passed into model creation RESOLVED_DEFAULT_MAX_BATCH_SIZE=$(awk -v line="$RESULT_LOG_LINE" 'BEGIN {split(line, a, "]"); split(a[2], b, ": "); split(b[2], c, ","); print c[2]}') if [ "$RESOLVED_DEFAULT_MAX_BATCH_SIZE" != "4" ]; then - echo "*** FAILED: Found default-max-batch-size not equal to the expected default-max-batch-size. Expected: default-max-batch-size,4, Found: $RESOLVED_DEFAULT_MAX_BATCH_SIZE \n" + echo "*** FAILED: Found default-max-batch-size not equal to the expected default-max-batch-size. 
Expected: default-max-batch-size,4, Found: $RESOLVED_DEFAULT_MAX_BATCH_SIZE \n" RET=1 fi else @@ -104,7 +104,7 @@ for ((i=0; i < ${#POSITIVE_TEST_ARGS[@]}; i++)); do SERVER_ARGS=${POSITIVE_TEST_ARGS[$i]} SERVER_LOG=$SERVER_LOG_BASE.backend_config_positive_$i.log run_server - + if [ "$SERVER_PID" == "0" ]; then echo -e "*** FAILED: Server failed to start $SERVER\n" RET=1 @@ -115,12 +115,12 @@ for ((i=0; i < ${#POSITIVE_TEST_ARGS[@]}; i++)); do RESULT_LOG_LINE=$(grep -a "Found overwritten default setting:" $SERVER_LOG) if [ "$RESULT_LOG_LINE" != "" ]; then - + # Pick out the logged value of the default-max-batch-size which gets passed into model creation RESOLVED_DEFAULT_MAX_BATCH_SIZE=$(awk -v line="$RESULT_LOG_LINE" 'BEGIN {split(line, a, "]"); split(a[2], b, ": "); split(b[2], c, ","); print c[2]}') if [ "$RESOLVED_DEFAULT_MAX_BATCH_SIZE" != "${POSITIVE_TEST_ANSWERS[$i]}" ]; then - echo "*** FAILED: Found default-max-batch-size not equal to the expected default-max-batch-size. Expected: ${POSITIVE_TEST_ANSWERS[$i]}, Found: $RESOLVED_DEFAULT_MAX_BATCH_SIZE \n" + echo "*** FAILED: Found default-max-batch-size not equal to the expected default-max-batch-size. Expected: ${POSITIVE_TEST_ANSWERS[$i]}, Found: $RESOLVED_DEFAULT_MAX_BATCH_SIZE \n" RET=1 fi else @@ -152,11 +152,11 @@ done # -# Sepcific backend tests -# +# Specific backend tests +# -# While inference server is running, save the -# config of the 'no_config' model to the TRIAL +# While inference server is running, save the +# config of the 'no_config' model to the TRIAL # file. function save_model_config() { CODE=`curl -s -w %{http_code} -o ./$TRIAL.out localhost:8000/v2/models/no_config/config` @@ -192,13 +192,13 @@ else RET=1 fi - # Assert we are also turning on the dynamic_batcher + # Assert we are also turning on the dynamic_batcher DYNAMIC_BATCHING_LOG_LINE=$(grep -a "Starting dynamic-batcher thread" $SERVER_LOG) if [ "$DYNAMIC_BATCHING_LOG_LINE" == "" ]; then echo "*** FAILED: Expected dynamic batching to be set in model config but was not found\n" RET=1 fi - + kill $SERVER_PID wait $SERVER_PID @@ -225,7 +225,7 @@ else RET=1 fi - # Assert batching disabled + # Assert batching disabled if [ "$(grep -a -E '\"dynamic_batching\": \{}' $SERVER_LOG)" != "" ]; then echo "*** FAILED: Found dynamic batching enabled in configuration when none expected.\n" RET=1 @@ -252,7 +252,7 @@ if [ "$SERVER_PID" == "0" ]; then else save_model_config - + # Assert the max-batch-size is the command line value MAX_BATCH_LOG_LINE=$(grep -a "\"max_batch_size\":5" $TRIAL.out) if [ "$MAX_BATCH_LOG_LINE" == "" ]; then @@ -260,13 +260,13 @@ else RET=1 fi - # Assert we are also turning on the dynamic_batcher + # Assert we are also turning on the dynamic_batcher DYNAMIC_BATCHING_LOG_LINE=$(grep -a "Starting dynamic-batcher thread" $SERVER_LOG) if [ "$DYNAMIC_BATCHING_LOG_LINE" == "" ]; then echo "*** FAILED: Expected dynamic batching to be set in model config but was not found\n" RET=1 fi - + kill $SERVER_PID wait $SERVER_PID fi @@ -296,7 +296,7 @@ else RET=1 fi - # Assert batching disabled + # Assert batching disabled if [ "$(grep -a -E '\"dynamic_batching\": \{}' $SERVER_LOG)" != "" ]; then echo "*** FAILED: Found dynamic batching in configuration when none expected.\n" RET=1 @@ -309,17 +309,17 @@ fi # # General backend tests -# +# -# We want to make sure that backend configurations +# We want to make sure that backend configurations # are not lost. 
For this purpose we are using only onnx backend rm -rf ./models/ mkdir -p ./models/no_config/ cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/onnx_float32_float32_float32/1 ./models/no_config/ -# First getting a baseline for the number of default configs -# added during a server set up +# First getting a baseline for the number of default configs +# added during a server set up SERVER_ARGS="$COMMON_ARGS" SERVER_LOG=$SERVER_LOG_BASE.default_configs.log run_server @@ -345,11 +345,11 @@ fi # Now make sure that when setting specific backend configs # default ones are not lost. # Current logic for backend config resolution reads default configs first, -# then specific configs and overrides defaults if needed. -# We would like to make sure that none of configs are lost and -# defaults are properly overriden. +# then specific configs and overrides defaults if needed. +# We would like to make sure that none of configs are lost and +# defaults are properly overridden. # One of defaultconfigs is `min-compute-capability`. This test -# checks if it is properlly overriden. +# checks if it is properlly overridden. MIN_COMPUTE_CAPABILITY=XX SERVER_ARGS="--backend-config=onnxruntime,min-compute-capability=$MIN_COMPUTE_CAPABILITY $COMMON_ARGS" SERVER_LOG=$SERVER_LOG_BASE.global_configs.log diff --git a/qa/L0_backend_fastertransformer/test.sh b/qa/L0_backend_fastertransformer/test.sh old mode 100644 new mode 100755 index 49d444392e..8e5d20271a --- a/qa/L0_backend_fastertransformer/test.sh +++ b/qa/L0_backend_fastertransformer/test.sh @@ -43,7 +43,7 @@ rm -f $SERVER_LOG* $CLIENT_LOG* RET=0 # install dependencies apt-get update && \ - apt-get install -y --no-install-recommends python3 python3-pip python3-protobuf + apt-get install -y --no-install-recommends python3 python3-pip python3-protobuf python3 -m pip install --upgrade pip && \ pip3 install --upgrade numpy diff --git a/qa/L0_backend_identity/identity_test.py b/qa/L0_backend_identity/identity_test.py old mode 100644 new mode 100755 index e9b3465050..ef0634b95c --- a/qa/L0_backend_identity/identity_test.py +++ b/qa/L0_backend_identity/identity_test.py @@ -27,42 +27,45 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse -import numpy as np import sys -import requests as httpreq from builtins import range + +import numpy as np +import requests as httpreq import tritongrpcclient as grpcclient import tritonhttpclient as httpclient from tritonclientutils import np_to_triton_dtype FLAGS = None -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') parser.add_argument( - '-i', - '--protocol', + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-u", "--url", type=str, required=False, help="Inference server URL." + ) + parser.add_argument( + "-i", + "--protocol", type=str, required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') + default="http", + help='Protocol ("http"/"grpc") used to ' + + 'communicate with inference service. 
Default is "http".', + ) FLAGS = parser.parse_args() if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) + print( + 'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol) + ) exit(1) client_util = httpclient if FLAGS.protocol == "http" else grpcclient @@ -77,17 +80,18 @@ model_name = "identity_uint32" request_parallelism = 4 shape = [2, 2] - with client_util.InferenceServerClient(FLAGS.url, - concurrency=request_parallelism, - verbose=FLAGS.verbose) as client: + with client_util.InferenceServerClient( + FLAGS.url, concurrency=request_parallelism, verbose=FLAGS.verbose + ) as client: input_datas = [] requests = [] for i in range(request_parallelism): input_data = (16384 * np.random.randn(*shape)).astype(np.uint32) input_datas.append(input_data) inputs = [ - client_util.InferInput("INPUT0", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + client_util.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) requests.append(client.async_infer(model_name, inputs)) @@ -104,32 +108,44 @@ sys.exit(1) if not np.array_equal(output_data, input_datas[i]): - print("error: expected output {} to match input {}".format( - output_data, input_datas[i])) + print( + "error: expected output {} to match input {}".format( + output_data, input_datas[i] + ) + ) sys.exit(1) # Make sure the requests ran in parallel. stats = client.get_inference_statistics(model_name) - if (len(stats['model_stats']) != - 1) or (stats['model_stats'][0]['name'] != model_name): + if (len(stats["model_stats"]) != 1) or ( + stats["model_stats"][0]["name"] != model_name + ): print("error: expected statistics for {}".format(model_name)) sys.exit(1) - stat = stats['model_stats'][0] - if (stat['inference_count'] != 8) or (stat['execution_count'] != 1): + stat = stats["model_stats"][0] + if (stat["inference_count"] != 8) or (stat["execution_count"] != 1): print( - "error: expected execution_count == 1 and inference_count == 8, got {} and {}" - .format(stat['execution_count'], stat['inference_count'])) + "error: expected execution_count == 1 and inference_count == 8, got {} and {}".format( + stat["execution_count"], stat["inference_count"] + ) + ) sys.exit(1) # Check metrics to make sure they are reported correctly - metrics = httpreq.get('http://localhost:8002/metrics') + metrics = httpreq.get("http://localhost:8002/metrics") print(metrics.text) - success_str = 'nv_inference_request_success{model="identity_uint32",version="1"}' + success_str = ( + 'nv_inference_request_success{model="identity_uint32",version="1"}' + ) infer_count_str = 'nv_inference_count{model="identity_uint32",version="1"}' - infer_exec_str = 'nv_inference_exec_count{model="identity_uint32",version="1"}' - custom_metric_str = 'input_byte_size_counter{model="identity_uint32",version="1"}' + infer_exec_str = ( + 'nv_inference_exec_count{model="identity_uint32",version="1"}' + ) + custom_metric_str = ( + 'input_byte_size_counter{model="identity_uint32",version="1"}' + ) success_val = None infer_count_val = None @@ -137,36 +153,47 @@ custom_metric_val = None for line in metrics.text.splitlines(): if line.startswith(success_str): - success_val = float(line[len(success_str):]) + success_val = float(line[len(success_str) :]) if line.startswith(infer_count_str): - infer_count_val = float(line[len(infer_count_str):]) + infer_count_val = float(line[len(infer_count_str) :]) if 
line.startswith(infer_exec_str): - infer_exec_val = float(line[len(infer_exec_str):]) + infer_exec_val = float(line[len(infer_exec_str) :]) if line.startswith(custom_metric_str): - custom_metric_val = float(line[len(custom_metric_str):]) + custom_metric_val = float(line[len(custom_metric_str) :]) if success_val != 4: - print("error: expected metric {} == 4, got {}".format( - success_str, success_val)) + print( + "error: expected metric {} == 4, got {}".format( + success_str, success_val + ) + ) sys.exit(1) if infer_count_val != 8: - print("error: expected metric {} == 8, got {}".format( - infer_count_str, infer_count_val)) + print( + "error: expected metric {} == 8, got {}".format( + infer_count_str, infer_count_val + ) + ) sys.exit(1) if infer_exec_val != 1: - print("error: expected metric {} == 1, got {}".format( - infer_exec_str, infer_exec_val)) + print( + "error: expected metric {} == 1, got {}".format( + infer_exec_str, infer_exec_val + ) + ) sys.exit(1) if custom_metric_val != 64: - print("error: expected metric {} == 64, got {}".format( - custom_metric_str, custom_metric_val)) + print( + "error: expected metric {} == 64, got {}".format( + custom_metric_str, custom_metric_val + ) + ) sys.exit(1) # Reuse a single client for all sync tests - with client_util.InferenceServerClient(FLAGS.url, - verbose=FLAGS.verbose) as client: + with client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose) as client: for model_name, np_dtype, shape in ( - # yapf: disable + # yapf: disable ("identity_fp32", np.float32, [1, 0]), ("identity_fp32", np.float32, [1, 5]), ("identity_uint32", np.uint32, [4, 0]), @@ -175,22 +202,20 @@ ("identity_nobatch_int8", np.int8, [7]), ("identity_bytes", object, [1, 1]), ("identity_bf16", np.float32, [1, 0]), - ("identity_bf16", np.float32, [1, 5])): + ("identity_bf16", np.float32, [1, 5]) + ): # yapf: enable if np_dtype != object: input_data = (16384 * np.random.randn(*shape)).astype(np_dtype) else: - in0 = (16384 * np.ones(shape, dtype='int')) - in0n = np.array([str(x) for x in in0.reshape(in0.size)], - dtype=object) + in0 = 16384 * np.ones(shape, dtype="int") + in0n = np.array([str(x) for x in in0.reshape(in0.size)], dtype=object) input_data = in0n.reshape(in0.shape) if model_name != "identity_bf16": triton_type = np_to_triton_dtype(input_data.dtype) else: triton_type = "BF16" - inputs = [ - client_util.InferInput("INPUT0", input_data.shape, triton_type) - ] + inputs = [client_util.InferInput("INPUT0", input_data.shape, triton_type)] inputs[0].set_data_from_numpy(input_data) results = client.infer(model_name, inputs) @@ -201,41 +226,47 @@ if np_dtype == object: output_data = np.array( - [str(x, encoding='utf-8') for x in output_data.flatten()], - dtype=object).reshape(output_data.shape) + [str(x, encoding="utf-8") for x in output_data.flatten()], + dtype=object, + ).reshape(output_data.shape) if output_data is None: print("error: expected 'OUTPUT0'") sys.exit(1) if model_name == "identity_bf16": - if (input_data.shape != output_data.shape): + if input_data.shape != output_data.shape: print( - "error: expected output shape {} to match input shape {}" - .format(output_data.shape, input_data.shape)) + "error: expected output shape {} to match input shape {}".format( + output_data.shape, input_data.shape + ) + ) sys.exit(1) for input, output in zip( - np.nditer(input_data, - flags=["refs_ok", "zerosize_ok"], - order='C'), - np.nditer(output_data, - flags=["refs_ok", "zerosize_ok"], - order='C')): + np.nditer(input_data, flags=["refs_ok", "zerosize_ok"], 
order="C"), + np.nditer(output_data, flags=["refs_ok", "zerosize_ok"], order="C"), + ): if input.tobytes()[2:4] != output.tobytes()[2:4]: print( - "error: expected low-order bits of output {} to match low-order bits of input {}" - .format(output, input)) + "error: expected low-order bits of output {} to match low-order bits of input {}".format( + output, input + ) + ) sys.exit(1) - if output.tobytes()[0:2] != b'\x00\x00': + if output.tobytes()[0:2] != b"\x00\x00": print( - "error: expected output {} to have all-zero high-order bits, got {}" - .format(output, - output.tobytes()[0:2])) + "error: expected output {} to have all-zero high-order bits, got {}".format( + output, output.tobytes()[0:2] + ) + ) sys.exit(1) else: if not np.array_equal(output_data, input_data): - print("error: expected output {} to match input {}".format( - output_data, input_data)) + print( + "error: expected output {} to match input {}".format( + output_data, input_data + ) + ) sys.exit(1) # Make sure response parameters are correct @@ -252,8 +283,7 @@ param2 = params["param2"].bool_param if param0 != "an example string parameter": - print( - "error: expected 'param0' == 'an example string parameter'") + print("error: expected 'param0' == 'an example string parameter'") sys.exit(1) if param1 != 42: print("error: expected 'param1' == 42") diff --git a/qa/L0_backend_python/argument_validation/models/argument_validation/1/model.py b/qa/L0_backend_python/argument_validation/models/argument_validation/1/model.py old mode 100644 new mode 100755 index 8669132b3c..bd5fae1afe --- a/qa/L0_backend_python/argument_validation/models/argument_validation/1/model.py +++ b/qa/L0_backend_python/argument_validation/models/argument_validation/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,18 +26,18 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import numpy as np import unittest + +import numpy as np import triton_python_backend_utils as pb_utils class ArgumentValidationTest(unittest.TestCase): - def test_infer_request_args(self): # Dummy arguments used in the tests. 
- inputs = [pb_utils.Tensor('INPUT0', np.asarray([1, 2], dtype=np.int32))] - model_name = 'my_model' - requested_output_names = ['my_output'] + inputs = [pb_utils.Tensor("INPUT0", np.asarray([1, 2], dtype=np.int32))] + model_name = "my_model" + requested_output_names = ["my_output"] # # inputs field validation @@ -46,21 +48,24 @@ def test_infer_request_args(self): pb_utils.InferenceRequest( inputs=[None], model_name=model_name, - requested_output_names=requested_output_names) + requested_output_names=requested_output_names, + ) # Test None object as list of inputs with self.assertRaises(TypeError) as e: pb_utils.InferenceRequest( inputs=None, model_name=model_name, - requested_output_names=requested_output_names) + requested_output_names=requested_output_names, + ) # model_name validation with self.assertRaises(TypeError) as e: pb_utils.InferenceRequest( model_name=None, inputs=inputs, - requested_output_names=requested_output_names) + requested_output_names=requested_output_names, + ) # # Requested output name validations @@ -68,14 +73,14 @@ def test_infer_request_args(self): # Test list of None objects as requested_output_names with self.assertRaises(TypeError) as e: - pb_utils.InferenceRequest(requested_output_names=[None], - inputs=inputs, - model_name=model_name) + pb_utils.InferenceRequest( + requested_output_names=[None], inputs=inputs, model_name=model_name + ) with self.assertRaises(TypeError) as e: - pb_utils.InferenceRequest(requested_output_names=None, - inputs=inputs, - model_name=model_name) + pb_utils.InferenceRequest( + requested_output_names=None, inputs=inputs, model_name=model_name + ) # Other arguments validation @@ -85,7 +90,8 @@ def test_infer_request_args(self): requested_output_names=requested_output_names, inputs=inputs, model_name=model_name, - correleation_id=None) + correleation_id=None, + ) # request_id set to None with self.assertRaises(TypeError) as e: @@ -93,7 +99,8 @@ def test_infer_request_args(self): requested_output_names=requested_output_names, inputs=inputs, model_name=model_name, - request_id=None) + request_id=None, + ) # model_version set to None with self.assertRaises(TypeError) as e: @@ -101,7 +108,8 @@ def test_infer_request_args(self): requested_output_names=requested_output_names, inputs=inputs, model_name=model_name, - model_version=None) + model_version=None, + ) # flags set to None with self.assertRaises(TypeError) as e: @@ -109,17 +117,16 @@ def test_infer_request_args(self): requested_output_names=requested_output_names, inputs=inputs, model_name=model_name, - flags=None) + flags=None, + ) # Empty lists should not raise an exception - pb_utils.InferenceRequest(requested_output_names=[], - inputs=[], - model_name=model_name) + pb_utils.InferenceRequest( + requested_output_names=[], inputs=[], model_name=model_name + ) def test_infer_response_args(self): - outputs = [ - pb_utils.Tensor('OUTPUT0', np.asarray([1, 2], dtype=np.int32)) - ] + outputs = [pb_utils.Tensor("OUTPUT0", np.asarray([1, 2], dtype=np.int32))] # Test list of None object as output tensor with self.assertRaises(pb_utils.TritonModelException) as e: @@ -195,12 +202,15 @@ def execute(self, requests): responses = [] for _ in requests: # Run the unittest and store the results in InferenceResponse. 
- test = unittest.main('model', exit=False) + test = unittest.main("model", exit=False) responses.append( - pb_utils.InferenceResponse([ - pb_utils.Tensor( - 'OUTPUT0', - np.array([test.result.wasSuccessful()], - dtype=np.float16)) - ])) + pb_utils.InferenceResponse( + [ + pb_utils.Tensor( + "OUTPUT0", + np.array([test.result.wasSuccessful()], dtype=np.float16), + ) + ] + ) + ) return responses diff --git a/qa/L0_backend_python/argument_validation/test.sh b/qa/L0_backend_python/argument_validation/test.sh old mode 100644 new mode 100755 index f80ce3e84b..f47abb8485 --- a/qa/L0_backend_python/argument_validation/test.sh +++ b/qa/L0_backend_python/argument_validation/test.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/qa/L0_backend_python/bls/test.sh b/qa/L0_backend_python/bls/test.sh old mode 100644 new mode 100755 index 3f88df01f3..3d87cf7b65 --- a/qa/L0_backend_python/bls/test.sh +++ b/qa/L0_backend_python/bls/test.sh @@ -115,7 +115,7 @@ for TRIAL in non_decoupled decoupled ; do set +e export MODEL_NAME='bls' - python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 + python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then echo -e "\n***\n*** 'bls' $BLS_KIND test FAILED. \n***" cat $CLIENT_LOG @@ -130,7 +130,7 @@ for TRIAL in non_decoupled decoupled ; do fi export MODEL_NAME='bls_memory' - python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 + python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then echo -e "\n***\n*** 'bls_memory' $BLS_KIND test FAILED. \n***" cat $CLIENT_LOG @@ -145,7 +145,7 @@ for TRIAL in non_decoupled decoupled ; do fi export MODEL_NAME='bls_memory_async' - python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 + python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then echo -e "\n***\n*** 'bls_async_memory' $BLS_KIND test FAILED. \n***" cat $CLIENT_LOG @@ -160,7 +160,7 @@ for TRIAL in non_decoupled decoupled ; do fi export MODEL_NAME='bls_async' - python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 + python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then echo -e "\n***\n*** 'bls_async' $BLS_KIND test FAILED. \n***" cat $CLIENT_LOG diff --git a/qa/L0_backend_python/common.sh b/qa/L0_backend_python/common.sh old mode 100644 new mode 100755 index 074ad26da0..6030849fc9 --- a/qa/L0_backend_python/common.sh +++ b/qa/L0_backend_python/common.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -52,7 +53,7 @@ install_build_deps() { echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | \ tee /etc/apt/sources.list.d/kitware.list >/dev/null && \ apt-get update && \ - apt-get install -y --no-install-recommends cmake cmake-data + apt-get install -y --no-install-recommends cmake cmake-data } create_conda_env() { diff --git a/qa/L0_backend_python/custom_metrics/test.sh b/qa/L0_backend_python/custom_metrics/test.sh old mode 100644 new mode 100755 index 8842fa4ecf..149f5e5d56 --- a/qa/L0_backend_python/custom_metrics/test.sh +++ b/qa/L0_backend_python/custom_metrics/test.sh @@ -54,7 +54,7 @@ fi set +e export MODEL_NAME='custom_metrics' -python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 +python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then echo -e "\n***\n*** 'Custom Metrics' test FAILED. 
\n***" cat $CLIENT_LOG diff --git a/qa/L0_backend_python/decoupled/decoupled_test.py b/qa/L0_backend_python/decoupled/decoupled_test.py old mode 100644 new mode 100755 index 98b19b1cd2..4a4b77c661 --- a/qa/L0_backend_python/decoupled/decoupled_test.py +++ b/qa/L0_backend_python/decoupled/decoupled_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,18 +30,18 @@ sys.path.append("../../common") -import test_util as tu +import queue import time -import tritonclient.grpc as grpcclient -from tritonclient.utils import * -import numpy as np import unittest from functools import partial -import queue +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +from tritonclient.utils import * -class UserData: +class UserData: def __init__(self): self._completed_requests = queue.Queue() @@ -52,10 +54,9 @@ def callback(user_data, result, error): class DecoupledTest(tu.TestResultCollector): - def test_decoupled_execute_error(self): # The decoupled_execute_error model returns an error for the first - # request and sucessfully processes the second request. This is making + # request and successfully processes the second request. This is making # sure that an error in a single request does not completely fail the # batch. @@ -63,8 +64,7 @@ def test_decoupled_execute_error(self): shape = [2, 2] number_of_requests = 2 user_data = UserData() - with grpcclient.InferenceServerClient( - "localhost:8001") as triton_client: + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: triton_client.start_stream(callback=partial(callback, user_data)) input_datas = [] @@ -72,12 +72,12 @@ def test_decoupled_execute_error(self): input_data = np.random.randn(*shape).astype(np.float32) input_datas.append(input_data) inputs = [ - grpcclient.InferInput("IN", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) - triton_client.async_stream_infer(model_name=model_name, - inputs=inputs) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) for i in range(number_of_requests): result = user_data._completed_requests.get() @@ -91,27 +91,28 @@ def test_decoupled_execute_error(self): self.assertTrue( np.array_equal(output_data, input_datas[i]), "error: expected output {} to match input {}".format( - output_data, input_datas[i])) + output_data, input_datas[i] + ), + ) def test_decoupled_bls(self): # Test combinations of BLS and decoupled API in Python backend. 
model_name = "decoupled_bls" shape = [1, 2] user_data = UserData() - with grpcclient.InferenceServerClient( - "localhost:8001") as triton_client: + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: triton_client.start_stream(callback=partial(callback, user_data)) input_datas = [] input_data = np.random.randn(*shape).astype(np.float32) input_datas.append(input_data) inputs = [ - grpcclient.InferInput("IN", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) - triton_client.async_stream_infer(model_name=model_name, - inputs=inputs) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) # Check the results of the decoupled model using BLS def check_result(result): @@ -123,7 +124,9 @@ def check_result(result): self.assertTrue( np.array_equal(output_data, input_data), "error: expected output {} to match input {}".format( - output_data, input_data)) + output_data, input_data + ), + ) result = user_data._completed_requests.get() check_result(result) @@ -134,19 +137,19 @@ def test_decoupled_bls_stream(self): in_values = [4, 2, 0, 1] shape = [1] user_data = UserData() - with grpcclient.InferenceServerClient( - "localhost:8001") as triton_client: + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: triton_client.start_stream(callback=partial(callback, user_data)) for i in range(len(in_values)): input_data = np.array([in_values[i]], dtype=np.int32) inputs = [ - grpcclient.InferInput("IN", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) - triton_client.async_stream_infer(model_name=model_name, - inputs=inputs, - request_id=str(i)) + triton_client.async_stream_infer( + model_name=model_name, inputs=inputs, request_id=str(i) + ) # Retrieve results... 
recv_count = 0 @@ -172,23 +175,27 @@ def test_decoupled_bls_stream(self): if in_values[i] != 0: self.assertTrue( is_received, - "response for request id {} not received".format( - this_id)) + "response for request id {} not received".format(this_id), + ) self.assertEqual(len(result_dict[this_id]), in_values[i]) result_list = result_dict[this_id] expected_data = np.array([in_values[i]], dtype=np.int32) for j in range(len(result_list)): - this_data = result_list[j][1].as_numpy('OUT') + this_data = result_list[j][1].as_numpy("OUT") self.assertTrue( np.array_equal(expected_data, this_data), "error: incorrect data: expected {}, got {}".format( - expected_data, this_data)) + expected_data, this_data + ), + ) else: self.assertFalse( is_received, "received unexpected response for request id {}".format( - this_id)) + this_id + ), + ) def test_decoupled_return_response_error(self): model_name = "decoupled_return_response_error" @@ -199,10 +206,12 @@ def test_decoupled_return_response_error(self): input_data_0 = np.random.random(shape).astype(np.float32) input_data_1 = np.random.random(shape).astype(np.float32) inputs = [ - grpcclient.InferInput("INPUT0", input_data_0.shape, - np_to_triton_dtype(input_data_0.dtype)), - grpcclient.InferInput("INPUT1", input_data_1.shape, - np_to_triton_dtype(input_data_1.dtype)) + grpcclient.InferInput( + "INPUT0", input_data_0.shape, np_to_triton_dtype(input_data_0.dtype) + ), + grpcclient.InferInput( + "INPUT1", input_data_1.shape, np_to_triton_dtype(input_data_1.dtype) + ), ] inputs[0].set_data_from_numpy(input_data_0) inputs[1].set_data_from_numpy(input_data_1) @@ -213,7 +222,9 @@ def test_decoupled_return_response_error(self): data_item.message(), "Python model 'decoupled_return_response_error_0' is using " "the decoupled mode and the execute function must return " - "None.", "Exception message didn't match.") + "None.", + "Exception message didn't match.", + ) def test_decoupled_send_after_close_error(self): model_name = "decoupled_send_after_close_error" @@ -224,10 +235,12 @@ def test_decoupled_send_after_close_error(self): input_data_0 = np.random.random(shape).astype(np.float32) input_data_1 = np.random.random(shape).astype(np.float32) inputs = [ - grpcclient.InferInput("INPUT0", input_data_0.shape, - np_to_triton_dtype(input_data_0.dtype)), - grpcclient.InferInput("INPUT1", input_data_1.shape, - np_to_triton_dtype(input_data_1.dtype)) + grpcclient.InferInput( + "INPUT0", input_data_0.shape, np_to_triton_dtype(input_data_0.dtype) + ), + grpcclient.InferInput( + "INPUT1", input_data_1.shape, np_to_triton_dtype(input_data_1.dtype) + ), ] inputs[0].set_data_from_numpy(input_data_0) inputs[1].set_data_from_numpy(input_data_1) @@ -237,9 +250,12 @@ def test_decoupled_send_after_close_error(self): # way to deliver the error message to the client. The error # will be logged on the server side. 
time.sleep(4) - self.assertEqual(user_data._completed_requests.qsize(), 0, - "The completed request size must be zero.") + self.assertEqual( + user_data._completed_requests.qsize(), + 0, + "The completed request size must be zero.", + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/decoupled/models/decoupled_bls/1/model.py b/qa/L0_backend_python/decoupled/models/decoupled_bls/1/model.py old mode 100644 new mode 100755 index 84e43eccf9..901e4c46b7 --- a/qa/L0_backend_python/decoupled/models/decoupled_bls/1/model.py +++ b/qa/L0_backend_python/decoupled/models/decoupled_bls/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,19 +26,19 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import triton_python_backend_utils as pb_utils import json +import sys import threading import time + import numpy as np import torch +import triton_python_backend_utils as pb_utils from torch.utils.dlpack import from_dlpack, to_dlpack -import sys class TritonPythonModel: - """ This model sends an error message with the first request. - """ + """This model sends an error message with the first request.""" def initialize(self, args): logger = pb_utils.Logger @@ -45,22 +47,25 @@ def initialize(self, args): logger.log_warn("Initialize-Warning Msg!") logger.log_error("Initialize-Error Msg!") # You must parse model_config. JSON string is not parsed here - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) using_decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) + model_config + ) if not using_decoupled: raise pb_utils.TritonModelException( """the model `{}` can generate any number of responses per request, enable decoupled transaction policy in model configuration to - serve this model""".format(args['model_name'])) + serve this model""".format( + args["model_name"] + ) + ) # Get OUT configuration out_config = pb_utils.get_output_config_by_name(model_config, "OUT") # Convert Triton types to numpy types - self.out_dtype = pb_utils.triton_string_to_numpy( - out_config['data_type']) + self.out_dtype = pb_utils.triton_string_to_numpy(out_config["data_type"]) self.inflight_thread_count = 0 self.inflight_thread_count_lck = threading.Lock() @@ -71,8 +76,7 @@ def initialize(self, args): logger.log_error("Initialize-Error Msg!") def execute(self, requests): - """ This function is called on inference request. 
- """ + """This function is called on inference request.""" logger = pb_utils.Logger logger.log("Execute-Specific Msg!", logger.INFO) logger.log_info("Execute-Info Msg!") @@ -80,30 +84,33 @@ def execute(self, requests): logger.log_error("Execute-Error Msg!") # Only generate the error for the first request for i, request in enumerate(requests): - request_input = pb_utils.get_input_tensor_by_name(request, 'IN') + request_input = pb_utils.get_input_tensor_by_name(request, "IN") # Sync BLS request infer_request = pb_utils.InferenceRequest( - model_name='identity_fp32', + model_name="identity_fp32", requested_output_names=["OUTPUT0"], - inputs=[pb_utils.Tensor('INPUT0', request_input.as_numpy())]) + inputs=[pb_utils.Tensor("INPUT0", request_input.as_numpy())], + ) infer_response = infer_request.exec() if infer_response.has_error(): raise pb_utils.TritonModelException( f"BLS Response has an error: {infer_response.error().message()}" ) - output0 = pb_utils.get_output_tensor_by_name( - infer_response, "OUTPUT0") + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") if np.any(output0.as_numpy() != request_input.as_numpy()): raise pb_utils.TritonModelException( f"BLS Request input and BLS response output do not match. {request_input.as_numpy()} != {output0.as_numpy()}" ) - thread1 = threading.Thread(target=self.response_thread, - args=(request.get_response_sender(), - pb_utils.get_input_tensor_by_name( - request, 'IN').as_numpy())) + thread1 = threading.Thread( + target=self.response_thread, + args=( + request.get_response_sender(), + pb_utils.get_input_tensor_by_name(request, "IN").as_numpy(), + ), + ) thread1.daemon = True with self.inflight_thread_count_lck: self.inflight_thread_count += 1 @@ -131,15 +138,16 @@ def _get_gpu_bls_outputs(self, input0_pb, input1_pb): logger.log_error("_get_gpu_bls_outputs-Error Msg!") infer_request = pb_utils.InferenceRequest( - model_name='dlpack_add_sub', + model_name="dlpack_add_sub", inputs=[input0_pb, input1_pb], - requested_output_names=['OUTPUT0', 'OUTPUT1']) + requested_output_names=["OUTPUT0", "OUTPUT1"], + ) infer_response = infer_request.exec() if infer_response.has_error(): return False - output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0') - output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1') + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + output1 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT1") if output0 is None or output1 is None: return False @@ -193,30 +201,32 @@ def _test_gpu_bls_add_sub(self, is_input0_gpu, is_input1_gpu): input1 = torch.rand(16) if is_input0_gpu: - input0 = input0.to('cuda') + input0 = input0.to("cuda") if is_input1_gpu: - input1 = input1.to('cuda') + input1 = input1.to("cuda") - input0_pb = pb_utils.Tensor.from_dlpack('INPUT0', to_dlpack(input0)) - input1_pb = pb_utils.Tensor.from_dlpack('INPUT1', to_dlpack(input1)) + input0_pb = pb_utils.Tensor.from_dlpack("INPUT0", to_dlpack(input0)) + input1_pb = pb_utils.Tensor.from_dlpack("INPUT1", to_dlpack(input1)) gpu_bls_return = self._get_gpu_bls_outputs(input0_pb, input1_pb) if gpu_bls_return: output0_dlpack, output1_dlpack = gpu_bls_return else: return False - expected_output_0 = from_dlpack( - input0_pb.to_dlpack()).to('cpu') + from_dlpack( - input1_pb.to_dlpack()).to('cpu') - expected_output_1 = from_dlpack( - input0_pb.to_dlpack()).to('cpu') - from_dlpack( - input1_pb.to_dlpack()).to('cpu') + expected_output_0 = from_dlpack(input0_pb.to_dlpack()).to("cpu") + from_dlpack( + 
input1_pb.to_dlpack() + ).to("cpu") + expected_output_1 = from_dlpack(input0_pb.to_dlpack()).to("cpu") - from_dlpack( + input1_pb.to_dlpack() + ).to("cpu") output0_matches = torch.all( - expected_output_0 == from_dlpack(output0_dlpack).to('cpu')) + expected_output_0 == from_dlpack(output0_dlpack).to("cpu") + ) output1_matches = torch.all( - expected_output_1 == from_dlpack(output1_dlpack).to('cpu')) + expected_output_1 == from_dlpack(output1_dlpack).to("cpu") + ) if not output0_matches or not output1_matches: return False @@ -230,8 +240,7 @@ def execute_gpu_bls(self): logger.log_error("execute_gpu_bls-Error Msg!") for input0_device in [True, False]: for input1_device in [True, False]: - test_status = self._test_gpu_bls_add_sub( - input0_device, input1_device) + test_status = self._test_gpu_bls_add_sub(input0_device, input1_device) if not test_status: return False @@ -250,39 +259,39 @@ def response_thread(self, response_sender, in_input): status = self.execute_gpu_bls() if not status: - infer_response = pb_utils.InferenceResponse( - error="GPU BLS test failed.") + infer_response = pb_utils.InferenceResponse(error="GPU BLS test failed.") response_sender.send(infer_response) else: in_value = in_input infer_request = pb_utils.InferenceRequest( - model_name='identity_fp32', + model_name="identity_fp32", requested_output_names=["OUTPUT0"], - inputs=[pb_utils.Tensor('INPUT0', in_input)]) + inputs=[pb_utils.Tensor("INPUT0", in_input)], + ) infer_response = infer_request.exec() - output0 = pb_utils.get_output_tensor_by_name( - infer_response, "OUTPUT0") + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") if infer_response.has_error(): response = pb_utils.InferenceResponse( - error=infer_response.error().message()) + error=infer_response.error().message() + ) response_sender.send( - response, - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) elif np.any(in_input != output0.as_numpy()): error_message = ( "BLS Request input and BLS response output do not match." - f" {in_value} != {output0.as_numpy()}") + f" {in_value} != {output0.as_numpy()}" + ) response = pb_utils.InferenceResponse(error=error_message) response_sender.send( - response, - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) else: - output_tensors = [pb_utils.Tensor('OUT', in_value)] - response = pb_utils.InferenceResponse( - output_tensors=output_tensors) + output_tensors = [pb_utils.Tensor("OUT", in_value)] + response = pb_utils.InferenceResponse(output_tensors=output_tensors) response_sender.send( - response, - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) with self.inflight_thread_count_lck: self.inflight_thread_count -= 1 @@ -297,13 +306,13 @@ def finalize(self): the model to perform any necessary clean ups before exit. 
""" logger = pb_utils.Logger - logger.log_info('Finalize invoked') + logger.log_info("Finalize invoked") inflight_threads = True while inflight_threads: with self.inflight_thread_count_lck: - inflight_threads = (self.inflight_thread_count != 0) + inflight_threads = self.inflight_thread_count != 0 if inflight_threads: time.sleep(0.1) - logger.log_info('Finalize complete...') + logger.log_info("Finalize complete...") diff --git a/qa/L0_backend_python/decoupled/models/decoupled_bls_stream/1/model.py b/qa/L0_backend_python/decoupled/models/decoupled_bls_stream/1/model.py old mode 100644 new mode 100755 index 81bb397115..e6334d34dc --- a/qa/L0_backend_python/decoupled/models/decoupled_bls_stream/1/model.py +++ b/qa/L0_backend_python/decoupled/models/decoupled_bls_stream/1/model.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -24,11 +26,12 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import triton_python_backend_utils as pb_utils import json import threading import time + import numpy as np +import triton_python_backend_utils as pb_utils class TritonPythonModel: @@ -38,28 +41,34 @@ class TritonPythonModel: def initialize(self, args): # You must parse model_config. JSON string is not parsed here - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) using_decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) + model_config + ) if not using_decoupled: raise pb_utils.TritonModelException( """the model `{}` can generate any number of responses per request, enable decoupled transaction policy in model configuration to - serve this model""".format(args['model_name'])) + serve this model""".format( + args["model_name"] + ) + ) self.inflight_thread_count = 0 self.inflight_thread_count_lck = threading.Lock() def execute(self, requests): - """ This function is called on inference request. 
- """ + """This function is called on inference request.""" for request in requests: - thread = threading.Thread(target=self.response_thread, - args=(request.get_response_sender(), - pb_utils.get_input_tensor_by_name( - request, 'IN').as_numpy())) + thread = threading.Thread( + target=self.response_thread, + args=( + request.get_response_sender(), + pb_utils.get_input_tensor_by_name(request, "IN").as_numpy(), + ), + ) thread.daemon = True with self.inflight_thread_count_lck: self.inflight_thread_count += 1 @@ -69,50 +78,49 @@ def execute(self, requests): def response_thread(self, response_sender, in_value): infer_request = pb_utils.InferenceRequest( - model_name='square_int32', + model_name="square_int32", requested_output_names=["OUT"], - inputs=[pb_utils.Tensor('IN', in_value)]) + inputs=[pb_utils.Tensor("IN", in_value)], + ) infer_responses = infer_request.exec(decoupled=True) response_count = 0 for infer_response in infer_responses: if len(infer_response.output_tensors()) > 0: - output0 = pb_utils.get_output_tensor_by_name( - infer_response, "OUT") + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT") if infer_response.has_error(): response = pb_utils.InferenceResponse( - error=infer_response.error().message()) + error=infer_response.error().message() + ) response_sender.send( - response, - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) elif np.any(in_value != output0.as_numpy()): error_message = ( "BLS Request input and BLS response output do not match." - f" {in_value} != {output0.as_numpy()}") + f" {in_value} != {output0.as_numpy()}" + ) response = pb_utils.InferenceResponse(error=error_message) response_sender.send( - response, - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) else: - output_tensors = [ - pb_utils.Tensor('OUT', output0.as_numpy()) - ] - response = pb_utils.InferenceResponse( - output_tensors=output_tensors) + output_tensors = [pb_utils.Tensor("OUT", output0.as_numpy())] + response = pb_utils.InferenceResponse(output_tensors=output_tensors) response_sender.send(response) response_count += 1 if in_value != response_count - 1: - error_message = ("Expected {} responses, got {}".format( - in_value, - len(infer_responses) - 1)) + error_message = "Expected {} responses, got {}".format( + in_value, len(infer_responses) - 1 + ) response = pb_utils.InferenceResponse(error=error_message) response_sender.send( - response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) else: - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) with self.inflight_thread_count_lck: self.inflight_thread_count -= 1 @@ -121,6 +129,6 @@ def finalize(self): inflight_threads = True while inflight_threads: with self.inflight_thread_count_lck: - inflight_threads = (self.inflight_thread_count != 0) + inflight_threads = self.inflight_thread_count != 0 if inflight_threads: time.sleep(0.1) diff --git a/qa/L0_backend_python/decoupled/models/decoupled_execute_error/1/model.py b/qa/L0_backend_python/decoupled/models/decoupled_execute_error/1/model.py old mode 100644 new mode 100755 index 1a7bd7abed..ecdb7df322 --- a/qa/L0_backend_python/decoupled/models/decoupled_execute_error/1/model.py +++ b/qa/L0_backend_python/decoupled/models/decoupled_execute_error/1/model.py @@ -1,4 +1,6 
@@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,49 +26,55 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import triton_python_backend_utils as pb_utils import json import threading import time +import triton_python_backend_utils as pb_utils + class TritonPythonModel: - """ This model sends an error message with the first request. - """ + """This model sends an error message with the first request.""" def initialize(self, args): # You must parse model_config. JSON string is not parsed here - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) using_decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) + model_config + ) if not using_decoupled: raise pb_utils.TritonModelException( """the model `{}` can generate any number of responses per request, enable decoupled transaction policy in model configuration to - serve this model""".format(args['model_name'])) + serve this model""".format( + args["model_name"] + ) + ) # Get OUT configuration out_config = pb_utils.get_output_config_by_name(model_config, "OUT") # Convert Triton types to numpy types - self.out_dtype = pb_utils.triton_string_to_numpy( - out_config['data_type']) + self.out_dtype = pb_utils.triton_string_to_numpy(out_config["data_type"]) self.inflight_thread_count = 0 self.inflight_thread_count_lck = threading.Lock() def execute(self, requests): - """ This function is called on inference request. - """ + """This function is called on inference request.""" # Only generate the error for the first request for i, request in enumerate(requests): # Start a separate thread to send the responses for the request. - thread = threading.Thread(target=self.response_thread, - args=(request.get_response_sender(), i, - pb_utils.get_input_tensor_by_name( - request, 'IN').as_numpy())) + thread = threading.Thread( + target=self.response_thread, + args=( + request.get_response_sender(), + i, + pb_utils.get_input_tensor_by_name(request, "IN").as_numpy(), + ), + ) thread.daemon = True with self.inflight_thread_count_lck: @@ -86,9 +94,10 @@ def response_thread(self, response_sender, index, in_input): out_output = pb_utils.Tensor("OUT", in_value) if index == 0: - error = pb_utils.TritonError('An error occured during execution') - response = pb_utils.InferenceResponse(output_tensors=[out_output], - error=error) + error = pb_utils.TritonError("An error occurred during execution") + response = pb_utils.InferenceResponse( + output_tensors=[out_output], error=error + ) else: response = pb_utils.InferenceResponse(output_tensors=[out_output]) response_sender.send(response) @@ -96,8 +105,7 @@ def response_thread(self, response_sender, index, in_input): # We must close the response sender to indicate to Triton that we are # done sending responses for the corresponding request. We can't use the # response sender after closing it. 
- response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) with self.inflight_thread_count_lck: self.inflight_thread_count -= 1 @@ -107,13 +115,13 @@ def finalize(self): Implementing `finalize` function is OPTIONAL. This function allows the model to perform any necessary clean ups before exit. """ - print('Finalize invoked') + print("Finalize invoked") inflight_threads = True while inflight_threads: with self.inflight_thread_count_lck: - inflight_threads = (self.inflight_thread_count != 0) + inflight_threads = self.inflight_thread_count != 0 if inflight_threads: time.sleep(0.1) - print('Finalize complete...') + print("Finalize complete...") diff --git a/qa/L0_backend_python/decoupled/models/decoupled_return_response_error/1/model.py b/qa/L0_backend_python/decoupled/models/decoupled_return_response_error/1/model.py old mode 100644 new mode 100755 index 4c882481cf..10b9ef12fe --- a/qa/L0_backend_python/decoupled/models/decoupled_return_response_error/1/model.py +++ b/qa/L0_backend_python/decoupled/models/decoupled_return_response_error/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,37 +27,42 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import json + import triton_python_backend_utils as pb_utils class TritonPythonModel: - """ This model tries to return a response directly from + """This model tries to return a response directly from execute function when configured as decoupled model. """ def initialize(self, args): - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) using_decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) + model_config + ) if not using_decoupled: raise pb_utils.TritonModelException( """the model `{}` can generate any number of responses per request, - enable decoupled transaction policy in model configuration to - serve this model""".format(args['model_name'])) + enable decoupled transaction policy in model configuration to + serve this model""".format( + args["model_name"] + ) + ) - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT0") - output1_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT1") + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) + output0_config["data_type"] + ) self.output1_dtype = pb_utils.triton_string_to_numpy( - output1_config['data_type']) + output1_config["data_type"] + ) def execute(self, requests): - """ Tries to create a response sender object and use that + """Tries to create a response sender object and use that for sending the response. 
""" @@ -66,13 +73,12 @@ def execute(self, requests): for request in requests: in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") - out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(), - in_0.as_numpy() - in_1.as_numpy()) + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) - out_tensor_0 = pb_utils.Tensor("OUTPUT0", - out_0.astype(output0_dtype)) - out_tensor_1 = pb_utils.Tensor("OUTPUT1", - out_1.astype(output1_dtype)) - responses.append( - pb_utils.InferenceResponse([out_tensor_0, out_tensor_1])) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) + responses.append(pb_utils.InferenceResponse([out_tensor_0, out_tensor_1])) return responses diff --git a/qa/L0_backend_python/decoupled/models/decoupled_send_after_close_error/1/model.py b/qa/L0_backend_python/decoupled/models/decoupled_send_after_close_error/1/model.py old mode 100644 new mode 100755 index 9611c2875c..aeab19851c --- a/qa/L0_backend_python/decoupled/models/decoupled_send_after_close_error/1/model.py +++ b/qa/L0_backend_python/decoupled/models/decoupled_send_after_close_error/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,44 +27,50 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import json + import triton_python_backend_utils as pb_utils class TritonPythonModel: - """ This model tries to send response after closing + """This model tries to send response after closing the response_sender. """ def initialize(self, args): - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) using_decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) + model_config + ) if not using_decoupled: raise pb_utils.TritonModelException( """the model `{}` can generate any number of responses per request, - enable decoupled transaction policy in model configuration to - serve this model""".format(args['model_name'])) + enable decoupled transaction policy in model configuration to + serve this model""".format( + args["model_name"] + ) + ) - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT0") - output1_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT1") + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) + output0_config["data_type"] + ) self.output1_dtype = pb_utils.triton_string_to_numpy( - output1_config['data_type']) + output1_config["data_type"] + ) def execute(self, requests): - """ Create a response sender object and use that + """Create a response sender object and use that for sending the response. """ # This model does not support batching, so 'request_count' should always be 1. 
if len(requests) != 1: - raise pb_utils.TritonModelException("unsupported batch size " + - len(requests)) + raise pb_utils.TritonModelException( + "unsupported batch size " + len(requests) + ) output0_dtype = self.output0_dtype output1_dtype = self.output1_dtype @@ -70,13 +78,14 @@ def execute(self, requests): response_sender = requests[0].get_response_sender() in_0 = pb_utils.get_input_tensor_by_name(requests[0], "INPUT0") in_1 = pb_utils.get_input_tensor_by_name(requests[0], "INPUT1") - out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(), - in_0.as_numpy() - in_1.as_numpy()) + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) response = pb_utils.InferenceResponse([out_tensor_0, out_tensor_1]) - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) response_sender.send(response) diff --git a/qa/L0_backend_python/decoupled/test.sh b/qa/L0_backend_python/decoupled/test.sh old mode 100644 new mode 100755 index c71055a511..0e316c8452 --- a/qa/L0_backend_python/decoupled/test.sh +++ b/qa/L0_backend_python/decoupled/test.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/qa/L0_backend_python/ensemble/ensemble_test.py b/qa/L0_backend_python/ensemble/ensemble_test.py old mode 100644 new mode 100755 index f0cceed4e7..64ddc3816f --- a/qa/L0_backend_python/ensemble/ensemble_test.py +++ b/qa/L0_backend_python/ensemble/ensemble_test.py @@ -1,4 +1,6 @@ -# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,16 +30,16 @@ sys.path.append("../../common") -import test_util as tu +import unittest + +import numpy as np import shm_util +import test_util as tu import tritonclient.http as httpclient from tritonclient.utils import * -import numpy as np -import unittest class EnsembleTest(tu.TestResultCollector): - def setUp(self): self._shm_leak_detector = shm_util.ShmLeakDetector() @@ -50,17 +52,21 @@ def test_ensemble(self): input_data_1 = np.random.random(shape).astype(np.float32) inputs = [ httpclient.InferInput( - "INPUT0", input_data_0.shape, - np_to_triton_dtype(input_data_0.dtype)), + "INPUT0", + input_data_0.shape, + np_to_triton_dtype(input_data_0.dtype), + ), httpclient.InferInput( - "INPUT1", input_data_1.shape, - np_to_triton_dtype(input_data_1.dtype)) + "INPUT1", + input_data_1.shape, + np_to_triton_dtype(input_data_1.dtype), + ), ] inputs[0].set_data_from_numpy(input_data_0) inputs[1].set_data_from_numpy(input_data_1) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') - output1 = result.as_numpy('OUTPUT1') + output0 = result.as_numpy("OUTPUT0") + output1 = result.as_numpy("OUTPUT1") self.assertIsNotNone(output0) self.assertIsNotNone(output1) @@ -74,17 +80,21 @@ def test_ensemble(self): input_data_1 = np.random.random(shape).astype(np.float32) inputs = [ httpclient.InferInput( - "INPUT0", input_data_0.shape, - np_to_triton_dtype(input_data_0.dtype)), + "INPUT0", + input_data_0.shape, + np_to_triton_dtype(input_data_0.dtype), + ), httpclient.InferInput( - "INPUT1", input_data_1.shape, - np_to_triton_dtype(input_data_1.dtype)) + "INPUT1", + input_data_1.shape, + np_to_triton_dtype(input_data_1.dtype), + ), ] inputs[0].set_data_from_numpy(input_data_0) inputs[1].set_data_from_numpy(input_data_1) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') - output1 = result.as_numpy('OUTPUT1') + output0 = result.as_numpy("OUTPUT0") + output1 = result.as_numpy("OUTPUT1") self.assertIsNotNone(output0) self.assertIsNotNone(output1) @@ -92,5 +102,5 @@ def test_ensemble(self): self.assertTrue(np.allclose(output1, 2 * input_data_1)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/ensemble/test.sh b/qa/L0_backend_python/ensemble/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_backend_python/env/test.sh b/qa/L0_backend_python/env/test.sh old mode 100644 new mode 100755 index a32c4036a4..4161be5b49 --- a/qa/L0_backend_python/env/test.sh +++ b/qa/L0_backend_python/env/test.sh @@ -210,7 +210,7 @@ wait $SERVER_PID set +e -PY310_ENV_EXTRACTION="Extracting Python execution env" +PY310_ENV_EXTRACTION="Extracting Python execution env" if [ `grep -c "${PY310_ENV_EXTRACTION}" ${SERVER_LOG}` != "2" ]; then cat $SERVER_LOG echo -e "\n***\n*** Python execution environment should be extracted exactly twice. \n***" diff --git a/qa/L0_backend_python/examples/test.sh b/qa/L0_backend_python/examples/test.sh old mode 100644 new mode 100755 index 2c94904135..bbad8b5bfd --- a/qa/L0_backend_python/examples/test.sh +++ b/qa/L0_backend_python/examples/test.sh @@ -37,7 +37,7 @@ SERVER_LOG="./inference_server.log" RET=0 rm -fr *.log python_backend/ -# Install torch +# Install torch # Skip torch and torchvision install on Jetson since it is already installed. 
if [ "$TEST_JETSON" == "0" ]; then pip3 uninstall -y torch diff --git a/qa/L0_backend_python/io/io_test.py b/qa/L0_backend_python/io/io_test.py old mode 100644 new mode 100755 index d054ee54a8..9adb4414ab --- a/qa/L0_backend_python/io/io_test.py +++ b/qa/L0_backend_python/io/io_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,21 +30,21 @@ sys.path.append("../../common") +import os +import queue +import unittest from functools import partial -import test_util as tu + +import numpy as np import shm_util +import test_util as tu import tritonclient.grpc as grpcclient from tritonclient.utils import * -import numpy as np -import unittest -import queue -import os -TRIAL = os.getenv('TRIAL') +TRIAL = os.getenv("TRIAL") class UserData: - def __init__(self): self._completed_requests = queue.Queue() @@ -55,7 +57,6 @@ def callback(user_data, result, error): class IOTest(tu.TestResultCollector): - def setUp(self): self._shm_leak_detector = shm_util.ShmLeakDetector() self._client = grpcclient.InferenceServerClient("localhost:8001") @@ -69,60 +70,66 @@ def _run_ensemble_test(self): for model_2_in_gpu in [True, False]: for model_3_in_gpu in [True, False]: gpu_output = np.asarray( - [model_1_in_gpu, model_2_in_gpu, model_3_in_gpu], - dtype=bool) + [model_1_in_gpu, model_2_in_gpu, model_3_in_gpu], dtype=bool + ) inputs = [ - grpcclient.InferInput("INPUT0", input0.shape, - np_to_triton_dtype(input0.dtype)), grpcclient.InferInput( - "GPU_OUTPUT", gpu_output.shape, - np_to_triton_dtype(gpu_output.dtype)) + "INPUT0", input0.shape, np_to_triton_dtype(input0.dtype) + ), + grpcclient.InferInput( + "GPU_OUTPUT", + gpu_output.shape, + np_to_triton_dtype(gpu_output.dtype), + ), ] inputs[0].set_data_from_numpy(input0) inputs[1].set_data_from_numpy(gpu_output) - self._client.async_stream_infer(model_name=model_name, - inputs=inputs) - if TRIAL == 'default': + self._client.async_stream_infer( + model_name=model_name, inputs=inputs + ) + if TRIAL == "default": result = user_data._completed_requests.get() - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertIsNotNone(output0) self.assertTrue(np.all(output0 == input0)) else: response_repeat = 2 for _ in range(response_repeat): result = user_data._completed_requests.get() - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertIsNotNone(output0) self.assertTrue(np.all(output0 == input0)) def test_ensemble_io(self): # Only run the shared memory leak detection with the default trial - if TRIAL == 'default': + if TRIAL == "default": with self._shm_leak_detector.Probe(): self._run_ensemble_test() else: self._run_ensemble_test() def test_empty_gpu_output(self): - model_name = 'dlpack_empty_output' + model_name = "dlpack_empty_output" input_data = np.array([[1.0]], dtype=np.float32) inputs = [ - grpcclient.InferInput("INPUT", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + grpcclient.InferInput( + "INPUT", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) result = self._client.infer(model_name, inputs) - output = result.as_numpy('OUTPUT') + output = result.as_numpy("OUTPUT") self.assertIsNotNone(output) self.assertEqual(output.size, 0) def test_variable_gpu_output(self): # Input is not important in this test - model_name = 'variable_gpu_output' + model_name = "variable_gpu_output" input_data = 
np.array([[1.0]], dtype=np.float32) inputs = [ - grpcclient.InferInput("INPUT", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + grpcclient.InferInput( + "INPUT", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) user_data = UserData() @@ -131,20 +138,21 @@ def test_variable_gpu_output(self): # responses with different GPU output shapes num_requests = 5 for _ in range(num_requests): - result = self._client.async_infer(model_name=model_name, - inputs=inputs, - callback=partial( - callback, user_data)) + result = self._client.async_infer( + model_name=model_name, + inputs=inputs, + callback=partial(callback, user_data), + ) for i in range(num_requests): result = user_data._completed_requests.get() if result is InferenceServerException: self.assertTrue(False, result) - output = result.as_numpy('OUTPUT') + output = result.as_numpy("OUTPUT") self.assertIsNotNone(output) self.assertEqual(output.size, i + 1) np.testing.assert_almost_equal(output, np.ones(i + 1) * (i + 1)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/io/test.sh b/qa/L0_backend_python/io/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py old mode 100644 new mode 100755 index 425eb4322d..23c0f9686d --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,19 +30,19 @@ sys.path.append("../../common") -import test_util as tu -import shm_util +import queue +import unittest from functools import partial -import tritonclient.http as httpclient + +import numpy as np +import shm_util +import test_util as tu import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient from tritonclient.utils import * -import numpy as np -import unittest -import queue class UserData: - def __init__(self): self._completed_requests = queue.Queue() @@ -53,13 +55,12 @@ def callback(user_data, result, error): class LifecycleTest(tu.TestResultCollector): - def setUp(self): self._shm_leak_detector = shm_util.ShmLeakDetector() def test_batch_error(self): # The execute_error model returns an error for the first and third - # request and sucessfully processes the second request. This is making + # request and successfully processes the second request. This is making # sure that an error in a single request does not completely fail the # batch. 
model_name = "execute_error" @@ -75,12 +76,12 @@ def test_batch_error(self): input_data = np.random.randn(*shape).astype(np.float32) input_datas.append(input_data) inputs = [ - grpcclient.InferInput("IN", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) - triton_client.async_stream_infer(model_name=model_name, - inputs=inputs) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) for i in range(number_of_requests): result = user_data._completed_requests.get() @@ -94,7 +95,9 @@ def test_batch_error(self): self.assertTrue( np.array_equal(output_data, input_datas[i]), "error: expected output {} to match input {}".format( - output_data, input_datas[i])) + output_data, input_datas[i] + ), + ) def test_infer_pymodel_error(self): model_name = "wrong_model" @@ -104,8 +107,9 @@ def test_infer_pymodel_error(self): with httpclient.InferenceServerClient("localhost:8000") as client: input_data = (16384 * np.random.randn(*shape)).astype(np.uint32) inputs = [ - httpclient.InferInput("IN", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) try: @@ -115,21 +119,24 @@ def test_infer_pymodel_error(self): self.assertTrue( e.message().startswith( "Failed to process the request(s) for model instance" - ), "Exception message is not correct") + ), + "Exception message is not correct", + ) else: self.assertTrue( - False, - "Wrong exception raised or did not raise an exception") + False, "Wrong exception raised or did not raise an exception" + ) def test_incorrect_execute_return(self): - model_name = 'execute_return_error' + model_name = "execute_return_error" shape = [1, 1] with self._shm_leak_detector.Probe() as shm_probe: with httpclient.InferenceServerClient("localhost:8000") as client: input_data = (5 * np.random.randn(*shape)).astype(np.float32) inputs = [ - httpclient.InferInput("INPUT", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "INPUT", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) @@ -141,7 +148,8 @@ def test_incorrect_execute_return(self): "Failed to process the request(s) for model instance " "'execute_return_error_0', message: Expected a list in the " "execute return" in str(e.exception), - "Exception message is not correct.") + "Exception message is not correct.", + ) # The second inference request will return a list of None object # instead of Python InferenceResponse objects. @@ -153,8 +161,9 @@ def test_incorrect_execute_return(self): "'execute_return_error_0', message: Expected an " "'InferenceResponse' object in the execute function return" " list" in str(e.exception), - "Exception message is not correct.") + "Exception message is not correct.", + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/lifecycle/test.sh b/qa/L0_backend_python/lifecycle/test.sh old mode 100644 new mode 100755 index b393b0f06b..c1ab6baf92 --- a/qa/L0_backend_python/lifecycle/test.sh +++ b/qa/L0_backend_python/lifecycle/test.sh @@ -72,7 +72,7 @@ set +e # Run this multiple times to catch any intermittent segfault. for i in {0..4}; do - python3 lifecycle_test.py > $CLIENT_LOG 2>&1 + python3 lifecycle_test.py > $CLIENT_LOG 2>&1 if [ $? 
-ne 0 ]; then cat $CLIENT_LOG echo -e "\n***\n*** lifecycle_test.py FAILED. \n***" diff --git a/qa/L0_backend_python/logging/logging_test.py b/qa/L0_backend_python/logging/logging_test.py old mode 100644 new mode 100755 index 1070d240a7..b21919df65 --- a/qa/L0_backend_python/logging/logging_test.py +++ b/qa/L0_backend_python/logging/logging_test.py @@ -1,4 +1,6 @@ -# Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,29 +30,29 @@ sys.path.append("../../common") import unittest + import numpy as np import test_util as tu - -from tritonclient.utils import * import tritonclient.http as httpclient +from tritonclient.utils import * class LogTest(tu.TestResultCollector): - def test_log_output(self): - model_name = 'identity_fp32_logging' + model_name = "identity_fp32_logging" with httpclient.InferenceServerClient("localhost:8000") as client: input_data = np.array([[1.0]], dtype=np.float32) inputs = [ - httpclient.InferInput("INPUT0", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertIsNotNone(output0) self.assertTrue(np.all(output0 == input_data)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/logging/test.sh b/qa/L0_backend_python/logging/test.sh index 4b6b017d6d..369d28d0b9 100755 --- a/qa/L0_backend_python/logging/test.sh +++ b/qa/L0_backend_python/logging/test.sh @@ -68,7 +68,7 @@ source ../../common/util.sh function verify_log_counts () { non_verbose_expected=$1 verbose_expected=$2 - + if [ `grep -c "Specific Msg!" $SERVER_LOG` != $non_verbose_expected ]; then echo -e "\n***\n*** Test Failed: Specific Msg Count Incorrect\n***" RET=1 @@ -145,7 +145,7 @@ if [ "$SERVER_PID" == "0" ]; then fi set +e -# Enable verbose logging +# Enable verbose logging code=`curl -s -w %{http_code} -o ./curl.out -d'{"log_verbose_level":1}' localhost:8000/v2/logging` if [ "$code" != "200" ]; then diff --git a/qa/L0_backend_python/model_control/model_control_test.py b/qa/L0_backend_python/model_control/model_control_test.py old mode 100644 new mode 100755 index feceda01e4..17686f97d5 --- a/qa/L0_backend_python/model_control/model_control_test.py +++ b/qa/L0_backend_python/model_control/model_control_test.py @@ -1,4 +1,6 @@ -# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,22 +30,22 @@ sys.path.append("../../common") +import unittest + +import numpy as np +import shm_util import test_util as tu import tritonclient.http as httpclient from tritonclient.utils import * -import numpy as np -import unittest -import shm_util class ExplicitModelTest(tu.TestResultCollector): - def setUp(self): self._shm_leak_detector = shm_util.ShmLeakDetector() def send_identity_request(self, client, model_name): inputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "FP32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "FP32")) input0_data = np.arange(start=0, stop=16, dtype=np.float32) input0_data = np.expand_dims(input0_data, axis=0) inputs[0].set_data_from_numpy(input0_data) @@ -52,13 +54,14 @@ def send_identity_request(self, client, model_name): result = client.infer( model_name=model_name, inputs=inputs, - outputs=[httpclient.InferRequestedOutput('OUTPUT0')]) - output_numpy = result.as_numpy('OUTPUT0') + outputs=[httpclient.InferRequestedOutput("OUTPUT0")], + ) + output_numpy = result.as_numpy("OUTPUT0") self.assertTrue(np.all(input0_data == output_numpy)) def test_model_reload(self): model_name = "identity_fp32" - ensemble_model_name = 'simple_' + "identity_fp32" + ensemble_model_name = "simple_" + "identity_fp32" with httpclient.InferenceServerClient("localhost:8000") as client: for _ in range(5): self.assertFalse(client.is_model_ready(model_name)) @@ -76,5 +79,5 @@ def test_model_reload(self): self.assertFalse(client.is_model_ready(ensemble_model_name)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/model_control/test.sh b/qa/L0_backend_python/model_control/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_backend_python/python_test.py b/qa/L0_backend_python/python_test.py old mode 100644 new mode 100755 index ba4dc25ecb..eb4d02aa53 --- a/qa/L0_backend_python/python_test.py +++ b/qa/L0_backend_python/python_test.py @@ -30,21 +30,20 @@ sys.path.append("../common") +import os import unittest + import numpy as np -import test_util as tu -import shm_util import requests as httpreq -import os - -from tritonclient.utils import * +import shm_util +import test_util as tu import tritonclient.http as httpclient +from tritonclient.utils import * -TEST_JETSON = bool(int(os.environ.get('TEST_JETSON', 0))) +TEST_JETSON = bool(int(os.environ.get("TEST_JETSON", 0))) class PythonTest(tu.TestResultCollector): - def setUp(self): self._shm_leak_detector = shm_util.ShmLeakDetector() @@ -52,41 +51,39 @@ def _infer_help(self, model_name, shape, data_type): with httpclient.InferenceServerClient("localhost:8000") as client: input_data_0 = np.array(np.random.randn(*shape), dtype=data_type) inputs = [ - httpclient.InferInput("INPUT0", shape, - np_to_triton_dtype(input_data_0.dtype)) + httpclient.InferInput( + "INPUT0", shape, np_to_triton_dtype(input_data_0.dtype) + ) ] inputs[0].set_data_from_numpy(input_data_0) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertTrue(np.all(input_data_0 == output0)) def _create_cuda_region(self, client, size, name): import tritonclient.utils.cuda_shared_memory as cuda_shared_memory + shm0_handle = cuda_shared_memory.create_shared_memory_region( - name, byte_size=size, device_id=0) + name, byte_size=size, device_id=0 + ) 
client.register_cuda_shared_memory( - name, cuda_shared_memory.get_raw_handle(shm0_handle), 0, size) + name, cuda_shared_memory.get_raw_handle(shm0_handle), 0, size + ) return shm0_handle def _optional_input_infer(self, model_name, has_input0, has_input1): with httpclient.InferenceServerClient("localhost:8000") as client: shape = (1,) if has_input0: - input0_numpy = np.random.randint(0, - 100, - size=shape, - dtype=np.int32) + input0_numpy = np.random.randint(0, 100, size=shape, dtype=np.int32) else: # Set the input0 to a default value if it is optional. This is # the input used by the model if it is not provided. input0_numpy = np.array([5], dtype=np.int32) if has_input1: - input1_numpy = np.random.randint(0, - 100, - size=shape, - dtype=np.int32) + input1_numpy = np.random.randint(0, 100, size=shape, dtype=np.int32) else: # Set the input1 to a default value if it is optional. This is # the input used by the model if it is not provided. @@ -96,56 +93,62 @@ def _optional_input_infer(self, model_name, has_input0, has_input1): if has_input0: inputs.append( httpclient.InferInput( - "INPUT0", shape, - np_to_triton_dtype(input0_numpy.dtype))) + "INPUT0", shape, np_to_triton_dtype(input0_numpy.dtype) + ) + ) inputs[-1].set_data_from_numpy(input0_numpy) if has_input1: inputs.append( httpclient.InferInput( - "INPUT1", shape, - np_to_triton_dtype(input1_numpy.dtype))) + "INPUT1", shape, np_to_triton_dtype(input1_numpy.dtype) + ) + ) inputs[-1].set_data_from_numpy(input1_numpy) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertIsNotNone(output0, "OUTPUT0 was not found.") - output1 = result.as_numpy('OUTPUT1') + output1 = result.as_numpy("OUTPUT1") self.assertIsNotNone(output1, "OUTPUT1 was not found.") expected_output0 = input0_numpy + input1_numpy expected_output1 = input0_numpy - input1_numpy - np.testing.assert_equal(output0, expected_output0, - "OUTPUT0 doesn't match expected OUTPUT0") - np.testing.assert_equal(output1, expected_output1, - "OUTPUT1 doesn't match expected OUTPUT1") + np.testing.assert_equal( + output0, expected_output0, "OUTPUT0 doesn't match expected OUTPUT0" + ) + np.testing.assert_equal( + output1, expected_output1, "OUTPUT1 doesn't match expected OUTPUT1" + ) def test_growth_error(self): # 2 MiBs total_byte_size = 2 * 1024 * 1024 shape = [total_byte_size] - model_name = 'identity_uint8_nobatch' + model_name = "identity_uint8_nobatch" dtype = np.uint8 with self._shm_leak_detector.Probe() as shm_probe: self._infer_help(model_name, shape, dtype) - # 1 GiB payload leads to error in the main Python backned process. + # 1 GiB payload leads to error in the main Python backend process. # Total shared memory available is 1GiB. total_byte_size = 1024 * 1024 * 1024 shape = [total_byte_size] with self.assertRaises(InferenceServerException) as ex: self._infer_help(model_name, shape, dtype) - self.assertIn("Failed to increase the shared memory pool size", - str(ex.exception)) + self.assertIn( + "Failed to increase the shared memory pool size", str(ex.exception) + ) # 512 MiBs payload leads to error in the Python stub process. 
total_byte_size = 512 * 1024 * 1024 shape = [total_byte_size] with self.assertRaises(InferenceServerException) as ex: self._infer_help(model_name, shape, dtype) - self.assertIn("Failed to increase the shared memory pool size", - str(ex.exception)) + self.assertIn( + "Failed to increase the shared memory pool size", str(ex.exception) + ) # 2 MiBs # Send a small paylaod to make sure it is still working properly @@ -160,60 +163,64 @@ def test_growth_error(self): def test_gpu_tensor_error(self): import tritonclient.utils.cuda_shared_memory as cuda_shared_memory - model_name = 'identity_bool' + + model_name = "identity_bool" with httpclient.InferenceServerClient("localhost:8000") as client: input_data = np.array([[True] * 1000], dtype=bool) inputs = [ - httpclient.InferInput("INPUT0", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) - requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')] + requested_outputs = [httpclient.InferRequestedOutput("OUTPUT0")] # intentionally create a shared memory region with not enough size. client.unregister_cuda_shared_memory() - shm0_handle = self._create_cuda_region(client, 1, - 'output0_data') + shm0_handle = self._create_cuda_region(client, 1, "output0_data") - requested_outputs[0].set_shared_memory('output0_data', 1) + requested_outputs[0].set_shared_memory("output0_data", 1) with self.assertRaises(InferenceServerException) as ex: client.infer(model_name, inputs, outputs=requested_outputs) self.assertIn( "should be at least 1000 bytes to hold the results", - str(ex.exception)) + str(ex.exception), + ) client.unregister_cuda_shared_memory() cuda_shared_memory.destroy_shared_memory_region(shm0_handle) def test_dlpack_tensor_error(self): import tritonclient.utils.cuda_shared_memory as cuda_shared_memory - model_name = 'dlpack_identity' + + model_name = "dlpack_identity" with httpclient.InferenceServerClient("localhost:8000") as client: input_data = np.array([[1] * 1000], dtype=np.float32) inputs = [ - httpclient.InferInput("INPUT0", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] - requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')] + requested_outputs = [httpclient.InferRequestedOutput("OUTPUT0")] input_data_size = input_data.itemsize * input_data.size client.unregister_cuda_shared_memory() - input_region = self._create_cuda_region(client, input_data_size, - 'input0_data') - inputs[0].set_shared_memory('input0_data', input_data_size) - cuda_shared_memory.set_shared_memory_region( - input_region, [input_data]) + input_region = self._create_cuda_region( + client, input_data_size, "input0_data" + ) + inputs[0].set_shared_memory("input0_data", input_data_size) + cuda_shared_memory.set_shared_memory_region(input_region, [input_data]) # Intentionally create a small region to trigger an error - shm0_handle = self._create_cuda_region(client, 1, - 'output0_data') - requested_outputs[0].set_shared_memory('output0_data', 1) + shm0_handle = self._create_cuda_region(client, 1, "output0_data") + requested_outputs[0].set_shared_memory("output0_data", 1) with self.assertRaises(InferenceServerException) as ex: client.infer(model_name, inputs, outputs=requested_outputs) self.assertIn( "should be at least 4000 bytes to hold the results", - str(ex.exception)) + str(ex.exception), + ) 
client.unregister_cuda_shared_memory() cuda_shared_memory.destroy_shared_memory_region(shm0_handle) @@ -224,18 +231,19 @@ def test_async_infer(self): with self._shm_leak_detector.Probe() as shm_probe: with httpclient.InferenceServerClient( - "localhost:8000", - concurrency=request_parallelism) as client: + "localhost:8000", concurrency=request_parallelism + ) as client: input_datas = [] requests = [] for i in range(request_parallelism): - input_data = (16384 * np.random.randn(*shape)).astype( - np.uint8) + input_data = (16384 * np.random.randn(*shape)).astype(np.uint8) input_datas.append(input_data) inputs = [ httpclient.InferInput( - "INPUT0", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + "INPUT0", + input_data.shape, + np_to_triton_dtype(input_data.dtype), + ) ] inputs[0].set_data_from_numpy(input_data) requests.append(client.async_infer(model_name, inputs)) @@ -246,76 +254,92 @@ def test_async_infer(self): results = requests[i].get_result() output_data = results.as_numpy("OUTPUT0") - self.assertIsNotNone(output_data, - "error: expected 'OUTPUT0'") + self.assertIsNotNone(output_data, "error: expected 'OUTPUT0'") self.assertTrue( np.array_equal(output_data, input_datas[i]), "error: expected output {} to match input {}".format( - output_data, input_datas[i])) + output_data, input_datas[i] + ), + ) # Make sure the requests ran in parallel. stats = client.get_inference_statistics(model_name) - test_cond = (len(stats['model_stats']) != 1) or ( - stats['model_stats'][0]['name'] != model_name) + test_cond = (len(stats["model_stats"]) != 1) or ( + stats["model_stats"][0]["name"] != model_name + ) + self.assertFalse( + test_cond, "error: expected statistics for {}".format(model_name) + ) + + stat = stats["model_stats"][0] self.assertFalse( - test_cond, - "error: expected statistics for {}".format(model_name)) - - stat = stats['model_stats'][0] - self.assertFalse((stat['inference_count'] != 8) or ( - stat['execution_count'] != 1 - ), "error: expected execution_count == 1 and inference_count == 8, got {} and {}" - .format(stat['execution_count'], - stat['inference_count'])) - batch_stat = stat['batch_stats'][0] + (stat["inference_count"] != 8) or (stat["execution_count"] != 1), + "error: expected execution_count == 1 and inference_count == 8, got {} and {}".format( + stat["execution_count"], stat["inference_count"] + ), + ) + batch_stat = stat["batch_stats"][0] self.assertFalse( - batch_stat['batch_size'] != 8, - f"error: expected batch_size == 8, got {batch_stat['batch_size']}" + batch_stat["batch_size"] != 8, + f"error: expected batch_size == 8, got {batch_stat['batch_size']}", ) # Check metrics to make sure they are reported correctly - metrics = httpreq.get('http://localhost:8002/metrics') + metrics = httpreq.get("http://localhost:8002/metrics") print(metrics.text) - success_str = 'nv_inference_request_success{model="identity_uint8",version="1"}' - infer_count_str = 'nv_inference_count{model="identity_uint8",version="1"}' - infer_exec_str = 'nv_inference_exec_count{model="identity_uint8",version="1"}' + success_str = ( + 'nv_inference_request_success{model="identity_uint8",version="1"}' + ) + infer_count_str = ( + 'nv_inference_count{model="identity_uint8",version="1"}' + ) + infer_exec_str = ( + 'nv_inference_exec_count{model="identity_uint8",version="1"}' + ) success_val = None infer_count_val = None infer_exec_val = None for line in metrics.text.splitlines(): if line.startswith(success_str): - success_val = float(line[len(success_str):]) + success_val = 
float(line[len(success_str) :]) if line.startswith(infer_count_str): - infer_count_val = float(line[len(infer_count_str):]) + infer_count_val = float(line[len(infer_count_str) :]) if line.startswith(infer_exec_str): - infer_exec_val = float(line[len(infer_exec_str):]) + infer_exec_val = float(line[len(infer_exec_str) :]) self.assertFalse( success_val != 4, "error: expected metric {} == 4, got {}".format( - success_str, success_val)) + success_str, success_val + ), + ) self.assertFalse( infer_count_val != 8, "error: expected metric {} == 8, got {}".format( - infer_count_str, infer_count_val)) + infer_count_str, infer_count_val + ), + ) self.assertFalse( infer_exec_val != 1, "error: expected metric {} == 1, got {}".format( - infer_exec_str, infer_exec_val)) + infer_exec_str, infer_exec_val + ), + ) def test_bool(self): - model_name = 'identity_bool' + model_name = "identity_bool" with self._shm_leak_detector.Probe() as shm_probe: with httpclient.InferenceServerClient("localhost:8000") as client: input_data = np.array([[True, False, True]], dtype=bool) inputs = [ - httpclient.InferInput("INPUT0", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertIsNotNone(output0) self.assertTrue(np.all(output0 == input_data)) @@ -326,21 +350,32 @@ def test_infer_pytorch(self): with httpclient.InferenceServerClient("localhost:8000") as client: input_data = np.zeros(shape, dtype=np.float32) inputs = [ - httpclient.InferInput("IN", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) result = client.infer(model_name, inputs) - output_data = result.as_numpy('OUT') + output_data = result.as_numpy("OUT") self.assertIsNotNone(output_data, "error: expected 'OUT'") - # expected inference resposne from a zero tensor + # expected inference response from a zero tensor expected_result = [ - -2.2377274, -2.3976364, -2.2464046, -2.2790744, -2.3828976, - -2.2940576, -2.2928185, -2.340665, -2.275219, -2.292135 + -2.2377274, + -2.3976364, + -2.2464046, + -2.2790744, + -2.3828976, + -2.2940576, + -2.2928185, + -2.340665, + -2.275219, + -2.292135, ] - self.assertTrue(np.allclose(output_data[0], expected_result), - 'Inference result is not correct') + self.assertTrue( + np.allclose(output_data[0], expected_result), + "Inference result is not correct", + ) def test_init_args(self): model_name = "init_args" @@ -349,15 +384,17 @@ def test_init_args(self): with httpclient.InferenceServerClient("localhost:8000") as client: input_data = np.zeros(shape, dtype=np.float32) inputs = [ - httpclient.InferInput("IN", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) result = client.infer(model_name, inputs) - # output respone in this model is the number of keys in the args + # output response in this model is the number of keys in the args self.assertTrue( result.as_numpy("OUT") == 7, - "Number of keys in the init args is not correct") + "Number of keys in the init args is not correct", + ) def test_unicode(self): model_name = "string" @@ -367,19 +404,19 @@ def test_unicode(self): # 
np.object_ for i in range(2): with self._shm_leak_detector.Probe() as shm_probe: - with httpclient.InferenceServerClient( - "localhost:8000") as client: - utf8 = '😀' - input_data = np.array([bytes(utf8, encoding='utf-8')], - dtype=np.bytes_) + with httpclient.InferenceServerClient("localhost:8000") as client: + utf8 = "😀" + input_data = np.array( + [bytes(utf8, encoding="utf-8")], dtype=np.bytes_ + ) inputs = [ httpclient.InferInput( - "INPUT0", shape, - np_to_triton_dtype(input_data.dtype)) + "INPUT0", shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertIsNotNone(output0) self.assertEqual(output0[0], input_data) @@ -389,8 +426,7 @@ def test_optional_input(self): with self._shm_leak_detector.Probe() as shm_probe: for has_input0 in [True, False]: for has_input1 in [True, False]: - self._optional_input_infer(model_name, has_input0, - has_input1) + self._optional_input_infer(model_name, has_input0, has_input1) def test_string(self): model_name = "string_fixed" @@ -401,27 +437,25 @@ def test_string(self): # (empty output and fixed output) for i in range(4): with self._shm_leak_detector.Probe() as shm_probe: - with httpclient.InferenceServerClient( - "localhost:8000") as client: - input_data = np.array(['123456'], dtype=np.object_) + with httpclient.InferenceServerClient("localhost:8000") as client: + input_data = np.array(["123456"], dtype=np.object_) inputs = [ httpclient.InferInput( - "INPUT0", shape, - np_to_triton_dtype(input_data.dtype)) + "INPUT0", shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertIsNotNone(output0) if i % 2 == 0: - self.assertEqual(output0[0], - input_data.astype(np.bytes_)) + self.assertEqual(output0[0], input_data.astype(np.bytes_)) else: self.assertEqual(output0.size, 0) def test_non_contiguous(self): - model_name = 'non_contiguous' + model_name = "non_contiguous" shape = [2, 10, 11, 6, 5] new_shape = [10, 2, 6, 5, 11] shape_reorder = [1, 0, 4, 2, 3] @@ -429,8 +463,9 @@ def test_non_contiguous(self): input_numpy = np.random.rand(*shape) input_numpy = input_numpy.astype(np.float32) inputs = [ - httpclient.InferInput("INPUT0", shape, - np_to_triton_dtype(input_numpy.dtype)) + httpclient.InferInput( + "INPUT0", shape, np_to_triton_dtype(input_numpy.dtype) + ) ] inputs[0].set_data_from_numpy(input_numpy) result = client.infer(model_name, inputs) @@ -440,10 +475,10 @@ def test_non_contiguous(self): output1 = input_numpy.T output2 = np.transpose(input_numpy, shape_reorder) - self.assertTrue(np.all(output0 == result.as_numpy('OUTPUT0'))) - self.assertTrue(np.all(output1 == result.as_numpy('OUTPUT1'))) - self.assertTrue(np.all(output2 == result.as_numpy('OUTPUT2'))) + self.assertTrue(np.all(output0 == result.as_numpy("OUTPUT0"))) + self.assertTrue(np.all(output1 == result.as_numpy("OUTPUT1"))) + self.assertTrue(np.all(output2 == result.as_numpy("OUTPUT2"))) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/python_unittest.py b/qa/L0_backend_python/python_unittest.py old mode 100644 new mode 100755 index 9ff1b30e02..bff4dd57da --- a/qa/L0_backend_python/python_unittest.py +++ b/qa/L0_backend_python/python_unittest.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2021-2023, 
NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,16 +30,16 @@ sys.path.append("../../common") -import test_util as tu -import shm_util +import os import unittest + +import shm_util +import test_util as tu import tritonclient.grpc as grpcclient from tritonclient.utils import * -import os class PythonUnittest(tu.TestResultCollector): - def setUp(self): self._shm_leak_detector = shm_util.ShmLeakDetector() @@ -45,15 +47,15 @@ def _run_unittest(self, model_name): with grpcclient.InferenceServerClient("localhost:8001") as client: # No input is required result = client.infer(model_name, [], client_timeout=240) - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") - # The model returns 1 if the tests were sucessfully passed. + # The model returns 1 if the tests were successfully passed. # Otherwise, it will return 0. self.assertEqual(output0, [1]) def test_python_unittest(self): - model_name = os.environ['MODEL_NAME'] - bls_kind = os.environ.get('BLS_KIND', 'non_decoupled') + model_name = os.environ["MODEL_NAME"] + bls_kind = os.environ.get("BLS_KIND", "non_decoupled") if bls_kind == "decoupled": # Skip the shared memory probe for decoupled models for now as @@ -62,7 +64,11 @@ def test_python_unittest(self): # is bounded. self._run_unittest(model_name) else: - if model_name == 'bls' or model_name == 'bls_memory' or model_name == 'bls_memory_async': + if ( + model_name == "bls" + or model_name == "bls_memory" + or model_name == "bls_memory_async" + ): # For these tests, the memory region size will be grown. Because of # this we need to use the shared memory probe only on the later # call so that the probe can detect the leak correctly. @@ -77,5 +83,5 @@ def test_python_unittest(self): self._run_unittest(model_name) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/restart/models/restart/1/model.py b/qa/L0_backend_python/restart/models/restart/1/model.py old mode 100644 new mode 100755 index 72bce2933a..d7cb765ec9 --- a/qa/L0_backend_python/restart/models/restart/1/model.py +++ b/qa/L0_backend_python/restart/models/restart/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,29 +26,30 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import triton_python_backend_utils as pb_utils -import c_python_backend_utils as c_utils from os import path +import c_python_backend_utils as c_utils +import triton_python_backend_utils as pb_utils -class TritonPythonModel: +class TritonPythonModel: def execute(self, requests): # This function will be called once to record the free memory. Then, # the stub process will be killed to trigger Python backend restart. # After that this value will be read again to make sure that it matches # before restart. 
- file_name = 'free_memory.txt' + file_name = "free_memory.txt" current_free_memory = str(c_utils.shared_memory.free_memory()) if path.exists(file_name): - with open(file_name, 'r') as f: + with open(file_name, "r") as f: expected_free_memory = f.read() - assert expected_free_memory == current_free_memory, \ - (f'Free shared memory before and after restart are not equal. ' - '{expected_free_memory} (before) != {current_free_memory} (after).') + assert expected_free_memory == current_free_memory, ( + f"Free shared memory before and after restart are not equal. " + "{expected_free_memory} (before) != {current_free_memory} (after)." + ) else: - with open(file_name, 'w') as f: + with open(file_name, "w") as f: f.write(current_free_memory) responses = [] diff --git a/qa/L0_backend_python/restart/restart_test.py b/qa/L0_backend_python/restart/restart_test.py old mode 100644 new mode 100755 index cf5afcbdb1..4f4bf63082 --- a/qa/L0_backend_python/restart/restart_test.py +++ b/qa/L0_backend_python/restart/restart_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,30 +30,31 @@ sys.path.append("../../common") +import unittest + +import numpy as np import test_util as tu import tritonclient.http as httpclient from tritonclient.utils import * -import numpy as np -import unittest class RestartTest(tu.TestResultCollector): - def _infer_helper(self, model_name, shape, data_type): with httpclient.InferenceServerClient("localhost:8000") as client: input_data_0 = np.array(np.random.randn(*shape), dtype=data_type) inputs = [ - httpclient.InferInput("INPUT0", shape, - np_to_triton_dtype(input_data_0.dtype)) + httpclient.InferInput( + "INPUT0", shape, np_to_triton_dtype(input_data_0.dtype) + ) ] inputs[0].set_data_from_numpy(input_data_0) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertTrue(np.all(input_data_0 == output0)) def test_restart(self): shape = [1, 16] - model_name = 'restart' + model_name = "restart" dtype = np.float32 # Since the stub process has been killed, the first request @@ -65,10 +68,10 @@ def test_restart(self): def test_infer(self): shape = [1, 16] - model_name = 'restart' + model_name = "restart" dtype = np.float32 self._infer_helper(model_name, shape, dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/restart/test.sh b/qa/L0_backend_python/restart/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_backend_python/variants/test.sh b/qa/L0_backend_python/variants/test.sh old mode 100644 new mode 100755 index 24ceb1cf4c..65116cb2dc --- a/qa/L0_backend_python/variants/test.sh +++ b/qa/L0_backend_python/variants/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# Buidling a CPU build of Python backend +# Building a CPU build of Python backend source ../common.sh install_build_deps diff --git a/qa/L0_batch_custom/batch_custom_test.py b/qa/L0_batch_custom/batch_custom_test.py old mode 100644 new mode 100755 index 3fb74cf25d..6cd6346ad3 --- a/qa/L0_batch_custom/batch_custom_test.py +++ b/qa/L0_batch_custom/batch_custom_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,31 +30,32 @@ sys.path.append("../common") -from builtins import range import os -import time import threading +import time import unittest -import numpy as np +from builtins import range +from collections.abc import Iterable + import infer_util as iu +import numpy as np import test_util as tu -from collections.abc import Iterable import tritonclient.grpc as grpcclient # By default, find tritonserver on "localhost", but can be overridden # with TRITONSERVER_IPADDR envvar -_tritonserver_ipaddr = os.environ.get('TRITONSERVER_IPADDR', 'localhost') +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") _deferred_exceptions_lock = threading.Lock() _deferred_exceptions = [] class BatcherTest(tu.TestResultCollector): - def setUp(self): # The helper client for setup will be GRPC for simplicity. self.triton_client_ = grpcclient.InferenceServerClient( - f"{_tritonserver_ipaddr}:8001") + f"{_tritonserver_ipaddr}:8001" + ) self.precreated_shm_regions_ = [] global _deferred_exceptions _deferred_exceptions = [] @@ -71,35 +74,45 @@ def check_deferred_exception(self): if len(_deferred_exceptions) > 0: raise _deferred_exceptions[0] - def check_response(self, - trial, - bs, - thresholds, - requested_outputs=("OUTPUT0", "OUTPUT1"), - input_size=16, - shm_region_names=None, - precreated_shm_regions=None): + def check_response( + self, + trial, + bs, + thresholds, + requested_outputs=("OUTPUT0", "OUTPUT1"), + input_size=16, + shm_region_names=None, + precreated_shm_regions=None, + ): try: start_ms = int(round(time.time() * 1000)) - if trial == "savedmodel" or trial == "graphdef" or trial == "libtorch" \ - or trial == "onnx" or trial == "plan" or trial == "python": + if ( + trial == "savedmodel" + or trial == "graphdef" + or trial == "libtorch" + or trial == "onnx" + or trial == "plan" + or trial == "python" + ): tensor_shape = (bs, input_size) - iu.infer_exact(self, - trial, - tensor_shape, - bs, - np.float32, - np.float32, - np.float32, - swap=False, - model_version=1, - outputs=requested_outputs, - use_http=False, - use_grpc=False, - use_http_json_tensors=False, - skip_request_id_check=True, - use_streaming=False) + iu.infer_exact( + self, + trial, + tensor_shape, + bs, + np.float32, + np.float32, + np.float32, + swap=False, + model_version=1, + outputs=requested_outputs, + use_http=False, + use_grpc=False, + use_http_json_tensors=False, + skip_request_id_check=True, + use_streaming=False, + ) else: self.assertFalse(True, "unknown trial type: " + trial) @@ -110,79 +123,110 @@ def check_response(self, if lt_ms is not None: self.assertTrue( (end_ms - start_ms) < lt_ms, - "expected less than " + str(lt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected less than " + + str(lt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) if gt_ms is not None: self.assertTrue( (end_ms - start_ms) > gt_ms, - "expected greater than " + str(gt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " 
ms") + "expected greater than " + + str(gt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) except Exception as ex: self.add_deferred_exception(ex) - def check_status(self, model_name, batch_exec, request_cnt, infer_cnt, - exec_count): + def check_status(self, model_name, batch_exec, request_cnt, infer_cnt, exec_count): # There is a time window between when responses are returned and statistics are updated. # To prevent intermittent test failure during that window, wait up to 10 seconds for the # inference statistics to be ready. num_tries = 10 for i in range(num_tries): - stats = self.triton_client_.get_inference_statistics( - model_name, "1") + stats = self.triton_client_.get_inference_statistics(model_name, "1") self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") actual_exec_cnt = stats.model_stats[0].execution_count if actual_exec_cnt == exec_count: break - print("WARNING: expect {} executions, got {} (attempt {})".format( - exec_count, actual_exec_cnt, i)) + print( + "WARNING: expect {} executions, got {} (attempt {})".format( + exec_count, actual_exec_cnt, i + ) + ) time.sleep(1) - self.assertEqual(stats.model_stats[0].name, model_name, - "expect model stats for model {}".format(model_name)) self.assertEqual( - stats.model_stats[0].version, "1", - "expect model stats for model {} version 1".format(model_name)) + stats.model_stats[0].name, + model_name, + "expect model stats for model {}".format(model_name), + ) + self.assertEqual( + stats.model_stats[0].version, + "1", + "expect model stats for model {} version 1".format(model_name), + ) if batch_exec: batch_stats = stats.model_stats[0].batch_stats self.assertEqual( - len(batch_stats), len(batch_exec), + len(batch_stats), + len(batch_exec), "expected {} different batch-sizes, got {}".format( - len(batch_exec), len(batch_stats))) + len(batch_exec), len(batch_stats) + ), + ) for batch_stat in batch_stats: bs = batch_stat.batch_size bc = batch_stat.compute_infer.count - self.assertTrue(bs in batch_exec, - "unexpected batch-size {}".format(bs)) + self.assertTrue(bs in batch_exec, "unexpected batch-size {}".format(bs)) # Get count from one of the stats self.assertEqual( - bc, batch_exec[bs], - "expected model-execution-count {} for batch size {}, got {}" - .format(batch_exec[bs], bs, bc)) + bc, + batch_exec[bs], + "expected model-execution-count {} for batch size {}, got {}".format( + batch_exec[bs], bs, bc + ), + ) actual_request_cnt = stats.model_stats[0].inference_stats.success.count self.assertEqual( - actual_request_cnt, request_cnt, + actual_request_cnt, + request_cnt, "expected model-request-count {}, got {}".format( - request_cnt, actual_request_cnt)) + request_cnt, actual_request_cnt + ), + ) actual_exec_cnt = stats.model_stats[0].execution_count if isinstance(exec_count, Iterable): self.assertIn( - actual_exec_cnt, exec_count, + actual_exec_cnt, + exec_count, "expected model-exec-count {}, got {}".format( - exec_count, actual_exec_cnt)) + exec_count, actual_exec_cnt + ), + ) else: self.assertEqual( - actual_exec_cnt, exec_count, + actual_exec_cnt, + exec_count, "expected model-exec-count {}, got {}".format( - exec_count, actual_exec_cnt)) + exec_count, actual_exec_cnt + ), + ) actual_infer_cnt = stats.model_stats[0].inference_count self.assertEqual( - actual_infer_cnt, infer_cnt, + actual_infer_cnt, + infer_cnt, "expected model-inference-count {}, got {}".format( - infer_cnt, actual_infer_cnt)) + infer_cnt, actual_infer_cnt + ), + ) def test_volume_batching(self): # Send 12 requests with 
batch size 1. The max_queue_delay is set @@ -190,26 +234,30 @@ def test_volume_batching(self): # there can be either 4-6 model executions. model_base = "onnx" dtype = np.float16 - shapes = ([ - 1, - 4, - 4, - ],) + shapes = ( + [ + 1, + 4, + 4, + ], + ) try: # use threads to send 12 requests without waiting for response threads = [] for i in range(12): threads.append( - threading.Thread(target=iu.infer_zero, - args=(self, model_base, 1, dtype, shapes, - shapes), - kwargs={ - 'use_http': True, - 'use_grpc': False, - 'use_http_json_tensors': False, - 'use_streaming': False, - })) + threading.Thread( + target=iu.infer_zero, + args=(self, model_base, 1, dtype, shapes, shapes), + kwargs={ + "use_http": True, + "use_grpc": False, + "use_http_json_tensors": False, + "use_streaming": False, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -221,5 +269,5 @@ def test_volume_batching(self): self.assertTrue(False, "unexpected error {}".format(ex)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_batch_custom/test.sh b/qa/L0_batch_custom/test.sh index c957ec4515..11735e1470 100755 --- a/qa/L0_batch_custom/test.sh +++ b/qa/L0_batch_custom/test.sh @@ -125,7 +125,7 @@ for i in "${!test_setups[@]}"; do if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" cat $SERVER_LOG - exit 1 + exit 1 fi if [ `grep -c "Loading custom batching strategy" $SERVER_LOG` != "1" ]; then cat $SERVER_LOG @@ -157,7 +157,7 @@ done FILE_PATH="backend/examples/batching_strategies/volume_batching/src/volume_batching.cc" OLD_STRING="\/\/ Batcher will point to an unsigned integer representing the maximum" NEW_STRING="return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_NOT_FOUND,\"Failure test case\");" - + sed -i "s/${OLD_STRING}/${NEW_STRING}/g" ${FILE_PATH} (cd backend/examples/batching_strategies/volume_batching && diff --git a/qa/L0_batch_input/batch_input_test.py b/qa/L0_batch_input/batch_input_test.py old mode 100644 new mode 100755 index d5dfe2763d..02de27d921 --- a/qa/L0_batch_input/batch_input_test.py +++ b/qa/L0_batch_input/batch_input_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -28,19 +30,19 @@ sys.path.append("../common") +import queue import unittest -import numpy as np from functools import partial -import queue + +import numpy as np import test_util as tu import tritonclient.grpc as grpcclient from tritonclient.utils import InferenceServerException class BatchInputTest(tu.TestResultCollector): - def setUp(self): - self.client = grpcclient.InferenceServerClient(url='localhost:8001') + self.client = grpcclient.InferenceServerClient(url="localhost:8001") def callback(user_data, result, error): if error: @@ -55,28 +57,27 @@ def set_inputs(self, shapes, input_name): self.inputs = [] for shape in shapes: self.inputs.append( - [grpcclient.InferInput(input_name, [1, shape[0]], "FP32")]) + [grpcclient.InferInput(input_name, [1, shape[0]], "FP32")] + ) self.inputs[-1][0].set_data_from_numpy( - np.full([1, shape[0]], shape[0], np.float32)) + np.full([1, shape[0]], shape[0], np.float32) + ) def set_inputs_for_batch_item(self, shapes, input_name): self.dtype_ = np.float32 self.inputs = [] for shape in shapes: - self.inputs.append( - [grpcclient.InferInput(input_name, shape, "FP32")]) - self.inputs[-1][0].set_data_from_numpy( - np.full(shape, shape[0], np.float32)) + self.inputs.append([grpcclient.InferInput(input_name, shape, "FP32")]) + self.inputs[-1][0].set_data_from_numpy(np.full(shape, shape[0], np.float32)) def test_ragged_output(self): model_name = "ragged_io" # The model is an identity model self.set_inputs([[2], [4], [1], [3]], "INPUT0") user_data = queue.Queue() - self.client.start_stream( - callback=partial(self.client_callback, user_data)) + self.client.start_stream(callback=partial(self.client_callback, user_data)) - output_name = 'OUTPUT0' + output_name = "OUTPUT0" outputs = [grpcclient.InferRequestedOutput(output_name)] async_requests = [] @@ -84,9 +85,10 @@ def test_ragged_output(self): for input in self.inputs: # Asynchronous inference call. async_requests.append( - self.client.async_stream_infer(model_name=model_name, - inputs=input, - outputs=outputs)) + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) expected_value_list = [[v] * v for v in [2, 4, 1, 3]] expected_value_list = [ @@ -103,7 +105,9 @@ def test_ragged_output(self): self.assertTrue( np.array_equal(output_data, expected_value_list[idx]), "Expect response {} to have value {}, got {}".format( - idx, expected_value_list[idx], output_data)) + idx, expected_value_list[idx], output_data + ), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) self.client.stop_stream() @@ -112,19 +116,19 @@ def test_ragged_input(self): model_name = "ragged_acc_shape" self.set_inputs([[2], [4], [1], [3]], "RAGGED_INPUT") user_data = queue.Queue() - self.client.start_stream( - callback=partial(self.client_callback, user_data)) + self.client.start_stream(callback=partial(self.client_callback, user_data)) - output_name = 'RAGGED_OUTPUT' + output_name = "RAGGED_OUTPUT" outputs = [grpcclient.InferRequestedOutput(output_name)] async_requests = [] try: for input in self.inputs: # Asynchronous inference call. 
async_requests.append( - self.client.async_stream_infer(model_name=model_name, - inputs=input, - outputs=outputs)) + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) value_lists = [[v] * v for v in [2, 4, 1, 3]] expected_value = [] @@ -140,7 +144,9 @@ def test_ragged_input(self): self.assertTrue( np.array_equal(output_data, expected_value), "Expect response {} to have value {}, got {}".format( - idx, expected_value, output_data)) + idx, expected_value, output_data + ), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) self.client.stop_stream() @@ -149,10 +155,9 @@ def test_element_count(self): model_name = "ragged_element_count_acc_zero" self.set_inputs([[2], [4], [1], [3]], "RAGGED_INPUT") user_data = queue.Queue() - self.client.start_stream( - callback=partial(self.client_callback, user_data)) + self.client.start_stream(callback=partial(self.client_callback, user_data)) - output_name = 'BATCH_AND_SIZE_OUTPUT' + output_name = "BATCH_AND_SIZE_OUTPUT" outputs = [grpcclient.InferRequestedOutput(output_name)] async_requests = [] @@ -160,9 +165,10 @@ def test_element_count(self): for input in self.inputs: # Asynchronous inference call. async_requests.append( - self.client.async_stream_infer(model_name=model_name, - inputs=input, - outputs=outputs)) + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) expected_value = np.asarray([[2, 4, 1, 3]], np.float32) for idx in range(len(async_requests)): @@ -175,7 +181,9 @@ def test_element_count(self): self.assertTrue( np.array_equal(output_data, expected_value), "Expect response {} to have value {}, got {}".format( - idx, expected_value, output_data)) + idx, expected_value, output_data + ), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) self.client.stop_stream() @@ -184,10 +192,9 @@ def test_accumulated_element_count(self): model_name = "ragged_acc_shape" self.set_inputs([[2], [4], [1], [3]], "RAGGED_INPUT") user_data = queue.Queue() - self.client.start_stream( - callback=partial(self.client_callback, user_data)) + self.client.start_stream(callback=partial(self.client_callback, user_data)) - output_name = 'BATCH_AND_SIZE_OUTPUT' + output_name = "BATCH_AND_SIZE_OUTPUT" outputs = [grpcclient.InferRequestedOutput(output_name)] async_requests = [] @@ -195,9 +202,10 @@ def test_accumulated_element_count(self): for input in self.inputs: # Asynchronous inference call. 
async_requests.append( - self.client.async_stream_infer(model_name=model_name, - inputs=input, - outputs=outputs)) + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) expected_value = np.asarray([[2, 6, 7, 10]], np.float32) for idx in range(len(async_requests)): @@ -210,7 +218,9 @@ def test_accumulated_element_count(self): self.assertTrue( np.array_equal(output_data, expected_value), "Expect response {} to have value {}, got {}".format( - idx, expected_value, output_data)) + idx, expected_value, output_data + ), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) self.client.stop_stream() @@ -219,10 +229,9 @@ def test_accumulated_element_count_with_zero(self): model_name = "ragged_element_count_acc_zero" self.set_inputs([[2], [4], [1], [3]], "RAGGED_INPUT") user_data = queue.Queue() - self.client.start_stream( - callback=partial(self.client_callback, user_data)) + self.client.start_stream(callback=partial(self.client_callback, user_data)) - output_name = 'BATCH_OUTPUT' + output_name = "BATCH_OUTPUT" outputs = [grpcclient.InferRequestedOutput(output_name)] async_requests = [] @@ -230,9 +239,10 @@ def test_accumulated_element_count_with_zero(self): for input in self.inputs: # Asynchronous inference call. async_requests.append( - self.client.async_stream_infer(model_name=model_name, - inputs=input, - outputs=outputs)) + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) expected_value = np.asarray([[0, 2, 6, 7, 10]], np.float32) for idx in range(len(async_requests)): @@ -245,7 +255,9 @@ def test_accumulated_element_count_with_zero(self): self.assertTrue( np.array_equal(output_data, expected_value), "Expect response {} to have value {}, got {}".format( - idx, expected_value, output_data)) + idx, expected_value, output_data + ), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) self.client.stop_stream() @@ -254,10 +266,9 @@ def test_max_element_count_as_shape(self): model_name = "ragged_acc_shape" self.set_inputs([[2], [4], [1], [3]], "RAGGED_INPUT") user_data = queue.Queue() - self.client.start_stream( - callback=partial(self.client_callback, user_data)) + self.client.start_stream(callback=partial(self.client_callback, user_data)) - output_name = 'BATCH_OUTPUT' + output_name = "BATCH_OUTPUT" outputs = [grpcclient.InferRequestedOutput(output_name)] async_requests = [] @@ -265,9 +276,10 @@ def test_max_element_count_as_shape(self): for input in self.inputs: # Asynchronous inference call. async_requests.append( - self.client.async_stream_infer(model_name=model_name, - inputs=input, - outputs=outputs)) + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) for idx in range(len(async_requests)): # Get the result from the initiated asynchronous inference request. @@ -277,9 +289,12 @@ def test_max_element_count_as_shape(self): # Validate the results by comparing with precomputed values. 
output_data = result.as_numpy(output_name) self.assertEqual( - output_data.shape, (1, 4), - "Expect response {} to have shape to represent max element count {} among the batch , got {}" - .format(idx, 4, output_data.shape)) + output_data.shape, + (1, 4), + "Expect response {} to have shape to represent max element count {} among the batch , got {}".format( + idx, 4, output_data.shape + ), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) self.client.stop_stream() @@ -290,14 +305,14 @@ def test_batch_item_shape_flatten(self): # Note that the test only checks the formation of "BATCH_INPUT" where # the value of "RAGGED_INPUT" is irrelevant, only the shape matters self.set_inputs_for_batch_item( - [[1, 4, 1], [1, 1, 2], [1, 1, 2], [1, 2, 2]], "RAGGED_INPUT") + [[1, 4, 1], [1, 1, 2], [1, 1, 2], [1, 2, 2]], "RAGGED_INPUT" + ) model_name = "batch_item_flatten" user_data = queue.Queue() - self.client.start_stream( - callback=partial(self.client_callback, user_data)) + self.client.start_stream(callback=partial(self.client_callback, user_data)) - output_name = 'BATCH_OUTPUT' + output_name = "BATCH_OUTPUT" outputs = [grpcclient.InferRequestedOutput(output_name)] async_requests = [] @@ -305,9 +320,10 @@ def test_batch_item_shape_flatten(self): for input in self.inputs: # Asynchronous inference call. async_requests.append( - self.client.async_stream_infer(model_name=model_name, - inputs=input, - outputs=outputs)) + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) expected_value = np.asarray([[4, 1, 1, 2, 1, 2, 2, 2]], np.float32) for idx in range(len(async_requests)): @@ -320,7 +336,9 @@ def test_batch_item_shape_flatten(self): self.assertTrue( np.array_equal(output_data, expected_value), "Expect response {} to have value {}, got {}".format( - idx, expected_value, output_data)) + idx, expected_value, output_data + ), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) self.client.stop_stream() @@ -329,8 +347,9 @@ def test_batch_item_shape(self): # Use 3 set of inputs with shape [2, 1, 2], [1, 1, 2], [1, 2, 2] # Note that the test only checks the formation of "BATCH_INPUT" where # the value of "RAGGED_INPUT" is irrelevant, only the shape matters - self.set_inputs_for_batch_item([[2, 1, 2], [1, 1, 2], [1, 2, 2]], - "RAGGED_INPUT") + self.set_inputs_for_batch_item( + [[2, 1, 2], [1, 1, 2], [1, 2, 2]], "RAGGED_INPUT" + ) expected_outputs = [ np.array([[1.0, 2.0], [1.0, 2.0]]), @@ -340,10 +359,9 @@ def test_batch_item_shape(self): model_name = "batch_item" user_data = queue.Queue() - self.client.start_stream( - callback=partial(self.client_callback, user_data)) + self.client.start_stream(callback=partial(self.client_callback, user_data)) - output_name = 'BATCH_OUTPUT' + output_name = "BATCH_OUTPUT" outputs = [grpcclient.InferRequestedOutput(output_name)] async_requests = [] @@ -351,9 +369,10 @@ def test_batch_item_shape(self): for input in self.inputs: # Asynchronous inference call. async_requests.append( - self.client.async_stream_infer(model_name=model_name, - inputs=input, - outputs=outputs)) + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) for idx in range(len(async_requests)): # Get the result from the initiated asynchronous inference request. 
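The expected arrays hard-coded in the batch-input tests above all follow from the per-request element counts ([2, 4, 1, 3]) and, for the batch-item tests, from the per-item shapes with the batch dimension stripped. A small standalone sketch of that arithmetic (illustrative only, not part of the patch; variable names are ad hoc):

    import numpy as np
    from itertools import accumulate

    counts = [2, 4, 1, 3]  # elements per request, as built by set_inputs above

    # test_element_count: BATCH_AND_SIZE_OUTPUT -> [[2, 4, 1, 3]]
    element_count = np.asarray([counts], np.float32)

    # test_accumulated_element_count: BATCH_AND_SIZE_OUTPUT -> [[2, 6, 7, 10]]
    accumulated = np.asarray([list(accumulate(counts))], np.float32)

    # test_accumulated_element_count_with_zero: BATCH_OUTPUT -> [[0, 2, 6, 7, 10]]
    accumulated_with_zero = np.asarray([[0] + list(accumulate(counts))], np.float32)

    # test_max_element_count_as_shape only checks the output shape: (1, max(counts)) == (1, 4)
    max_element_count_shape = (1, max(counts))

    # test_batch_item_shape_flatten: per-item shapes without the batch dimension,
    # flattened in request order -> [[4, 1, 1, 2, 1, 2, 2, 2]]
    item_shapes = [[1, 4, 1], [1, 1, 2], [1, 1, 2], [1, 2, 2]]
    flattened = np.asarray([[d for s in item_shapes for d in s[1:]]], np.float32)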
@@ -364,13 +383,16 @@ def test_batch_item_shape(self): output_data = result.as_numpy(output_name) self.assertTrue( np.allclose(output_data, expected_outputs[idx]), - "Expect response to have value:\n{}, got:\n{}\nEqual matrix:\n{}" - .format(expected_outputs[idx], output_data, - np.isclose(expected_outputs[idx], output_data))) + "Expect response to have value:\n{}, got:\n{}\nEqual matrix:\n{}".format( + expected_outputs[idx], + output_data, + np.isclose(expected_outputs[idx], output_data), + ), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) self.client.stop_stream() -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_batch_input/test.sh b/qa/L0_batch_input/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_batcher/batcher_test.py b/qa/L0_batcher/batcher_test.py old mode 100644 new mode 100755 index 4600474442..38e208c21e --- a/qa/L0_batcher/batcher_test.py +++ b/qa/L0_batcher/batcher_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,25 +30,23 @@ sys.path.append("../common") -from builtins import range import os -import time import threading +import time import unittest -import numpy as np +from builtins import range + import infer_util as iu +import numpy as np import test_util as tu - import tritonclient.grpc as grpcclient # By default, find tritonserver on "localhost", but can be overridden # with TRITONSERVER_IPADDR envvar -_tritonserver_ipaddr = os.environ.get('TRITONSERVER_IPADDR', 'localhost') +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") -TEST_SYSTEM_SHARED_MEMORY = bool( - int(os.environ.get('TEST_SYSTEM_SHARED_MEMORY', 0))) -TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get('TEST_CUDA_SHARED_MEMORY', - 0))) +TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0))) +TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0))) if TEST_SYSTEM_SHARED_MEMORY: import tritonclient.utils.shared_memory as shm @@ -55,14 +55,13 @@ # Test with either GRPC of HTTP, but not both since when we check # results we expect only one to run -USE_GRPC = (os.environ.get('USE_GRPC', 1) != "0") -USE_HTTP = (os.environ.get('USE_HTTP', 1) != "0") +USE_GRPC = os.environ.get("USE_GRPC", 1) != "0" +USE_HTTP = os.environ.get("USE_HTTP", 1) != "0" if USE_GRPC and USE_HTTP: USE_GRPC = False assert USE_GRPC or USE_HTTP, "USE_GRPC or USE_HTTP must be non-zero" -BACKENDS = os.environ.get('BACKENDS', - "graphdef savedmodel onnx libtorch plan python") +BACKENDS = os.environ.get("BACKENDS", "graphdef savedmodel onnx libtorch plan python") _trials = BACKENDS.split(" ") @@ -81,11 +80,11 @@ class BatcherTest(tu.TestResultCollector): - def setUp(self): # The helper client for setup will be GRPC for simplicity. 
self.triton_client_ = grpcclient.InferenceServerClient( - f"{_tritonserver_ipaddr}:8001") + f"{_tritonserver_ipaddr}:8001" + ) self.precreated_shm_regions_ = [] global _deferred_exceptions _deferred_exceptions = [] @@ -107,19 +106,22 @@ def create_advance(self, shm_regions=None): if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: precreated_shm_regions = [] if shm_regions is None: - shm_regions = ['output0', 'output1'] + shm_regions = ["output0", "output1"] for shm_region in shm_regions: if TEST_SYSTEM_SHARED_MEMORY: shm_handle = shm.create_shared_memory_region( - shm_region + '_data', '/' + shm_region, 512) + shm_region + "_data", "/" + shm_region, 512 + ) self.triton_client_.register_system_shared_memory( - shm_region + '_data', '/' + shm_region, 512) + shm_region + "_data", "/" + shm_region, 512 + ) else: shm_handle = cudashm.create_shared_memory_region( - shm_region + '_data', 512, 0) + shm_region + "_data", 512, 0 + ) self.triton_client_.register_cuda_shared_memory( - shm_region + '_data', - cudashm.get_raw_handle(shm_handle), 0, 512) + shm_region + "_data", cudashm.get_raw_handle(shm_handle), 0, 512 + ) # Collect precreated handles for cleanup self.precreated_shm_regions_.append(shm_handle) precreated_shm_regions.append(shm_handle) @@ -137,19 +139,27 @@ def check_deferred_exception(self): if len(_deferred_exceptions) > 0: raise _deferred_exceptions[0] - def check_response(self, - trial, - bs, - thresholds, - requested_outputs=("OUTPUT0", "OUTPUT1"), - input_size=16, - shm_region_names=None, - precreated_shm_regions=None): + def check_response( + self, + trial, + bs, + thresholds, + requested_outputs=("OUTPUT0", "OUTPUT1"), + input_size=16, + shm_region_names=None, + precreated_shm_regions=None, + ): try: start_ms = int(round(time.time() * 1000)) - if trial == "savedmodel" or trial == "graphdef" or trial == "libtorch" \ - or trial == "onnx" or trial == "plan" or trial == "python": + if ( + trial == "savedmodel" + or trial == "graphdef" + or trial == "libtorch" + or trial == "onnx" + or trial == "plan" + or trial == "python" + ): tensor_shape = (bs, input_size) iu.infer_exact( self, @@ -170,7 +180,8 @@ def check_response(self, shm_region_names=shm_region_names, precreated_shm_regions=precreated_shm_regions, use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, - use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY) + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) else: self.assertFalse(True, "unknown trial type: " + trial) @@ -181,86 +192,109 @@ def check_response(self, if lt_ms is not None: self.assertTrue( (end_ms - start_ms) < lt_ms, - "expected less than " + str(lt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected less than " + + str(lt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) if gt_ms is not None: self.assertTrue( (end_ms - start_ms) > gt_ms, - "expected greater than " + str(gt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected greater than " + + str(gt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) except Exception as ex: self.add_deferred_exception(ex) - def check_setup(self, model_name, preferred_batch_sizes, - max_queue_delay_us): + def check_setup(self, model_name, preferred_batch_sizes, max_queue_delay_us): # Make sure test.sh set up the correct batcher settings config = self.triton_client_.get_model_config(model_name).config bconfig = config.dynamic_batching - self.assertEqual(len(bconfig.preferred_batch_size), - len(preferred_batch_sizes)) + 
self.assertEqual(len(bconfig.preferred_batch_size), len(preferred_batch_sizes)) for i in preferred_batch_sizes: self.assertTrue(i in bconfig.preferred_batch_size) - self.assertEqual(bconfig.max_queue_delay_microseconds, - max_queue_delay_us) + self.assertEqual(bconfig.max_queue_delay_microseconds, max_queue_delay_us) - def check_status(self, model_name, batch_exec, request_cnt, infer_cnt, - exec_count): + def check_status(self, model_name, batch_exec, request_cnt, infer_cnt, exec_count): # There is a time window between when responses are returned and statistics are updated. # To prevent intermittent test failure during that window, wait up to 10 seconds for the # inference statistics to be ready. num_tries = 10 for i in range(num_tries): - stats = self.triton_client_.get_inference_statistics( - model_name, "1") + stats = self.triton_client_.get_inference_statistics(model_name, "1") self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") actual_exec_cnt = stats.model_stats[0].execution_count if actual_exec_cnt in exec_count: break - print("WARNING: expect {} executions, got {} (attempt {})".format( - exec_count, actual_exec_cnt, i)) + print( + "WARNING: expect {} executions, got {} (attempt {})".format( + exec_count, actual_exec_cnt, i + ) + ) time.sleep(1) - self.assertEqual(stats.model_stats[0].name, model_name, - "expect model stats for model {}".format(model_name)) self.assertEqual( - stats.model_stats[0].version, "1", - "expect model stats for model {} version 1".format(model_name)) + stats.model_stats[0].name, + model_name, + "expect model stats for model {}".format(model_name), + ) + self.assertEqual( + stats.model_stats[0].version, + "1", + "expect model stats for model {} version 1".format(model_name), + ) if batch_exec: batch_stats = stats.model_stats[0].batch_stats self.assertEqual( - len(batch_stats), len(batch_exec), + len(batch_stats), + len(batch_exec), "expected {} different batch-sizes, got {}".format( - len(batch_exec), len(batch_stats))) + len(batch_exec), len(batch_stats) + ), + ) for batch_stat in batch_stats: bs = batch_stat.batch_size bc = batch_stat.compute_infer.count - self.assertTrue(bs in batch_exec, - "unexpected batch-size {}".format(bs)) + self.assertTrue(bs in batch_exec, "unexpected batch-size {}".format(bs)) # Get count from one of the stats self.assertEqual( - bc, batch_exec[bs], - "expected model-execution-count {} for batch size {}, got {}" - .format(batch_exec[bs], bs, bc)) + bc, + batch_exec[bs], + "expected model-execution-count {} for batch size {}, got {}".format( + batch_exec[bs], bs, bc + ), + ) actual_request_cnt = stats.model_stats[0].inference_stats.success.count self.assertEqual( - actual_request_cnt, request_cnt, + actual_request_cnt, + request_cnt, "expected model-request-count {}, got {}".format( - request_cnt, actual_request_cnt)) + request_cnt, actual_request_cnt + ), + ) actual_exec_cnt = stats.model_stats[0].execution_count self.assertIn( - actual_exec_cnt, exec_count, - "expected model-exec-count {}, got {}".format( - exec_count, actual_exec_cnt)) + actual_exec_cnt, + exec_count, + "expected model-exec-count {}, got {}".format(exec_count, actual_exec_cnt), + ) actual_infer_cnt = stats.model_stats[0].inference_count self.assertEqual( - actual_infer_cnt, infer_cnt, + actual_infer_cnt, + infer_cnt, "expected model-inference-count {}, got {}".format( - infer_cnt, actual_infer_cnt)) + infer_cnt, actual_infer_cnt + ), + ) def test_static_batch_preferred(self): # Send two requests with static batch sizes == preferred @@ -269,20 
+303,25 @@ def test_static_batch_preferred(self): precreated_shm_regions = self.create_advance() for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) self.check_response( trial, - 2, (3000, None), - precreated_shm_regions=precreated_shm_regions) + 2, + (3000, None), + precreated_shm_regions=precreated_shm_regions, + ) self.check_response( trial, - 6, (3000, None), - precreated_shm_regions=precreated_shm_regions) + 6, + (3000, None), + precreated_shm_regions=precreated_shm_regions, + ) self.check_deferred_exception() self.check_status(model_name, {2: 1, 6: 1}, 2, 8, (2,)) except Exception as ex: @@ -295,16 +334,19 @@ def test_static_batch_lt_any_preferred(self): precreated_shm_regions = self.create_advance() for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) self.check_response( trial, - 1, (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), - precreated_shm_regions=precreated_shm_regions) + 1, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + precreated_shm_regions=precreated_shm_regions, + ) self.check_deferred_exception() self.check_status(model_name, {1: 1}, 1, 1, (1,)) except Exception as ex: @@ -317,16 +359,19 @@ def test_static_batch_not_preferred(self): precreated_shm_regions = self.create_advance() for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) self.check_response( trial, - 3, (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), - precreated_shm_regions=precreated_shm_regions) + 3, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + precreated_shm_regions=precreated_shm_regions, + ) self.check_deferred_exception() self.check_status(model_name, {3: 1}, 1, 3, (1,)) except Exception as ex: @@ -339,16 +384,19 @@ def test_static_batch_gt_max_preferred(self): precreated_shm_regions = self.create_advance() for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) self.check_response( trial, - 7, (3000, None), - precreated_shm_regions=precreated_shm_regions) + 7, + (3000, None), + precreated_shm_regions=precreated_shm_regions, + ) self.check_deferred_exception() self.check_status(model_name, {7: 1}, 1, 7, (1,)) except Exception as ex: @@ -369,25 +417,29 @@ def test_multi_batch_different_shape_allow_ragged(self): threads = [] threads.append( - threading.Thread(target=iu.infer_zero, - args=(self, trial, 1, dtype, ([1, 16],), - ([1, 16],)), - kwargs={ - 'use_grpc': USE_GRPC, - 'use_http': USE_HTTP, - 'use_http_json_tensors': False, - 'use_streaming': False - })) - threads.append( - threading.Thread(target=iu.infer_zero, - args=(self, trial, 1, dtype, ([1, 8],), 
- ([1, 8],)), - kwargs={ - 'use_grpc': USE_GRPC, - 'use_http': USE_HTTP, - 'use_http_json_tensors': False, - 'use_streaming': False - })) + threading.Thread( + target=iu.infer_zero, + args=(self, trial, 1, dtype, ([1, 16],), ([1, 16],)), + kwargs={ + "use_grpc": USE_GRPC, + "use_http": USE_HTTP, + "use_http_json_tensors": False, + "use_streaming": False, + }, + ) + ) + threads.append( + threading.Thread( + target=iu.infer_zero, + args=(self, trial, 1, dtype, ([1, 8],), ([1, 8],)), + kwargs={ + "use_grpc": USE_GRPC, + "use_http": USE_HTTP, + "use_http_json_tensors": False, + "use_streaming": False, + }, + ) + ) threads[0].start() threads[1].start() for t in threads: @@ -405,17 +457,18 @@ def test_multi_batch_different_shape(self): # immediately and the second delayed by the max batch queue # delay if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] else: shm0_region_names = None shm1_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) @@ -426,20 +479,27 @@ def test_multi_batch_different_shape(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'input_size': 16, - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "input_size": 16, + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, - args=(trial, 1, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), + args=( + trial, + 1, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), kwargs={ - 'input_size': 8, - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "input_size": 8, + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() time.sleep(1) threads[1].start() @@ -457,17 +517,18 @@ def test_multi_batch_not_preferred(self): # delay (minus the difference in time that they arrived in the # queue) if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] else: shm0_region_names = None shm1_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) 
self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) @@ -476,21 +537,31 @@ def test_multi_batch_not_preferred(self): threads.append( threading.Thread( target=self.check_response, - args=(trial, 1, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), + args=( + trial, + 1, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, - args=(trial, 3, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms - 2000)), + args=( + trial, + 3, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms - 2000), + ), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() time.sleep(1) threads[1].start() @@ -508,20 +579,21 @@ def test_multi_batch_not_preferred_different_shape(self): # two requests to be immediately responded to and the third # response to be delayed by the max batch queue delay. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] else: shm0_region_names = None shm1_region_names = None shm2_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) @@ -532,27 +604,36 @@ def test_multi_batch_not_preferred_different_shape(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 3, (6000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, - args=(trial, 1, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), + args=( + trial, + 1, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), kwargs={ - 'input_size': 8, - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "input_size": 8, + "shm_region_names": shm2_region_names, + 
"precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads[0].start() threads[1].start() time.sleep(1) @@ -573,23 +654,24 @@ def test_multi_batch_preferred_different_shape(self): # preferred size so that third and forth response are sent # immediately. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] - shm3_region_names = ['ip30', 'ip31', 'op30', 'op31'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] else: shm0_region_names = None shm1_region_names = None shm2_region_names = None shm3_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) - precreated_shm3_regions = self.create_advance(['op30', 'op31']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) @@ -600,35 +682,43 @@ def test_multi_batch_preferred_different_shape(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 3, (6000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'input_size': 8, - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "input_size": 8, + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 5, (6000, None)), kwargs={ - 'input_size': 8, - 'shm_region_names': shm3_region_names, - 'precreated_shm_regions': precreated_shm3_regions - })) + "input_size": 8, + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) threads[0].start() threads[1].start() time.sleep(1) @@ -648,17 +738,18 @@ def test_multi_batch_gt_max_preferred(self): # be processed by the dynamic batcher. This should cause both # responses to be returned immediately. 
if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] else: shm0_region_names = None shm1_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) @@ -669,17 +760,21 @@ def test_multi_batch_gt_max_preferred(self): target=self.check_response, args=(trial, 3, (3000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 7, (3000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() time.sleep(1) threads[1].start() @@ -700,17 +795,18 @@ def test_multi_batch_sum_gt_max_preferred(self): # since it alone is not greater than max preferred size, will # be delayed. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] else: shm0_region_names = None shm1_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) @@ -721,18 +817,25 @@ def test_multi_batch_sum_gt_max_preferred(self): target=self.check_response, args=(trial, 3, (3000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, - args=(trial, 4, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), + args=( + trial, + 4, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() time.sleep(1) threads[1].start() @@ -748,17 +851,18 @@ def test_multi_same_output0(self): 
# batched and get the correct response even though they don't # request both outputs. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00'] - shm1_region_names = ['ip10', 'ip11', 'op10'] + shm0_region_names = ["ip00", "ip01", "op00"] + shm1_region_names = ["ip10", "ip11", "op10"] else: shm0_region_names = None shm1_region_names = None - precreated_shm0_regions = self.create_advance(['op00']) - precreated_shm1_regions = self.create_advance(['op10']) + precreated_shm0_regions = self.create_advance(["op00"]) + precreated_shm1_regions = self.create_advance(["op10"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) @@ -770,19 +874,23 @@ def test_multi_same_output0(self): target=self.check_response, args=(trial, 1, (3000, None)), kwargs={ - 'requested_outputs': ("OUTPUT0",), - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "requested_outputs": ("OUTPUT0",), + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (3000, None)), kwargs={ - 'requested_outputs': ("OUTPUT0",), - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "requested_outputs": ("OUTPUT0",), + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() threads[1].start() for t in threads: @@ -797,17 +905,18 @@ def test_multi_same_output1(self): # batched and get the correct response even though they don't # request both outputs. 
if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op11'] + shm0_region_names = ["ip00", "ip01", "op01"] + shm1_region_names = ["ip10", "ip11", "op11"] else: shm0_region_names = None shm1_region_names = None - precreated_shm0_regions = self.create_advance(['op01']) - precreated_shm1_regions = self.create_advance(['op11']) + precreated_shm0_regions = self.create_advance(["op01"]) + precreated_shm1_regions = self.create_advance(["op11"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) @@ -819,19 +928,23 @@ def test_multi_same_output1(self): target=self.check_response, args=(trial, 1, (3000, None)), kwargs={ - 'requested_outputs': ("OUTPUT1",), - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "requested_outputs": ("OUTPUT1",), + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (3000, None)), kwargs={ - 'requested_outputs': ("OUTPUT1",), - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "requested_outputs": ("OUTPUT1",), + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() threads[1].start() for t in threads: @@ -847,17 +960,18 @@ def test_multi_different_outputs(self): # batched and get the correct response even though they don't # request both outputs. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00'] - shm1_region_names = ['ip10', 'ip11', 'op11'] + shm0_region_names = ["ip00", "ip01", "op00"] + shm1_region_names = ["ip10", "ip11", "op11"] else: shm0_region_names = None shm1_region_names = None - precreated_shm0_regions = self.create_advance(['op00']) - precreated_shm1_regions = self.create_advance(['op11']) + precreated_shm0_regions = self.create_advance(["op00"]) + precreated_shm1_regions = self.create_advance(["op11"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) @@ -869,19 +983,23 @@ def test_multi_different_outputs(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'requested_outputs': ("OUTPUT0",), - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "requested_outputs": ("OUTPUT0",), + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'requested_outputs': ("OUTPUT1",), - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "requested_outputs": ("OUTPUT1",), + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() threads[1].start() for t in threads: @@ -896,15 +1014,16 @@ def test_multi_different_output_order(self): # different order. 
They should be batched and get the correct # response even though they use different order. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op11', 'op10'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op11", "op10"] else: shm0_region_names = None shm1_region_names = None for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) @@ -912,21 +1031,25 @@ def test_multi_different_output_order(self): threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(trial, 1, (6000, None)), - kwargs={ - 'requested_outputs': - ("OUTPUT0", "OUTPUT1"), - 'shm_region_names': shm0_region_names - })) - threads.append( - threading.Thread(target=self.check_response, - args=(trial, 1, (6000, None)), - kwargs={ - 'requested_outputs': - ("OUTPUT1", "OUTPUT0"), - 'shm_region_names': shm1_region_names - })) + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "requested_outputs": ("OUTPUT0", "OUTPUT1"), + "shm_region_names": shm0_region_names, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "requested_outputs": ("OUTPUT1", "OUTPUT0"), + "shm_region_names": shm1_region_names, + }, + ) + ) threads[0].start() threads[1].start() for t in threads: @@ -946,24 +1069,24 @@ def test_multi_batch_delayed_sum_gt_max_preferred(self): # immediately but the second response, since it alone is not # greater than max preferred size, will be delayed. 
if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] else: shm0_region_names = None shm1_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) # Need scheduler to wait for queue to contain 2 requests self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 2) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 2) threads = [] threads.append( @@ -971,18 +1094,25 @@ def test_multi_batch_delayed_sum_gt_max_preferred(self): target=self.check_response, args=(trial, 3, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, - args=(trial, 4, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), + args=( + trial, + 4, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() time.sleep(1) threads[1].start() @@ -996,7 +1126,7 @@ def test_multi_batch_delayed_sum_gt_max_preferred(self): def test_multi_batch_delayed_use_max_batch(self): # Send three requests with first not having preferred size, # second being smaller than max preferred size but the sum of - # the requests being larger than max preferred size and thrid + # the requests being larger than max preferred size and third # is sent after the first two requests exceeds the queue delay # and the sum of the requests to be in full batch. Use # TRITONSERVER_DELAY_SCHEDULER in the environment so that @@ -1005,55 +1135,67 @@ def test_multi_batch_delayed_use_max_batch(self): # while it appears that the first two responses to be returned # after being delayed and the third response to be returned immediately. 
if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] else: shm0_region_names = None shm1_region_names = None shm2_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) # Need scheduler to wait for queue to contain 3 requests self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 3) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 3) threads = [] threads.append( threading.Thread( target=self.check_response, - args=(trial, 3, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), + args=( + trial, + 3, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, - args=(trial, 4, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), + args=( + trial, + 4, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads[0].start() threads[1].start() time.sleep(11) @@ -1076,30 +1218,30 @@ def test_multi_batch_delayed_preferred_different_shape(self): # shape as the third that causes a preferred size so that # third and forth response are sent immediately. 
if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] - shm3_region_names = ['ip30', 'ip31', 'op30', 'op31'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] else: shm0_region_names = None shm1_region_names = None shm2_region_names = None shm3_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) - precreated_shm3_regions = self.create_advance(['op30', 'op31']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) # Need scheduler to wait for queue to contain 4 requests self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 4) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 4) threads = [] threads.append( @@ -1107,35 +1249,43 @@ def test_multi_batch_delayed_preferred_different_shape(self): target=self.check_response, args=(trial, 1, (3000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 3, (3000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (3000, None)), kwargs={ - 'input_size': 8, - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "input_size": 8, + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 5, (3000, None)), kwargs={ - 'input_size': 8, - 'shm_region_names': shm3_region_names, - 'precreated_shm_regions': precreated_shm3_regions - })) + "input_size": 8, + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) threads[0].start() threads[1].start() time.sleep(1) @@ -1155,12 +1305,12 @@ def test_multi_batch_use_biggest_preferred(self): # that requests can be queued up before scheduler starts # servicing. 
if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] - shm3_region_names = ['ip30', 'ip31', 'op30', 'op31'] - shm4_region_names = ['ip40', 'ip41', 'op40', 'op41'] - shm5_region_names = ['ip50', 'ip51', 'op50', 'op51'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] + shm4_region_names = ["ip40", "ip41", "op40", "op41"] + shm5_region_names = ["ip50", "ip51", "op50", "op51"] else: shm0_region_names = None shm1_region_names = None @@ -1168,23 +1318,23 @@ def test_multi_batch_use_biggest_preferred(self): shm3_region_names = None shm4_region_names = None shm5_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) - precreated_shm3_regions = self.create_advance(['op30', 'op31']) - precreated_shm4_regions = self.create_advance(['op40', 'op41']) - precreated_shm5_regions = self.create_advance(['op50', 'op51']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) + precreated_shm4_regions = self.create_advance(["op40", "op41"]) + precreated_shm5_regions = self.create_advance(["op50", "op51"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) # Need scheduler to wait for queue to contain 6 request self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 6) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 6) threads = [] threads.append( @@ -1192,49 +1342,61 @@ def test_multi_batch_use_biggest_preferred(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm3_region_names, - 'precreated_shm_regions': precreated_shm3_regions - })) + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) threads.append( 
threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm4_region_names, - 'precreated_shm_regions': precreated_shm4_regions - })) + "shm_region_names": shm4_region_names, + "precreated_shm_regions": precreated_shm4_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm5_region_names, - 'precreated_shm_regions': precreated_shm5_regions - })) + "shm_region_names": shm5_region_names, + "precreated_shm_regions": precreated_shm5_regions, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -1253,27 +1415,27 @@ def test_multi_batch_use_best_preferred(self): # that requests can be queued up before scheduler starts # servicing. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] else: shm0_region_names = None shm1_region_names = None shm2_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) # Need scheduler to wait for queue to contain 3 requests self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 3) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 3) threads = [] threads.append( @@ -1281,26 +1443,35 @@ def test_multi_batch_use_best_preferred(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, - args=(trial, 1, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), + args=( + trial, + 1, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), kwargs={ - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads[0].start() threads[1].start() time.sleep(1) @@ -1315,41 +1486,36 @@ def test_multi_batch_use_best_preferred(self): def test_multi_batch_preserve_ordering(self): model_base = "custom" dtype = np.float32 - shapes = ([ - 1, - 1, - ],) + shapes = ( + [ + 1, + 1, + ], + ) 
try: # use threads to send 12 requests without waiting for response threads = [] for i in range(12): if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm_region_name_prefix = [ - "input" + str(i), "output" + str(i) - ] + shm_region_name_prefix = ["input" + str(i), "output" + str(i)] else: shm_region_name_prefix = None threads.append( - threading.Thread(target=iu.infer_zero, - args=(self, model_base, 1, dtype, shapes, - shapes), - kwargs={ - 'use_grpc': - USE_GRPC, - 'use_http': - USE_HTTP, - 'use_http_json_tensors': - False, - 'use_streaming': - False, - 'shm_region_name_prefix': - shm_region_name_prefix, - 'use_system_shared_memory': - TEST_SYSTEM_SHARED_MEMORY, - 'use_cuda_shared_memory': - TEST_CUDA_SHARED_MEMORY - })) + threading.Thread( + target=iu.infer_zero, + args=(self, model_base, 1, dtype, shapes, shapes), + kwargs={ + "use_grpc": USE_GRPC, + "use_http": USE_HTTP, + "use_http_json_tensors": False, + "use_streaming": False, + "shm_region_name_prefix": shm_region_name_prefix, + "use_system_shared_memory": TEST_SYSTEM_SHARED_MEMORY, + "use_cuda_shared_memory": TEST_CUDA_SHARED_MEMORY, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -1367,30 +1533,30 @@ def test_preferred_batch_only_aligned(self): # servicing. The batcher should form a batch of preferred # size 4. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] - shm3_region_names = ['ip30', 'ip31', 'op30', 'op31'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] else: shm0_region_names = None shm1_region_names = None shm2_region_names = None shm3_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) - precreated_shm3_regions = self.create_advance(['op30', 'op31']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [4, 6], 0) # Need scheduler to wait for queue to contain 4 requests self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 4) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 4) threads = [] threads.append( @@ -1398,33 +1564,41 @@ def test_preferred_batch_only_aligned(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + 
"precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm3_region_names, - 'precreated_shm_regions': precreated_shm3_regions - })) + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -1441,33 +1615,33 @@ def test_preferred_batch_only_unaligned(self): # servicing. The batcher should form a batch of preferred # size 4 followed by a batch of size 1. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] - shm3_region_names = ['ip30', 'ip31', 'op30', 'op31'] - shm4_region_names = ['ip40', 'ip41', 'op40', 'op41'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] + shm4_region_names = ["ip40", "ip41", "op40", "op41"] else: shm0_region_names = None shm1_region_names = None shm2_region_names = None shm3_region_names = None shm4_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) - precreated_shm3_regions = self.create_advance(['op30', 'op31']) - precreated_shm4_regions = self.create_advance(['op40', 'op41']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) + precreated_shm4_regions = self.create_advance(["op40", "op41"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [4, 6], 0) # Need scheduler to wait for queue to contain 3 requests self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 5) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 5) threads = [] threads.append( @@ -1475,41 +1649,51 @@ def test_preferred_batch_only_unaligned(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, 
None)), kwargs={ - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm3_region_names, - 'precreated_shm_regions': precreated_shm3_regions - })) + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm4_region_names, - 'precreated_shm_regions': precreated_shm4_regions - })) + "shm_region_names": shm4_region_names, + "precreated_shm_regions": precreated_shm4_regions, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -1526,13 +1710,13 @@ def test_preferred_batch_only_use_biggest_preferred(self): # servicing. The batcher should form a batch of largest preferred # size 6 followed by a batch of size 1. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] - shm3_region_names = ['ip30', 'ip31', 'op30', 'op31'] - shm4_region_names = ['ip40', 'ip41', 'op40', 'op41'] - shm5_region_names = ['ip50', 'ip51', 'op50', 'op51'] - shm6_region_names = ['ip60', 'ip61', 'op60', 'op61'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] + shm4_region_names = ["ip40", "ip41", "op40", "op41"] + shm5_region_names = ["ip50", "ip51", "op50", "op51"] + shm6_region_names = ["ip60", "ip61", "op60", "op61"] else: shm0_region_names = None shm1_region_names = None @@ -1541,24 +1725,24 @@ def test_preferred_batch_only_use_biggest_preferred(self): shm4_region_names = None shm5_region_names = None shm6_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) - precreated_shm3_regions = self.create_advance(['op30', 'op31']) - precreated_shm4_regions = self.create_advance(['op40', 'op41']) - precreated_shm5_regions = self.create_advance(['op50', 'op51']) - precreated_shm6_regions = self.create_advance(['op60', 'op61']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) + precreated_shm4_regions = self.create_advance(["op40", "op41"]) + precreated_shm5_regions = self.create_advance(["op50", "op51"]) + precreated_shm6_regions = self.create_advance(["op60", "op61"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [4, 6], 0) # Need scheduler to wait for queue to contain 6 request self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 7) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 7) threads = [] 
threads.append( @@ -1566,57 +1750,71 @@ def test_preferred_batch_only_use_biggest_preferred(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm3_region_names, - 'precreated_shm_regions': precreated_shm3_regions - })) + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm4_region_names, - 'precreated_shm_regions': precreated_shm4_regions - })) + "shm_region_names": shm4_region_names, + "precreated_shm_regions": precreated_shm4_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm5_region_names, - 'precreated_shm_regions': precreated_shm5_regions - })) + "shm_region_names": shm5_region_names, + "precreated_shm_regions": precreated_shm5_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm6_region_names, - 'precreated_shm_regions': precreated_shm6_regions - })) + "shm_region_names": shm6_region_names, + "precreated_shm_regions": precreated_shm6_regions, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -1632,27 +1830,27 @@ def test_preferred_batch_only_use_no_preferred_size(self): # requests can be queued up before scheduler starts # servicing. The batcher should form a batch of of 3. 
if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] else: shm0_region_names = None shm1_region_names = None shm2_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [4, 6], 0) # Need scheduler to wait for queue to contain 3 request self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 3) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 3) threads = [] threads.append( @@ -1660,25 +1858,31 @@ def test_preferred_batch_only_use_no_preferred_size(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -1694,41 +1898,36 @@ def test_max_queue_delay_only_non_default(self): # there can be either 1 or 2 model executions. 
model_base = "custom" dtype = np.float32 - shapes = ([ - 1, - 1, - ],) + shapes = ( + [ + 1, + 1, + ], + ) try: # use threads to send 12 requests without waiting for response threads = [] for i in range(12): if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm_region_name_prefix = [ - "input" + str(i), "output" + str(i) - ] + shm_region_name_prefix = ["input" + str(i), "output" + str(i)] else: shm_region_name_prefix = None threads.append( - threading.Thread(target=iu.infer_zero, - args=(self, model_base, 1, dtype, shapes, - shapes), - kwargs={ - 'use_grpc': - USE_GRPC, - 'use_http': - USE_HTTP, - 'use_http_json_tensors': - False, - 'use_streaming': - False, - 'shm_region_name_prefix': - shm_region_name_prefix, - 'use_system_shared_memory': - TEST_SYSTEM_SHARED_MEMORY, - 'use_cuda_shared_memory': - TEST_CUDA_SHARED_MEMORY - })) + threading.Thread( + target=iu.infer_zero, + args=(self, model_base, 1, dtype, shapes, shapes), + kwargs={ + "use_grpc": USE_GRPC, + "use_http": USE_HTTP, + "use_http_json_tensors": False, + "use_streaming": False, + "shm_region_name_prefix": shm_region_name_prefix, + "use_system_shared_memory": TEST_SYSTEM_SHARED_MEMORY, + "use_cuda_shared_memory": TEST_CUDA_SHARED_MEMORY, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -1746,41 +1945,36 @@ def test_max_queue_delay_only_default(self): # and the remaining requests will form the second batch. model_base = "custom" dtype = np.float32 - shapes = ([ - 1, - 1, - ],) + shapes = ( + [ + 1, + 1, + ], + ) try: # use threads to send 12 requests without waiting for response threads = [] for i in range(12): if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm_region_name_prefix = [ - "input" + str(i), "output" + str(i) - ] + shm_region_name_prefix = ["input" + str(i), "output" + str(i)] else: shm_region_name_prefix = None threads.append( - threading.Thread(target=iu.infer_zero, - args=(self, model_base, 1, dtype, shapes, - shapes), - kwargs={ - 'use_grpc': - USE_GRPC, - 'use_http': - USE_HTTP, - 'use_http_json_tensors': - False, - 'use_streaming': - False, - 'shm_region_name_prefix': - shm_region_name_prefix, - 'use_system_shared_memory': - TEST_SYSTEM_SHARED_MEMORY, - 'use_cuda_shared_memory': - TEST_CUDA_SHARED_MEMORY - })) + threading.Thread( + target=iu.infer_zero, + args=(self, model_base, 1, dtype, shapes, shapes), + kwargs={ + "use_grpc": USE_GRPC, + "use_http": USE_HTTP, + "use_http_json_tensors": False, + "use_streaming": False, + "shm_region_name_prefix": shm_region_name_prefix, + "use_system_shared_memory": TEST_SYSTEM_SHARED_MEMORY, + "use_cuda_shared_memory": TEST_CUDA_SHARED_MEMORY, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -1792,5 +1986,5 @@ def test_max_queue_delay_only_default(self): self.assertTrue(False, "unexpected error {}".format(ex)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_batcher/test.sh b/qa/L0_batcher/test.sh old mode 100644 new mode 100755 index c13c249a3c..c5f8819276 --- a/qa/L0_batcher/test.sh +++ b/qa/L0_batcher/test.sh @@ -159,7 +159,7 @@ for BACKEND in $BACKENDS; do cp $onnx_model/output0_labels.txt models/$python_model cp ../python_models/add_sub/model.py models/$python_model/1/ else - cp -r $TMP_MODEL_DIR models/. + cp -r $TMP_MODEL_DIR models/. 
fi (cd models/$(basename $TMP_MODEL_DIR) && \ sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ diff --git a/qa/L0_batcher/verify_timestamps.py b/qa/L0_batcher/verify_timestamps.py old mode 100644 new mode 100755 index c39f560c73..3271135fcd --- a/qa/L0_batcher/verify_timestamps.py +++ b/qa/L0_batcher/verify_timestamps.py @@ -33,7 +33,7 @@ def verify_timestamps(traces, preserve): # Order traces by id - traces = sorted(traces, key=lambda t: t.get('id', -1)) + traces = sorted(traces, key=lambda t: t.get("id", -1)) # Filter the trace that is not meaningful and group them by 'id' filtered_traces = dict() @@ -41,7 +41,7 @@ def verify_timestamps(traces, preserve): for trace in traces: if "id" not in trace: continue - # Skip GRPC traces as actual traces are not genarated via GRPC, + # Skip GRPC traces as actual traces are not generated via GRPC, # thus GRPC traces are ill-formed if "timestamps" in trace: is_grpc = False @@ -53,16 +53,16 @@ def verify_timestamps(traces, preserve): grpc_id_offset += 1 continue - if (trace['id'] in filtered_traces.keys()): - rep_trace = filtered_traces[trace['id']] - # Apend the timestamp to the trace representing this 'id' + if trace["id"] in filtered_traces.keys(): + rep_trace = filtered_traces[trace["id"]] + # Append the timestamp to the trace representing this 'id' if "timestamps" in trace: rep_trace["timestamps"] += trace["timestamps"] else: # Use this trace to represent this 'id' if "timestamps" not in trace: trace["timestamps"] = [] - filtered_traces[trace['id']] = trace + filtered_traces[trace["id"]] = trace # First find the latest response complete timestamp for the batch with large delay large_delay_response_complete = 0 @@ -75,11 +75,11 @@ def verify_timestamps(traces, preserve): compute_span = timestamps["COMPUTE_END"] - timestamps["COMPUTE_START"] # If the 3rd batch is also processed by large delay instance, we don't # want to use its responses as baseline - if trace["id"] <= ( - 8 + grpc_id_offset) and compute_span >= 400 * 1000 * 1000: + if trace["id"] <= (8 + grpc_id_offset) and compute_span >= 400 * 1000 * 1000: response_complete = timestamps["INFER_RESPONSE_COMPLETE"] - large_delay_response_complete = max(large_delay_response_complete, - response_complete) + large_delay_response_complete = max( + large_delay_response_complete, response_complete + ) else: small_delay_traces.append(trace) @@ -93,8 +93,11 @@ def verify_timestamps(traces, preserve): response_request_after_large_delay_count += 1 # Hardcoded expected count here - print("responses after large delay count: {}".format( - response_request_after_large_delay_count)) + print( + "responses after large delay count: {}".format( + response_request_after_large_delay_count + ) + ) if preserve: # If preserve ordering, there must be large delay batch followed by # small delay batch and thus at least 4 responses are sent after @@ -105,15 +108,17 @@ def verify_timestamps(traces, preserve): return 0 if response_request_after_large_delay_count == 0 else 1 -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-p', - '--preserve', - action="store_true", - required=False, - default=False, - help='Timestamps is collected with preserve ordering') - parser.add_argument('file', type=argparse.FileType('r'), nargs='+') + parser.add_argument( + "-p", + "--preserve", + action="store_true", + required=False, + default=False, + help="Timestamps is collected with preserve ordering", + ) + parser.add_argument("file", type=argparse.FileType("r"), 
nargs="+") FLAGS = parser.parse_args() for f in FLAGS.file: diff --git a/qa/L0_buffer_attributes/buffer_attributes_test.py b/qa/L0_buffer_attributes/buffer_attributes_test.py old mode 100644 new mode 100755 index 907a469bab..7d61e082c5 --- a/qa/L0_buffer_attributes/buffer_attributes_test.py +++ b/qa/L0_buffer_attributes/buffer_attributes_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,28 +31,26 @@ sys.path.append("../common") import unittest + import numpy as np import test_util as tu - +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient import tritonclient.utils.cuda_shared_memory as cudashm from tritonclient.utils import triton_to_np_dtype -import tritonclient.http as httpclient -import tritonclient.grpc as grpcclient class BufferAttributesTest(tu.TestResultCollector): - def test_buffer_attributes(self): - model_name = 'bls' + model_name = "bls" # Infer clients = [ - httpclient.InferenceServerClient(url='localhost:8000'), - grpcclient.InferenceServerClient(url='localhost:8001') + httpclient.InferenceServerClient(url="localhost:8000"), + grpcclient.InferenceServerClient(url="localhost:8001"), ] triton_clients = [httpclient, grpcclient] for i, client in enumerate(clients): - # To make sure no shared memory regions are registered with the # server. client.unregister_system_shared_memory() @@ -59,8 +59,7 @@ def test_buffer_attributes(self): triton_client = triton_clients[i] inputs = [] outputs = [] - inputs.append(triton_client.InferInput('INPUT0', [1, 1000], - "INT32")) + inputs.append(triton_client.InferInput("INPUT0", [1, 1000], "INT32")) input0_data = np.arange(start=0, stop=1000, dtype=np.int32) input0_data = np.expand_dims(input0_data, axis=0) @@ -69,45 +68,55 @@ def test_buffer_attributes(self): output_byte_size = input_byte_size shm_ip0_handle = cudashm.create_shared_memory_region( - "input0_data", input_byte_size, 0) + "input0_data", input_byte_size, 0 + ) shm_op0_handle = cudashm.create_shared_memory_region( - "output0_data", output_byte_size, 0) + "output0_data", output_byte_size, 0 + ) client.register_cuda_shared_memory( - "input0_data", cudashm.get_raw_handle(shm_ip0_handle), 0, - input_byte_size) + "input0_data", + cudashm.get_raw_handle(shm_ip0_handle), + 0, + input_byte_size, + ) client.register_cuda_shared_memory( - "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, - input_byte_size) + "output0_data", + cudashm.get_raw_handle(shm_op0_handle), + 0, + input_byte_size, + ) cudashm.set_shared_memory_region(shm_ip0_handle, [input0_data]) inputs[0].set_shared_memory("input0_data", input_byte_size) if triton_client is grpcclient: - outputs.append(triton_client.InferRequestedOutput('OUTPUT0')) + outputs.append(triton_client.InferRequestedOutput("OUTPUT0")) outputs[0].set_shared_memory("output0_data", output_byte_size) else: outputs.append( - triton_client.InferRequestedOutput('OUTPUT0', - binary_data=True)) + triton_client.InferRequestedOutput("OUTPUT0", binary_data=True) + ) outputs[0].set_shared_memory("output0_data", output_byte_size) - results = client.infer(model_name=model_name, - inputs=inputs, - outputs=outputs) + results = client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) output0 = results.get_output("OUTPUT0") 
self.assertIsNotNone(output0) if triton_client is grpcclient: output0_data = cudashm.get_contents_as_numpy( - shm_op0_handle, triton_to_np_dtype(output0.datatype), - output0.shape) + shm_op0_handle, triton_to_np_dtype(output0.datatype), output0.shape + ) else: output0_data = cudashm.get_contents_as_numpy( - shm_op0_handle, triton_to_np_dtype(output0['datatype']), - output0['shape']) + shm_op0_handle, + triton_to_np_dtype(output0["datatype"]), + output0["shape"], + ) self.assertTrue(np.all(output0_data == input0_data)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_buffer_attributes/models/bls/1/model.py b/qa/L0_buffer_attributes/models/bls/1/model.py old mode 100644 new mode 100755 index 201d5a4a5e..6c035bb6a4 --- a/qa/L0_buffer_attributes/models/bls/1/model.py +++ b/qa/L0_buffer_attributes/models/bls/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,25 +31,26 @@ # Simple Python model that executes a BLS request on an identity model. class TritonPythonModel: - def execute(self, requests): responses = [] for request in requests: # Get INPUT0 - input0 = pb_utils.get_input_tensor_by_name(request, 'INPUT0') + input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") infer_request = pb_utils.InferenceRequest( - model_name='identity', + model_name="identity", requested_output_names=["OUTPUT0"], - inputs=[input0]) + inputs=[input0], + ) infer_response = infer_request.exec() if infer_response.has_error(): - raise pb_utils.TritonModelException( - infer_response.error().message()) + raise pb_utils.TritonModelException(infer_response.error().message()) - inference_response = pb_utils.InferenceResponse(output_tensors=[ - pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0') - ]) + inference_response = pb_utils.InferenceResponse( + output_tensors=[ + pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + ] + ) responses.append(inference_response) return responses diff --git a/qa/L0_buffer_attributes/models/identity/1/model.py b/qa/L0_buffer_attributes/models/identity/1/model.py old mode 100644 new mode 100755 index 74b114deb7..933ed6d9c5 --- a/qa/L0_buffer_attributes/models/identity/1/model.py +++ b/qa/L0_buffer_attributes/models/identity/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,7 +30,6 @@ class TritonPythonModel: - def execute(self, requests): """ Identity model using DLPack in Python backend. 
@@ -36,7 +37,8 @@ def execute(self, requests): responses = [] for request in requests: input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") - out_tensor = pb_utils.Tensor.from_dlpack("OUTPUT0", - input_tensor.to_dlpack()) + out_tensor = pb_utils.Tensor.from_dlpack( + "OUTPUT0", input_tensor.to_dlpack() + ) responses.append(pb_utils.InferenceResponse([out_tensor])) return responses diff --git a/qa/L0_buffer_attributes/test.sh b/qa/L0_buffer_attributes/test.sh old mode 100644 new mode 100755 index 52babf37e2..7e2f35d837 --- a/qa/L0_buffer_attributes/test.sh +++ b/qa/L0_buffer_attributes/test.sh @@ -1,4 +1,5 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/bin/bash +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions diff --git a/qa/L0_client_build_variants/test.sh b/qa/L0_client_build_variants/test.sh index be8ae2c15e..63eb34fa5a 100755 --- a/qa/L0_client_build_variants/test.sh +++ b/qa/L0_client_build_variants/test.sh @@ -40,7 +40,7 @@ apt update && apt install -y gpg wget && \ echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | \ tee /etc/apt/sources.list.d/kitware.list >/dev/null && \ apt-get update && \ - apt-get install -y --no-install-recommends cmake cmake-data + apt-get install -y --no-install-recommends cmake cmake-data cmake --version diff --git a/qa/L0_client_java/test.sh b/qa/L0_client_java/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_client_memory_growth/client_memory_mail.py b/qa/L0_client_memory_growth/client_memory_mail.py old mode 100644 new mode 100755 index 4662f4ba41..ef1703f2c3 --- a/qa/L0_client_memory_growth/client_memory_mail.py +++ b/qa/L0_client_memory_growth/client_memory_mail.py @@ -29,18 +29,22 @@ sys.path.append("../common") -import nightly_email_helper - import glob from datetime import date -if __name__ == '__main__': +import nightly_email_helper + +if __name__ == "__main__": today = date.today().strftime("%Y-%m-%d") subject = "Triton Client Memory Growth " + sys.argv[1] + " Summary: " + today memory_graphs = glob.glob("client_memory_growth*.log") write_up = "

<p>This test is run for both HTTP and GRPC protocols using C++ and Python test scripts. The max-allowed difference between mean and maximum memory usage is set to 10MB and 1MB for C++ and Python tests individually.</p>"
     write_up += "<p>• What to look for<br>A linear memory growth in the beginning of the graph is acceptable only when it is followed by a flat memory usage. If a linear memory growth is observed during the entire test then there is possibly a memory leak.</p>"
-    html_content = "<html><head></head><body><pre>" + write_up + "</pre></body></html>"
+    html_content = (
+        '<html><head></head><body><pre>'
+        + write_up
+        + '</pre></body></html>'
+    )
     for mem_graph in sorted(memory_graphs):
         html_content += "\n" + mem_graph + "\n"
         with open(mem_graph, "r") as f:
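
For readers skimming the hunk above: client_memory_mail.py only assembles an HTML summary of the memory-growth graphs and hands it off to the nightly_email_helper module. A minimal sketch of that assembly step, assuming nothing beyond the standard library (the build_report name and the omission of the actual send call are illustrative, not part of the file):

import glob
from datetime import date


def build_report(pattern="client_memory_growth*.log"):
    # Collect the per-protocol memory graphs written by the test run.
    memory_graphs = sorted(glob.glob(pattern))
    subject = "Triton Client Memory Growth Summary: " + date.today().strftime("%Y-%m-%d")

    # <pre> keeps the ASCII graphs aligned when the mail is rendered as HTML.
    body = "<html><head></head><body><pre>"
    for mem_graph in memory_graphs:
        body += "\n" + mem_graph + "\n"
        with open(mem_graph, "r") as f:
            body += f.read()
    body += "</pre></body></html>"
    return subject, body
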
diff --git a/qa/L0_client_memory_growth/models/custom_identity_int32/config.pbtxt b/qa/L0_client_memory_growth/models/custom_identity_int32/config.pbtxt
index 8d3a78baf4..6a2a76bde5 100644
--- a/qa/L0_client_memory_growth/models/custom_identity_int32/config.pbtxt
+++ b/qa/L0_client_memory_growth/models/custom_identity_int32/config.pbtxt
@@ -35,7 +35,7 @@ input [
     name: "INPUT0"
     data_type: TYPE_INT32
     dims: [ -1 ]
-    
+
   }
 ]
 output [
diff --git a/qa/L0_client_memory_growth/test.sh b/qa/L0_client_memory_growth/test.sh
index 8d90a649cf..73188812b2 100755
--- a/qa/L0_client_memory_growth/test.sh
+++ b/qa/L0_client_memory_growth/test.sh
@@ -117,7 +117,7 @@ for PROTOCOL in http grpc; do
             MEMORY_GROWTH_TEST=$MEMORY_GROWTH_TEST_CPP
             MAX_ALLOWED_ALLOC="10"
             # NOTE: This test has risk of exhausting all available sockets in
-            # the ephemeral port range. Re-using the same client connection 
+            # the ephemeral port range. Re-using the same client connection
             # ("-R") can easily solve this problem. However, to cleanly separate
             # the resources used by different client objects, we create new
             # connections for each request and retry/sleep on failure to give
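
The NOTE in this hunk is worth a concrete illustration: creating a brand-new client object for every request leaves a socket behind each time and can exhaust the ephemeral port range, whereas reusing a single client (what the C++ test's "-R" flag does) avoids it. A rough Python sketch of the two patterns, assuming a local server and borrowing the custom_identity_int32 model from this test purely for illustration:

import numpy as np
import tritonclient.http as httpclient


def run_inferences(count, reuse_connection=True):
    inputs = [httpclient.InferInput("INPUT0", [1, 16], "INT32")]
    inputs[0].set_data_from_numpy(np.zeros((1, 16), dtype=np.int32))

    if reuse_connection:
        # One client object serves all requests, so only a handful of
        # sockets are ever opened ("-R" behavior).
        client = httpclient.InferenceServerClient(url="localhost:8000")
        for _ in range(count):
            client.infer("custom_identity_int32", inputs)
    else:
        # A fresh client per request mirrors what this test does on purpose;
        # under load it can run the OS out of ephemeral ports, hence the
        # retry/sleep logic mentioned in the comment above.
        for _ in range(count):
            client = httpclient.InferenceServerClient(url="localhost:8000")
            client.infer("custom_identity_int32", inputs)
            client.close()
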
diff --git a/qa/L0_client_nobatch/client_test.py b/qa/L0_client_nobatch/client_test.py
old mode 100644
new mode 100755
index d3a0e5f596..ed6a3149df
--- a/qa/L0_client_nobatch/client_test.py
+++ b/qa/L0_client_nobatch/client_test.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -29,15 +31,15 @@
 sys.path.append("../common")
 
 import unittest
+
 import numpy as np
-import tritonhttpclient
+import test_util as tu
 import tritongrpcclient
+import tritonhttpclient
 from tritonclientutils import InferenceServerException
-import test_util as tu
 
 
 class ClientNoBatchTest(tu.TestResultCollector):
-
     def test_nobatch_request_for_batching_model(self):
         input_size = 16
 
@@ -46,53 +48,46 @@ def test_nobatch_request_for_batching_model(self):
         # input shapes.
         tensor_shape = (input_size,)
         for protocol in ["http", "grpc"]:
-            model_name = tu.get_model_name("graphdef", np.int32, np.int8,
-                                           np.int8)
-            in0 = np.random.randint(low=0,
-                                    high=100,
-                                    size=tensor_shape,
-                                    dtype=np.int32)
-            in1 = np.random.randint(low=0,
-                                    high=100,
-                                    size=tensor_shape,
-                                    dtype=np.int32)
+            model_name = tu.get_model_name("graphdef", np.int32, np.int8, np.int8)
+            in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
+            in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
 
             inputs = []
             outputs = []
             if protocol == "http":
                 triton_client = tritonhttpclient.InferenceServerClient(
-                    url='localhost:8000', verbose=True)
+                    url="localhost:8000", verbose=True
+                )
                 inputs.append(
-                    tritonhttpclient.InferInput('INPUT0', tensor_shape,
-                                                "INT32"))
+                    tritonhttpclient.InferInput("INPUT0", tensor_shape, "INT32")
+                )
                 inputs.append(
-                    tritonhttpclient.InferInput('INPUT1', tensor_shape,
-                                                "INT32"))
-                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
-                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
+                    tritonhttpclient.InferInput("INPUT1", tensor_shape, "INT32")
+                )
+                outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT0"))
+                outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT1"))
             else:
                 triton_client = tritongrpcclient.InferenceServerClient(
-                    url='localhost:8001', verbose=True)
+                    url="localhost:8001", verbose=True
+                )
                 inputs.append(
-                    tritongrpcclient.InferInput('INPUT0', tensor_shape,
-                                                "INT32"))
+                    tritongrpcclient.InferInput("INPUT0", tensor_shape, "INT32")
+                )
                 inputs.append(
-                    tritongrpcclient.InferInput('INPUT1', tensor_shape,
-                                                "INT32"))
-                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
-                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))
+                    tritongrpcclient.InferInput("INPUT1", tensor_shape, "INT32")
+                )
+                outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0"))
+                outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1"))
 
             # Initialize the data
             inputs[0].set_data_from_numpy(in0)
             inputs[1].set_data_from_numpy(in1)
 
             try:
-                results = triton_client.infer(model_name,
-                                              inputs,
-                                              outputs=outputs)
+                results = triton_client.infer(model_name, inputs, outputs=outputs)
                 self.assertTrue(
-                    False,
-                    "expected failure with no batch request for batching model")
+                    False, "expected failure with no batch request for batching model"
+                )
             except InferenceServerException as ex:
                 pass
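
The hunk above exercises the failure path: a model configured with a non-zero max_batch_size expects the batch dimension as the leading axis, so a bare (16,) tensor is rejected with an InferenceServerException. A short sketch of the shape that would succeed, using the same tritonhttpclient API as the test (the helper name is illustrative, not part of the test):

import numpy as np
import tritonhttpclient


def infer_batched(model_name, data):
    # Prepend the batch dimension: (16,) -> (1, 16). Without it, a
    # batching model rejects the request, which is what the test expects.
    batched = np.expand_dims(data, axis=0)
    client = tritonhttpclient.InferenceServerClient(url="localhost:8000")
    inp = tritonhttpclient.InferInput("INPUT0", list(batched.shape), "INT32")
    inp.set_data_from_numpy(batched)
    return client.infer(model_name, [inp])
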
 
@@ -104,53 +99,48 @@ def test_batch_request_for_nobatching_model(self):
         # is included in the shape
         tensor_shape = (1, input_size)
         for protocol in ["http", "grpc"]:
-            model_name = tu.get_model_name("graphdef_nobatch", np.int32,
-                                           np.int8, np.int8)
-            in0 = np.random.randint(low=0,
-                                    high=100,
-                                    size=tensor_shape,
-                                    dtype=np.int32)
-            in1 = np.random.randint(low=0,
-                                    high=100,
-                                    size=tensor_shape,
-                                    dtype=np.int32)
+            model_name = tu.get_model_name(
+                "graphdef_nobatch", np.int32, np.int8, np.int8
+            )
+            in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
+            in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
 
             inputs = []
             outputs = []
             if protocol == "http":
                 triton_client = tritonhttpclient.InferenceServerClient(
-                    url='localhost:8000', verbose=True)
+                    url="localhost:8000", verbose=True
+                )
                 inputs.append(
-                    tritonhttpclient.InferInput('INPUT0', tensor_shape,
-                                                "INT32"))
+                    tritonhttpclient.InferInput("INPUT0", tensor_shape, "INT32")
+                )
                 inputs.append(
-                    tritonhttpclient.InferInput('INPUT1', tensor_shape,
-                                                "INT32"))
-                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
-                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
+                    tritonhttpclient.InferInput("INPUT1", tensor_shape, "INT32")
+                )
+                outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT0"))
+                outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT1"))
             else:
                 triton_client = tritongrpcclient.InferenceServerClient(
-                    url='localhost:8001', verbose=True)
+                    url="localhost:8001", verbose=True
+                )
                 inputs.append(
-                    tritongrpcclient.InferInput('INPUT0', tensor_shape,
-                                                "INT32"))
+                    tritongrpcclient.InferInput("INPUT0", tensor_shape, "INT32")
+                )
                 inputs.append(
-                    tritongrpcclient.InferInput('INPUT1', tensor_shape,
-                                                "INT32"))
-                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
-                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))
+                    tritongrpcclient.InferInput("INPUT1", tensor_shape, "INT32")
+                )
+                outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0"))
+                outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1"))
 
             # Initialize the data
             inputs[0].set_data_from_numpy(in0)
             inputs[1].set_data_from_numpy(in1)
 
             try:
-                results = triton_client.infer(model_name,
-                                              inputs,
-                                              outputs=outputs)
+                results = triton_client.infer(model_name, inputs, outputs=outputs)
                 self.assertTrue(
                     False,
-                    "expected failure with batched request for non-batching model"
+                    "expected failure with batched request for non-batching model",
                 )
             except InferenceServerException as ex:
                 pass
@@ -163,41 +153,38 @@ def test_nobatch_request_for_nonbatching_model(self):
         # input shapes.
         tensor_shape = (input_size,)
         for protocol in ["http", "grpc"]:
-            model_name = tu.get_model_name("graphdef_nobatch", np.int32,
-                                           np.int8, np.int8)
-            in0 = np.random.randint(low=0,
-                                    high=100,
-                                    size=tensor_shape,
-                                    dtype=np.int32)
-            in1 = np.random.randint(low=0,
-                                    high=100,
-                                    size=tensor_shape,
-                                    dtype=np.int32)
+            model_name = tu.get_model_name(
+                "graphdef_nobatch", np.int32, np.int8, np.int8
+            )
+            in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
+            in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
 
             inputs = []
             outputs = []
             if protocol == "http":
                 triton_client = tritonhttpclient.InferenceServerClient(
-                    url='localhost:8000', verbose=True)
+                    url="localhost:8000", verbose=True
+                )
                 inputs.append(
-                    tritonhttpclient.InferInput('INPUT0', tensor_shape,
-                                                "INT32"))
+                    tritonhttpclient.InferInput("INPUT0", tensor_shape, "INT32")
+                )
                 inputs.append(
-                    tritonhttpclient.InferInput('INPUT1', tensor_shape,
-                                                "INT32"))
-                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
-                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
+                    tritonhttpclient.InferInput("INPUT1", tensor_shape, "INT32")
+                )
+                outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT0"))
+                outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT1"))
             else:
                 triton_client = tritongrpcclient.InferenceServerClient(
-                    url='localhost:8001', verbose=True)
+                    url="localhost:8001", verbose=True
+                )
                 inputs.append(
-                    tritongrpcclient.InferInput('INPUT0', tensor_shape,
-                                                "INT32"))
+                    tritongrpcclient.InferInput("INPUT0", tensor_shape, "INT32")
+                )
                 inputs.append(
-                    tritongrpcclient.InferInput('INPUT1', tensor_shape,
-                                                "INT32"))
-                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
-                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))
+                    tritongrpcclient.InferInput("INPUT1", tensor_shape, "INT32")
+                )
+                outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0"))
+                outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1"))
 
             # Initialize the data
             inputs[0].set_data_from_numpy(in0)
@@ -213,41 +200,36 @@ def test_batch_request_for_batching_model(self):
         # is included in the shape
         tensor_shape = (1, input_size)
         for protocol in ["http", "grpc"]:
-            model_name = tu.get_model_name("graphdef", np.int32, np.int8,
-                                           np.int8)
-            in0 = np.random.randint(low=0,
-                                    high=100,
-                                    size=tensor_shape,
-                                    dtype=np.int32)
-            in1 = np.random.randint(low=0,
-                                    high=100,
-                                    size=tensor_shape,
-                                    dtype=np.int32)
+            model_name = tu.get_model_name("graphdef", np.int32, np.int8, np.int8)
+            in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
+            in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
 
             inputs = []
             outputs = []
             if protocol == "http":
                 triton_client = tritonhttpclient.InferenceServerClient(
-                    url='localhost:8000', verbose=True)
+                    url="localhost:8000", verbose=True
+                )
                 inputs.append(
-                    tritonhttpclient.InferInput('INPUT0', tensor_shape,
-                                                "INT32"))
+                    tritonhttpclient.InferInput("INPUT0", tensor_shape, "INT32")
+                )
                 inputs.append(
-                    tritonhttpclient.InferInput('INPUT1', tensor_shape,
-                                                "INT32"))
-                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
-                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
+                    tritonhttpclient.InferInput("INPUT1", tensor_shape, "INT32")
+                )
+                outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT0"))
+                outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT1"))
             else:
                 triton_client = tritongrpcclient.InferenceServerClient(
-                    url='localhost:8001', verbose=True)
+                    url="localhost:8001", verbose=True
+                )
                 inputs.append(
-                    tritongrpcclient.InferInput('INPUT0', tensor_shape,
-                                                "INT32"))
+                    tritongrpcclient.InferInput("INPUT0", tensor_shape, "INT32")
+                )
                 inputs.append(
-                    tritongrpcclient.InferInput('INPUT1', tensor_shape,
-                                                "INT32"))
-                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
-                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))
+                    tritongrpcclient.InferInput("INPUT1", tensor_shape, "INT32")
+                )
+                outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0"))
+                outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1"))
 
             # Initialize the data
             inputs[0].set_data_from_numpy(in0)
@@ -256,5 +238,5 @@ def test_batch_request_for_batching_model(self):
             results = triton_client.infer(model_name, inputs, outputs=outputs)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_client_timeout/client_timeout_test.py b/qa/L0_client_timeout/client_timeout_test.py
old mode 100644
new mode 100755
index f85eec5084..af7ea768eb
--- a/qa/L0_client_timeout/client_timeout_test.py
+++ b/qa/L0_client_timeout/client_timeout_test.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -28,20 +30,19 @@
 
 sys.path.append("../common")
 
-from functools import partial
-import numpy as np
 import queue
-import unittest
 import socket
-import test_util as tu
+import unittest
+from functools import partial
 
+import numpy as np
+import test_util as tu
 import tritongrpcclient as grpcclient
 import tritonhttpclient as httpclient
 from tritonclientutils import InferenceServerException
 
 
 class UserData:
-
     def __init__(self):
         self._completed_requests = queue.Queue()
 
@@ -54,54 +55,57 @@ def callback(user_data, result, error):
 
 
 class ClientTimeoutTest(tu.TestResultCollector):
-
     def setUp(self):
         self.model_name_ = "custom_identity_int32"
         self.input0_data_ = np.array([[10]], dtype=np.int32)
 
     def _prepare_request(self, protocol):
-        if (protocol == "grpc"):
+        if protocol == "grpc":
             self.inputs_ = []
-            self.inputs_.append(grpcclient.InferInput('INPUT0', [1, 1],
-                                                      "INT32"))
+            self.inputs_.append(grpcclient.InferInput("INPUT0", [1, 1], "INT32"))
             self.outputs_ = []
-            self.outputs_.append(grpcclient.InferRequestedOutput('OUTPUT0'))
+            self.outputs_.append(grpcclient.InferRequestedOutput("OUTPUT0"))
         else:
             self.inputs_ = []
-            self.inputs_.append(httpclient.InferInput('INPUT0', [1, 1],
-                                                      "INT32"))
+            self.inputs_.append(httpclient.InferInput("INPUT0", [1, 1], "INT32"))
             self.outputs_ = []
-            self.outputs_.append(httpclient.InferRequestedOutput('OUTPUT0'))
+            self.outputs_.append(httpclient.InferRequestedOutput("OUTPUT0"))
 
         self.inputs_[0].set_data_from_numpy(self.input0_data_)
 
     def test_grpc_infer(self):
-        triton_client = grpcclient.InferenceServerClient(url="localhost:8001",
-                                                         verbose=True)
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
         self._prepare_request("grpc")
 
         # The model is configured to take three seconds to send the
         # response. Expect an exception for small timeout values.
         with self.assertRaises(InferenceServerException) as cm:
-            result = triton_client.infer(model_name=self.model_name_,
-                                         inputs=self.inputs_,
-                                         outputs=self.outputs_,
-                                         client_timeout=0.2)
+            result = triton_client.infer(
+                model_name=self.model_name_,
+                inputs=self.inputs_,
+                outputs=self.outputs_,
+                client_timeout=0.2,
+            )
         self.assertIn("Deadline Exceeded", str(cm.exception))
 
         # Expect inference to pass successfully for a large timeout
         # value
-        result = triton_client.infer(model_name=self.model_name_,
-                                     inputs=self.inputs_,
-                                     outputs=self.outputs_,
-                                     client_timeout=10)
-
-        output0_data = result.as_numpy('OUTPUT0')
+        result = triton_client.infer(
+            model_name=self.model_name_,
+            inputs=self.inputs_,
+            outputs=self.outputs_,
+            client_timeout=10,
+        )
+
+        output0_data = result.as_numpy("OUTPUT0")
         self.assertTrue(np.array_equal(self.input0_data_, output0_data))
 
     def test_grpc_async_infer(self):
-        triton_client = grpcclient.InferenceServerClient(url="localhost:8001",
-                                                         verbose=True)
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
         self._prepare_request("grpc")
 
         user_data = UserData()
@@ -109,11 +113,13 @@ def test_grpc_async_infer(self):
         # The model is configured to take three seconds to send the
         # response. Expect an exception for small timeout values.
         with self.assertRaises(InferenceServerException) as cm:
-            triton_client.async_infer(model_name=self.model_name_,
-                                      inputs=self.inputs_,
-                                      callback=partial(callback, user_data),
-                                      outputs=self.outputs_,
-                                      client_timeout=2)
+            triton_client.async_infer(
+                model_name=self.model_name_,
+                inputs=self.inputs_,
+                callback=partial(callback, user_data),
+                outputs=self.outputs_,
+                client_timeout=2,
+            )
             data_item = user_data._completed_requests.get()
             if type(data_item) == InferenceServerException:
                 raise data_item
@@ -121,23 +127,25 @@ def test_grpc_async_infer(self):
 
         # Expect inference to pass successfully for a large timeout
         # value
-        triton_client.async_infer(model_name=self.model_name_,
-                                  inputs=self.inputs_,
-                                  callback=partial(callback, user_data),
-                                  outputs=self.outputs_,
-                                  client_timeout=10)
+        triton_client.async_infer(
+            model_name=self.model_name_,
+            inputs=self.inputs_,
+            callback=partial(callback, user_data),
+            outputs=self.outputs_,
+            client_timeout=10,
+        )
 
         # Wait until the results are available in user_data
         data_item = user_data._completed_requests.get()
         self.assertFalse(type(data_item) == InferenceServerException)
 
-        output0_data = data_item.as_numpy('OUTPUT0')
+        output0_data = data_item.as_numpy("OUTPUT0")
         self.assertTrue(np.array_equal(self.input0_data_, output0_data))
 
     def test_grpc_stream_infer(self):
-
-        triton_client = grpcclient.InferenceServerClient(url="localhost:8001",
-                                                         verbose=True)
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
 
         self._prepare_request("grpc")
         user_data = UserData()
@@ -146,11 +154,12 @@ def test_grpc_stream_infer(self):
         # response. Expect an exception for small timeout values.
         with self.assertRaises(InferenceServerException) as cm:
             triton_client.stop_stream()
-            triton_client.start_stream(callback=partial(callback, user_data),
-                                       stream_timeout=1)
-            triton_client.async_stream_infer(model_name=self.model_name_,
-                                             inputs=self.inputs_,
-                                             outputs=self.outputs_)
+            triton_client.start_stream(
+                callback=partial(callback, user_data), stream_timeout=1
+            )
+            triton_client.async_stream_infer(
+                model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_
+            )
             data_item = user_data._completed_requests.get()
             if type(data_item) == InferenceServerException:
                 raise data_item
@@ -159,73 +168,75 @@ def test_grpc_stream_infer(self):
         # Expect inference to pass successfully for a large timeout
         # value
         triton_client.stop_stream()
-        triton_client.start_stream(callback=partial(callback, user_data),
-                                   stream_timeout=100)
+        triton_client.start_stream(
+            callback=partial(callback, user_data), stream_timeout=100
+        )
 
-        triton_client.async_stream_infer(model_name=self.model_name_,
-                                         inputs=self.inputs_,
-                                         outputs=self.outputs_)
+        triton_client.async_stream_infer(
+            model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_
+        )
         data_item = user_data._completed_requests.get()
         triton_client.stop_stream()
 
         if type(data_item) == InferenceServerException:
             raise data_item
-        output0_data = data_item.as_numpy('OUTPUT0')
+        output0_data = data_item.as_numpy("OUTPUT0")
         self.assertTrue(np.array_equal(self.input0_data_, output0_data))
 
     def test_http_infer(self):
-
         self._prepare_request("http")
 
         # The model is configured to take three seconds to send the
         # response. Expect an exception for small timeout values.
         with self.assertRaises(socket.timeout) as cm:
             triton_client = httpclient.InferenceServerClient(
-                url="localhost:8000", verbose=True, network_timeout=2.0)
-            result = triton_client.infer(model_name=self.model_name_,
-                                         inputs=self.inputs_,
-                                         outputs=self.outputs_)
+                url="localhost:8000", verbose=True, network_timeout=2.0
+            )
+            result = triton_client.infer(
+                model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_
+            )
         self.assertIn("timed out", str(cm.exception))
 
         # Expect to successfully pass with sufficiently large timeout
         triton_client = httpclient.InferenceServerClient(
-            url="localhost:8000", verbose=True, connection_timeout=10.0)
+            url="localhost:8000", verbose=True, connection_timeout=10.0
+        )
 
-        result = triton_client.infer(model_name=self.model_name_,
-                                     inputs=self.inputs_,
-                                     outputs=self.outputs_)
+        result = triton_client.infer(
+            model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_
+        )
 
-        output0_data = result.as_numpy('OUTPUT0')
+        output0_data = result.as_numpy("OUTPUT0")
         self.assertTrue(np.array_equal(self.input0_data_, output0_data))
 
     def test_http_async_infer(self):
-
         self._prepare_request("http")
 
         # The model is configured to take three seconds to send the
         # response. Expect an exception for small timeout values.
         with self.assertRaises(socket.timeout) as cm:
             triton_client = httpclient.InferenceServerClient(
-                url="localhost:8000", verbose=True, network_timeout=2.0)
+                url="localhost:8000", verbose=True, network_timeout=2.0
+            )
             async_request = triton_client.async_infer(
-                model_name=self.model_name_,
-                inputs=self.inputs_,
-                outputs=self.outputs_)
+                model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_
+            )
             result = async_request.get_result()
         self.assertIn("timed out", str(cm.exception))
 
         # Expect to successfully pass with sufficiently large timeout
         triton_client = httpclient.InferenceServerClient(
-            url="localhost:8000", verbose=True, connection_timeout=10.0)
+            url="localhost:8000", verbose=True, connection_timeout=10.0
+        )
 
-        async_request = triton_client.async_infer(model_name=self.model_name_,
-                                                  inputs=self.inputs_,
-                                                  outputs=self.outputs_)
+        async_request = triton_client.async_infer(
+            model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_
+        )
         result = async_request.get_result()
 
-        output0_data = result.as_numpy('OUTPUT0')
+        output0_data = result.as_numpy("OUTPUT0")
         self.assertTrue(np.array_equal(self.input0_data_, output0_data))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_client_timeout/models/custom_identity_int32/config.pbtxt b/qa/L0_client_timeout/models/custom_identity_int32/config.pbtxt
index a42c5dcd45..1732ff32fd 100644
--- a/qa/L0_client_timeout/models/custom_identity_int32/config.pbtxt
+++ b/qa/L0_client_timeout/models/custom_identity_int32/config.pbtxt
@@ -35,7 +35,7 @@ input [
     name: "INPUT0"
     data_type: TYPE_INT32
     dims: [ -1 ]
-    
+
   }
 ]
 output [
diff --git a/qa/L0_client_timeout/test.sh b/qa/L0_client_timeout/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_client_valgrind/models/custom_identity_int32/config.pbtxt b/qa/L0_client_valgrind/models/custom_identity_int32/config.pbtxt
index 8d3a78baf4..6a2a76bde5 100644
--- a/qa/L0_client_valgrind/models/custom_identity_int32/config.pbtxt
+++ b/qa/L0_client_valgrind/models/custom_identity_int32/config.pbtxt
@@ -35,7 +35,7 @@ input [
     name: "INPUT0"
     data_type: TYPE_INT32
     dims: [ -1 ]
-    
+
   }
 ]
 output [
diff --git a/qa/L0_cmdline_trace/test.sh b/qa/L0_cmdline_trace/test.sh
index 3de5328610..66f9a08fc0 100755
--- a/qa/L0_cmdline_trace/test.sh
+++ b/qa/L0_cmdline_trace/test.sh
@@ -570,7 +570,7 @@ else
 fi
 
 
-# check deprecation warnings 
+# check deprecation warnings
 SERVER_ARGS=" --trace-file=/tmp/trace.json --trace-rate=100 --trace-level=TIMESTAMPS \
               --trace-log-frequency=50 --trace-count=100 --model-repository=$MODELSDIR"
 SERVER_LOG="./inference_server_trace_config_flag.log"
diff --git a/qa/L0_cmdline_trace/trace_client.py b/qa/L0_cmdline_trace/trace_client.py
old mode 100644
new mode 100755
index 8e19ba6fb7..4d59579d7c
--- a/qa/L0_cmdline_trace/trace_client.py
+++ b/qa/L0_cmdline_trace/trace_client.py
@@ -26,24 +26,26 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
-import numpy as np
 import sys
 
+import numpy as np
 import tritonclient.grpc as grpcclient
 import tritonclient.http as httpclient
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-u',
-                        '--url',
-                        type=str,
-                        required=False,
-                        default='localhost:8001',
-                        help='Inference server URL. Default is localhost:8001.')
-    parser.add_argument('-i', '--protocol', type=str, required=True)
+    parser.add_argument(
+        "-u",
+        "--url",
+        type=str,
+        required=False,
+        default="localhost:8001",
+        help="Inference server URL. Default is localhost:8001.",
+    )
+    parser.add_argument("-i", "--protocol", type=str, required=True)
     FLAGS = parser.parse_args()
 
-    if FLAGS.protocol == 'grpc':
+    if FLAGS.protocol == "grpc":
         client_type = grpcclient
     else:
         client_type = httpclient
@@ -59,8 +61,8 @@
     # Infer
     inputs = []
     outputs = []
-    inputs.append(client_type.InferInput('INPUT0', [1, 16], "INT32"))
-    inputs.append(client_type.InferInput('INPUT1', [1, 16], "INT32"))
+    inputs.append(client_type.InferInput("INPUT0", [1, 16], "INT32"))
+    inputs.append(client_type.InferInput("INPUT1", [1, 16], "INT32"))
 
     input0_data = np.arange(start=0, stop=16, dtype=np.int32)
     input0_data = np.expand_dims(input0_data, axis=0)
@@ -69,10 +71,9 @@
     inputs[0].set_data_from_numpy(input0_data)
     inputs[1].set_data_from_numpy(input1_data)
 
-    outputs.append(client_type.InferRequestedOutput('OUTPUT0'))
-    outputs.append(client_type.InferRequestedOutput('OUTPUT1'))
+    outputs.append(client_type.InferRequestedOutput("OUTPUT0"))
+    outputs.append(client_type.InferRequestedOutput("OUTPUT1"))
 
-    triton_client.infer(model_name=model_name,
-                        inputs=inputs,
-                        outputs=outputs,
-                        request_id="1")
+    triton_client.infer(
+        model_name=model_name, inputs=inputs, outputs=outputs, request_id="1"
+    )
diff --git a/qa/L0_cuda_graph/test.sh b/qa/L0_cuda_graph/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_cuda_graph/trt_cuda_graph_test.py b/qa/L0_cuda_graph/trt_cuda_graph_test.py
old mode 100644
new mode 100755
index 6cb68255ae..a7f9f3be98
--- a/qa/L0_cuda_graph/trt_cuda_graph_test.py
+++ b/qa/L0_cuda_graph/trt_cuda_graph_test.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -29,8 +31,9 @@
 sys.path.append("../common")
 
 import unittest
-import numpy as np
+
 import infer_util as iu
+import numpy as np
 import test_util as tu
 from tritonclientutils import *
 
@@ -49,22 +52,25 @@ def _check_infer(self, tensor_shape, batch_size=1):
                 full_shape = (batch_size,) + tensor_shape
             else:
                 full_shape = tensor_shape
-            iu.infer_exact(self,
-                           self.model_name_,
-                           full_shape,
-                           batch_size,
-                           self.dtype_,
-                           self.dtype_,
-                           self.dtype_,
-                           model_version=1,
-                           use_http_json_tensors=False,
-                           use_grpc=False,
-                           use_streaming=False)
+            iu.infer_exact(
+                self,
+                self.model_name_,
+                full_shape,
+                batch_size,
+                self.dtype_,
+                self.dtype_,
+                self.dtype_,
+                model_version=1,
+                use_http_json_tensors=False,
+                use_grpc=False,
+                use_streaming=False,
+            )
         except InferenceServerException as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
     def _erroneous_infer(self, tensor_shape, batch_size):
         import tritonhttpclient
+
         item_size = batch_size
         for dim in tensor_shape:
             item_size *= dim
@@ -75,30 +81,38 @@ def _erroneous_infer(self, tensor_shape, batch_size):
 
         inputs = []
         inputs.append(
-            tritonhttpclient.InferInput('INPUT0', full_shape, self.dtype_str_))
+            tritonhttpclient.InferInput("INPUT0", full_shape, self.dtype_str_)
+        )
         inputs[-1].set_data_from_numpy(input_np)
         inputs.append(
-            tritonhttpclient.InferInput('INPUT1', full_shape, self.dtype_str_))
+            tritonhttpclient.InferInput("INPUT1", full_shape, self.dtype_str_)
+        )
         inputs[-1].set_data_from_numpy(input_np)
         outputs = []
         outputs.append(
-            tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
+            tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True)
+        )
         outputs.append(
-            tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True))
+            tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True)
+        )
 
-        model_name = tu.get_model_name(self.model_name_, self.dtype_,
-                                       self.dtype_, self.dtype_)
+        model_name = tu.get_model_name(
+            self.model_name_, self.dtype_, self.dtype_, self.dtype_
+        )
         results = tritonhttpclient.InferenceServerClient(
-            "localhost:8000", verbose=True).infer(model_name=model_name,
-                                                  inputs=inputs,
-                                                  outputs=outputs)
+            "localhost:8000", verbose=True
+        ).infer(model_name=model_name, inputs=inputs, outputs=outputs)
         # Validate the results by comparing with precomputed values.
-        output0_np = results.as_numpy('OUTPUT0')
-        output1_np = results.as_numpy('OUTPUT1')
-        self.assertFalse(np.array_equal(output0_np, expected_output0_np),
-                         "expects OUTPUT0 is not correct")
-        self.assertFalse(np.array_equal(output1_np, expected_output1_np),
-                         "expects OUTPUT1 is not correct")
+        output0_np = results.as_numpy("OUTPUT0")
+        output1_np = results.as_numpy("OUTPUT1")
+        self.assertFalse(
+            np.array_equal(output0_np, expected_output0_np),
+            "expects OUTPUT0 is not correct",
+        )
+        self.assertFalse(
+            np.array_equal(output1_np, expected_output1_np),
+            "expects OUTPUT1 is not correct",
+        )
 
     def test_fixed_shape(self):
         tensor_shape = (16,)
@@ -142,7 +156,7 @@ def test_nobatch_fixed_shape(self):
         self._check_infer((16,), 0)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     if len(sys.argv) > 2:
         TrtCudaGraphTest.MODELNAME = sys.argv.pop()
 
diff --git a/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py b/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py
old mode 100644
new mode 100755
index 2e8939951b..87fb7c1d3c
--- a/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py
+++ b/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -28,11 +30,11 @@
 
 sys.path.append("../common")
 
-import numpy as np
-import unittest
 import os
-import test_util as tu
+import unittest
 
+import numpy as np
+import test_util as tu
 import tritongrpcclient as grpcclient
 import tritonhttpclient as httpclient
 import tritonshmutils.cuda_shared_memory as cshm
@@ -40,16 +42,13 @@
 
 
 class CudaSharedMemoryTest(tu.TestResultCollector):
-
     def test_invalid_create_shm(self):
         # Raises error since tried to create invalid cuda shared memory region
         try:
-            shm_op0_handle = cshm.create_shared_memory_region(
-                "dummy_data", -1, 0)
+            shm_op0_handle = cshm.create_shared_memory_region("dummy_data", -1, 0)
             cshm.destroy_shared_memory_region(shm_op0_handle)
         except Exception as ex:
-            self.assertEqual(str(ex),
-                             "unable to create cuda shared memory handle")
+            self.assertEqual(str(ex), "unable to create cuda shared memory handle")
 
     def test_valid_create_set_register(self):
         # Create a valid cuda shared memory region, fill data in it and register
@@ -58,10 +57,12 @@ def test_valid_create_set_register(self):
         else:
             triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
         shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
-        cshm.set_shared_memory_region(shm_op0_handle,
-                                      [np.array([1, 2], dtype=np.float32)])
+        cshm.set_shared_memory_region(
+            shm_op0_handle, [np.array([1, 2], dtype=np.float32)]
+        )
         triton_client.register_cuda_shared_memory(
-            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
+            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
+        )
         shm_status = triton_client.get_cuda_shared_memory_status()
         if _protocol == "http":
             self.assertEqual(len(shm_status), 1)
@@ -92,7 +93,8 @@ def test_unregister_after_register(self):
             triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
         shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
         triton_client.register_cuda_shared_memory(
-            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
+            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
+        )
         triton_client.unregister_cuda_shared_memory("dummy_data")
         shm_status = triton_client.get_cuda_shared_memory_status()
         if _protocol == "http":
@@ -109,13 +111,16 @@ def test_reregister_after_register(self):
             triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
         shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
         triton_client.register_cuda_shared_memory(
-            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
+            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
+        )
         try:
             triton_client.register_cuda_shared_memory(
-                "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
+                "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
+            )
         except Exception as ex:
             self.assertIn(
-                "shared memory region 'dummy_data' already in manager", str(ex))
+                "shared memory region 'dummy_data' already in manager", str(ex)
+            )
         shm_status = triton_client.get_cuda_shared_memory_status()
         if _protocol == "http":
             self.assertEqual(len(shm_status), 1)
@@ -138,27 +143,33 @@ def _configure_sever(self):
         else:
             triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
         triton_client.register_cuda_shared_memory(
-            "input0_data", cshm.get_raw_handle(shm_ip0_handle), 0, 64)
+            "input0_data", cshm.get_raw_handle(shm_ip0_handle), 0, 64
+        )
         triton_client.register_cuda_shared_memory(
-            "input1_data", cshm.get_raw_handle(shm_ip1_handle), 0, 64)
+            "input1_data", cshm.get_raw_handle(shm_ip1_handle), 0, 64
+        )
         triton_client.register_cuda_shared_memory(
-            "output0_data", cshm.get_raw_handle(shm_op0_handle), 0, 64)
+            "output0_data", cshm.get_raw_handle(shm_op0_handle), 0, 64
+        )
         triton_client.register_cuda_shared_memory(
-            "output1_data", cshm.get_raw_handle(shm_op1_handle), 0, 64)
+            "output1_data", cshm.get_raw_handle(shm_op1_handle), 0, 64
+        )
         return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle]
 
     def _cleanup_server(self, shm_handles):
         for shm_handle in shm_handles:
             cshm.destroy_shared_memory_region(shm_handle)
 
-    def _basic_inference(self,
-                         shm_ip0_handle,
-                         shm_ip1_handle,
-                         shm_op0_handle,
-                         shm_op1_handle,
-                         error_msg,
-                         big_shm_name="",
-                         big_shm_size=64):
+    def _basic_inference(
+        self,
+        shm_ip0_handle,
+        shm_ip1_handle,
+        shm_op0_handle,
+        shm_op1_handle,
+        error_msg,
+        big_shm_name="",
+        big_shm_size=64,
+    ):
         input0_data = np.arange(start=0, stop=16, dtype=np.int32)
         input1_data = np.ones(shape=16, dtype=np.int32)
         inputs = []
@@ -167,16 +178,16 @@ def _basic_inference(self,
             triton_client = httpclient.InferenceServerClient(_url, verbose=True)
             inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
             inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
+            outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True))
             outputs.append(
-                httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
-            outputs.append(
-                httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))
+                httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)
+            )
         else:
             triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
             inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
             inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
-            outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
-            outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))
+            outputs.append(grpcclient.InferRequestedOutput("OUTPUT0"))
+            outputs.append(grpcclient.InferRequestedOutput("OUTPUT1"))
         inputs[0].set_shared_memory("input0_data", 64)
         if type(shm_ip1_handle) == np.array:
             inputs[1].set_data_from_numpy(input0_data, binary_data=True)
@@ -188,22 +199,21 @@ def _basic_inference(self,
         outputs[1].set_shared_memory("output1_data", 64)
 
         try:
-            results = triton_client.infer("simple",
-                                          inputs,
-                                          model_version="",
-                                          outputs=outputs)
-            output = results.get_output('OUTPUT0')
+            results = triton_client.infer(
+                "simple", inputs, model_version="", outputs=outputs
+            )
+            output = results.get_output("OUTPUT0")
             if _protocol == "http":
-                output_datatype = output['datatype']
-                output_shape = output['shape']
+                output_datatype = output["datatype"]
+                output_shape = output["shape"]
             else:
                 output_datatype = output.datatype
                 output_shape = output.shape
             output_dtype = triton_to_np_dtype(output_datatype)
-            output_data = cshm.get_contents_as_numpy(shm_op0_handle,
-                                                     output_dtype, output_shape)
-            self.assertTrue(
-                (output_data[0] == (input0_data + input1_data)).all())
+            output_data = cshm.get_contents_as_numpy(
+                shm_op0_handle, output_dtype, output_shape
+            )
+            self.assertTrue((output_data[0] == (input0_data + input1_data)).all())
         except Exception as ex:
             error_msg.append(str(ex))
 
@@ -211,8 +221,9 @@ def test_unregister_after_inference(self):
         # Unregister after inference
         error_msg = []
         shm_handles = self._configure_sever()
-        self._basic_inference(shm_handles[0], shm_handles[1], shm_handles[2],
-                              shm_handles[3], error_msg)
+        self._basic_inference(
+            shm_handles[0], shm_handles[1], shm_handles[2], shm_handles[3], error_msg
+        )
         if len(error_msg) > 0:
             raise Exception(str(error_msg))
         if _protocol == "http":
@@ -235,13 +246,15 @@ def test_register_after_inference(self):
             triton_client = httpclient.InferenceServerClient(_url, verbose=True)
         else:
             triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
-        self._basic_inference(shm_handles[0], shm_handles[1], shm_handles[2],
-                              shm_handles[3], error_msg)
+        self._basic_inference(
+            shm_handles[0], shm_handles[1], shm_handles[2], shm_handles[3], error_msg
+        )
         if len(error_msg) > 0:
             raise Exception(str(error_msg))
         shm_ip2_handle = cshm.create_shared_memory_region("input2_data", 64, 0)
         triton_client.register_cuda_shared_memory(
-            "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 64)
+            "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 64
+        )
         shm_status = triton_client.get_cuda_shared_memory_status()
         if _protocol == "http":
             self.assertEqual(len(shm_status), 5)
@@ -260,13 +273,22 @@ def test_too_big_shm(self):
         else:
             triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
         triton_client.register_cuda_shared_memory(
-            "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 128)
-        self._basic_inference(shm_handles[0], shm_ip2_handle, shm_handles[2],
-                              shm_handles[3], error_msg, "input2_data", 128)
+            "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 128
+        )
+        self._basic_inference(
+            shm_handles[0],
+            shm_ip2_handle,
+            shm_handles[2],
+            shm_handles[3],
+            error_msg,
+            "input2_data",
+            128,
+        )
         if len(error_msg) > 0:
             self.assertIn(
                 "unexpected total byte size 128 for input 'INPUT1', expecting 64",
-                error_msg[-1])
+                error_msg[-1],
+            )
         shm_handles.append(shm_ip2_handle)
         self._cleanup_server(shm_handles)
 
@@ -275,8 +297,9 @@ def test_mixed_raw_shm(self):
         error_msg = []
         shm_handles = self._configure_sever()
         input1_data = np.ones(shape=16, dtype=np.int32)
-        self._basic_inference(shm_handles[0], [input1_data], shm_handles[2],
-                              shm_handles[3], error_msg)
+        self._basic_inference(
+            shm_handles[0], [input1_data], shm_handles[2], shm_handles[3], error_msg
+        )
         if len(error_msg) > 0:
             raise Exception(error_msg[-1])
         self._cleanup_server(shm_handles)
@@ -302,8 +325,8 @@ def test_unregisterall(self):
         self._cleanup_server(shm_handles)
 
 
-if __name__ == '__main__':
-    _protocol = os.environ.get('CLIENT_TYPE', "http")
+if __name__ == "__main__":
+    _protocol = os.environ.get("CLIENT_TYPE", "http")
     if _protocol == "http":
         _url = "localhost:8000"
     else:
diff --git a/qa/L0_cuda_shared_memory/test.sh b/qa/L0_cuda_shared_memory/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_custom_ops/cuda_op_test.py b/qa/L0_custom_ops/cuda_op_test.py
old mode 100644
new mode 100755
index d4389d67ad..896ed2adf0
--- a/qa/L0_custom_ops/cuda_op_test.py
+++ b/qa/L0_custom_ops/cuda_op_test.py
@@ -27,47 +27,50 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
-import numpy as np
 import sys
 from builtins import range
+
+import numpy as np
 import tritongrpcclient as grpcclient
 import tritonhttpclient as httpclient
 from tritonclientutils import np_to_triton_dtype
 
 FLAGS = None
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-v',
-                        '--verbose',
-                        action="store_true",
-                        required=False,
-                        default=False,
-                        help='Enable verbose output')
-    parser.add_argument('-u',
-                        '--url',
-                        type=str,
-                        required=False,
-                        default='localhost:8000',
-                        help='Inference server URL. Default is localhost:8000.')
     parser.add_argument(
-        '-i',
-        '--protocol',
+        "-v",
+        "--verbose",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Enable verbose output",
+    )
+    parser.add_argument(
+        "-u",
+        "--url",
         type=str,
         required=False,
-        default='http',
-        help='Protocol ("http"/"grpc") used to ' +
-        'communicate with inference service. Default is "http".')
-    parser.add_argument('-m',
-                        '--model',
-                        type=str,
-                        required=True,
-                        help='Name of model.')
+        default="localhost:8000",
+        help="Inference server URL. Default is localhost:8000.",
+    )
+    parser.add_argument(
+        "-i",
+        "--protocol",
+        type=str,
+        required=False,
+        default="http",
+        help='Protocol ("http"/"grpc") used to '
+        + 'communicate with inference service. Default is "http".',
+    )
+    parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.")
 
     FLAGS = parser.parse_args()
     if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"):
-        print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format(
-            FLAGS.protocol))
+        print(
+            'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol)
+        )
         exit(1)
 
     client_util = httpclient if FLAGS.protocol == "http" else grpcclient
@@ -84,21 +87,22 @@
     input_data = np.arange(start=42, stop=42 + elements, dtype=np.int32)
 
     inputs = [
-        client_util.InferInput("in", input_data.shape,
-                               np_to_triton_dtype(input_data.dtype))
+        client_util.InferInput(
+            "in", input_data.shape, np_to_triton_dtype(input_data.dtype)
+        )
     ]
     inputs[0].set_data_from_numpy(input_data)
 
     results = client.infer(model_name, inputs)
-    output_data = results.as_numpy('out')
+    output_data = results.as_numpy("out")
     if output_data is None:
         print("error: expected 'out'")
         sys.exit(1)
 
     for i in range(elements):
         print(
-            str(i) + ": input " + str(input_data[i]) + ", output " +
-            str(output_data[i]))
+            str(i) + ": input " + str(input_data[i]) + ", output " + str(output_data[i])
+        )
         if output_data[i] != (input_data[i] + 1):
             print("error: incorrect value")
             sys.exit(1)
diff --git a/qa/L0_custom_ops/mod_op_test.py b/qa/L0_custom_ops/mod_op_test.py
old mode 100644
new mode 100755
index 62edd1e289..14855f7c40
--- a/qa/L0_custom_ops/mod_op_test.py
+++ b/qa/L0_custom_ops/mod_op_test.py
@@ -27,47 +27,50 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
-import numpy as np
 import sys
 from builtins import range
+
+import numpy as np
 import tritongrpcclient as grpcclient
 import tritonhttpclient as httpclient
 from tritonclientutils import np_to_triton_dtype
 
 FLAGS = None
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-v',
-                        '--verbose',
-                        action="store_true",
-                        required=False,
-                        default=False,
-                        help='Enable verbose output')
-    parser.add_argument('-u',
-                        '--url',
-                        type=str,
-                        required=False,
-                        default='localhost:8000',
-                        help='Inference server URL. Default is localhost:8000.')
     parser.add_argument(
-        '-i',
-        '--protocol',
+        "-v",
+        "--verbose",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Enable verbose output",
+    )
+    parser.add_argument(
+        "-u",
+        "--url",
         type=str,
         required=False,
-        default='http',
-        help='Protocol ("http"/"grpc") used to ' +
-        'communicate with inference service. Default is "http".')
-    parser.add_argument('-m',
-                        '--model',
-                        type=str,
-                        required=True,
-                        help='Name of model.')
+        default="localhost:8000",
+        help="Inference server URL. Default is localhost:8000.",
+    )
+    parser.add_argument(
+        "-i",
+        "--protocol",
+        type=str,
+        required=False,
+        default="http",
+        help='Protocol ("http"/"grpc") used to '
+        + 'communicate with inference service. Default is "http".',
+    )
+    parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.")
 
     FLAGS = parser.parse_args()
     if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"):
-        print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format(
-            FLAGS.protocol))
+        print(
+            'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol)
+        )
         exit(1)
 
     client_util = httpclient if FLAGS.protocol == "http" else grpcclient
@@ -87,22 +90,32 @@
     inputs = []
     for i in range(len(input_data)):
         inputs.append(
-            client_util.InferInput("INPUT__{}".format(i), input_data[0].shape,
-                                   np_to_triton_dtype(input_data[0].dtype)))
+            client_util.InferInput(
+                "INPUT__{}".format(i),
+                input_data[0].shape,
+                np_to_triton_dtype(input_data[0].dtype),
+            )
+        )
         inputs[i].set_data_from_numpy(input_data[i])
 
     results = client.infer(model_name, inputs)
 
     # We expect 1 result of size 10 with alternating 1 and 0.
-    output_data = results.as_numpy('OUTPUT__0')
+    output_data = results.as_numpy("OUTPUT__0")
     if output_data is None:
         print("error: expected 'OUTPUT__0'")
         sys.exit(1)
 
     for i in range(elements):
         print(
-            str(i) + ": " + str(input_data[0][i]) + " % " +
-            str(input_data[1][i]) + " = " + str(output_data[i]))
-        if ((input_data[0][i] % input_data[1][i]) != output_data[i]):
+            str(i)
+            + ": "
+            + str(input_data[0][i])
+            + " % "
+            + str(input_data[1][i])
+            + " = "
+            + str(output_data[i])
+        )
+        if (input_data[0][i] % input_data[1][i]) != output_data[i]:
             print("error: incorrect value")
             sys.exit(1)
diff --git a/qa/L0_custom_ops/onnx_op_test.py b/qa/L0_custom_ops/onnx_op_test.py
old mode 100644
new mode 100755
index 6a3d5ebb53..9b246c8e31
--- a/qa/L0_custom_ops/onnx_op_test.py
+++ b/qa/L0_custom_ops/onnx_op_test.py
@@ -27,47 +27,50 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
-import numpy as np
 import sys
 from builtins import range
+
+import numpy as np
 import tritongrpcclient as grpcclient
 import tritonhttpclient as httpclient
 from tritonclientutils import np_to_triton_dtype
 
 FLAGS = None
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-v',
-                        '--verbose',
-                        action="store_true",
-                        required=False,
-                        default=False,
-                        help='Enable verbose output')
-    parser.add_argument('-u',
-                        '--url',
-                        type=str,
-                        required=False,
-                        default='localhost:8000',
-                        help='Inference server URL. Default is localhost:8000.')
     parser.add_argument(
-        '-i',
-        '--protocol',
+        "-v",
+        "--verbose",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Enable verbose output",
+    )
+    parser.add_argument(
+        "-u",
+        "--url",
+        type=str,
+        required=False,
+        default="localhost:8000",
+        help="Inference server URL. Default is localhost:8000.",
+    )
+    parser.add_argument(
+        "-i",
+        "--protocol",
         type=str,
         required=False,
-        default='http',
-        help='Protocol ("http"/"grpc") used to ' +
-        'communicate with inference service. Default is "http".')
-    parser.add_argument('-m',
-                        '--model',
-                        type=str,
-                        required=True,
-                        help='Name of model.')
+        default="http",
+        help='Protocol ("http"/"grpc") used to '
+        + 'communicate with inference service. Default is "http".',
+    )
+    parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.")
 
     FLAGS = parser.parse_args()
     if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"):
-        print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format(
-            FLAGS.protocol))
+        print(
+            'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol)
+        )
         exit(1)
 
     client_util = httpclient if FLAGS.protocol == "http" else grpcclient
@@ -88,14 +91,16 @@
     inputs = []
     for i in range(len(input_data)):
         inputs.append(
-            client_util.InferInput("input_{}".format(i + 1), shape,
-                                   np_to_triton_dtype(dtype)))
+            client_util.InferInput(
+                "input_{}".format(i + 1), shape, np_to_triton_dtype(dtype)
+            )
+        )
         inputs[i].set_data_from_numpy(input_data[i])
 
     results = client.infer(model_name, inputs)
 
     # We expect 1 result of size 10 with alternating 1 and 0.
-    output_data = results.as_numpy('output')
+    output_data = results.as_numpy("output")
     if output_data is None:
         print("error: expected 'output'")
         sys.exit(1)
@@ -103,9 +108,12 @@
     for i in range(3):
         for j in range(5):
             print(
-                str(input_data[0][i][j]) + " + " + str(input_data[1][i][j]) +
-                " = " + str(output_data[i][j]))
-            if ((input_data[0][i][j] + input_data[1][i][j]) !=
-                    output_data[i][j]):
+                str(input_data[0][i][j])
+                + " + "
+                + str(input_data[1][i][j])
+                + " = "
+                + str(output_data[i][j])
+            )
+            if (input_data[0][i][j] + input_data[1][i][j]) != output_data[i][j]:
                 print("error: incorrect value")
                 sys.exit(1)
diff --git a/qa/L0_custom_ops/vision_op_test.py b/qa/L0_custom_ops/vision_op_test.py
old mode 100644
new mode 100755
index c925dc19c0..88857c3d12
--- a/qa/L0_custom_ops/vision_op_test.py
+++ b/qa/L0_custom_ops/vision_op_test.py
@@ -27,46 +27,49 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
-import numpy as np
 import sys
+
+import numpy as np
 import tritonclient.grpc as grpcclient
 import tritonclient.http as httpclient
 from tritonclient.utils import np_to_triton_dtype
 
 FLAGS = None
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-v',
-                        '--verbose',
-                        action="store_true",
-                        required=False,
-                        default=False,
-                        help='Enable verbose output')
-    parser.add_argument('-u',
-                        '--url',
-                        type=str,
-                        required=False,
-                        default='localhost:8000',
-                        help='Inference server URL. Default is localhost:8000.')
     parser.add_argument(
-        '-i',
-        '--protocol',
+        "-v",
+        "--verbose",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Enable verbose output",
+    )
+    parser.add_argument(
+        "-u",
+        "--url",
+        type=str,
+        required=False,
+        default="localhost:8000",
+        help="Inference server URL. Default is localhost:8000.",
+    )
+    parser.add_argument(
+        "-i",
+        "--protocol",
         type=str,
         required=False,
-        default='http',
-        help='Protocol ("http"/"grpc") used to ' +
-        'communicate with inference service. Default is "http".')
-    parser.add_argument('-m',
-                        '--model',
-                        type=str,
-                        required=True,
-                        help='Name of model.')
+        default="http",
+        help='Protocol ("http"/"grpc") used to '
+        + 'communicate with inference service. Default is "http".',
+    )
+    parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.")
 
     FLAGS = parser.parse_args()
     if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"):
-        print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format(
-            FLAGS.protocol))
+        print(
+            'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol)
+        )
         exit(1)
 
     client_util = httpclient if FLAGS.protocol == "http" else grpcclient
@@ -83,23 +86,26 @@
 
     inputs = []
     inputs.append(
-        client_util.InferInput("INPUT__0", input_data.shape,
-                               np_to_triton_dtype(input_data.dtype)))
+        client_util.InferInput(
+            "INPUT__0", input_data.shape, np_to_triton_dtype(input_data.dtype)
+        )
+    )
     inputs[0].set_data_from_numpy(input_data)
     inputs.append(
-        client_util.InferInput("INPUT__1", box_data.shape,
-                               np_to_triton_dtype(box_data.dtype)))
+        client_util.InferInput(
+            "INPUT__1", box_data.shape, np_to_triton_dtype(box_data.dtype)
+        )
+    )
     inputs[1].set_data_from_numpy(box_data)
 
     results = client.infer(model_name, inputs)
 
     # We expect 1 result of shape [1, 3, 5, 5].
-    output_data = results.as_numpy('OUTPUT__0')
+    output_data = results.as_numpy("OUTPUT__0")
     if output_data is None:
         print("error: expected 'OUTPUT__0'")
         sys.exit(1)
 
-    if (output_data.shape != (1, 3, 5, 5)):
-        print("error: incorrect shape " + str(output_data.shape) +
-              "for 'OUTPUT__0'")
+    if output_data.shape != (1, 3, 5, 5):
+        print("error: incorrect shape " + str(output_data.shape) + "for 'OUTPUT__0'")
         sys.exit(1)
diff --git a/qa/L0_custom_ops/zero_out_test.py b/qa/L0_custom_ops/zero_out_test.py
old mode 100644
new mode 100755
index ad87dc8f37..28d5d2c9e6
--- a/qa/L0_custom_ops/zero_out_test.py
+++ b/qa/L0_custom_ops/zero_out_test.py
@@ -27,47 +27,50 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
-import numpy as np
 import sys
 from builtins import range
+
+import numpy as np
 import tritongrpcclient as grpcclient
 import tritonhttpclient as httpclient
 from tritonclientutils import np_to_triton_dtype
 
 FLAGS = None
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-v',
-                        '--verbose',
-                        action="store_true",
-                        required=False,
-                        default=False,
-                        help='Enable verbose output')
-    parser.add_argument('-u',
-                        '--url',
-                        type=str,
-                        required=False,
-                        default='localhost:8000',
-                        help='Inference server URL. Default is localhost:8000.')
     parser.add_argument(
-        '-i',
-        '--protocol',
+        "-v",
+        "--verbose",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Enable verbose output",
+    )
+    parser.add_argument(
+        "-u",
+        "--url",
         type=str,
         required=False,
-        default='http',
-        help='Protocol ("http"/"grpc") used to ' +
-        'communicate with inference service. Default is "http".')
-    parser.add_argument('-m',
-                        '--model',
-                        type=str,
-                        required=True,
-                        help='Name of model.')
+        default="localhost:8000",
+        help="Inference server URL. Default is localhost:8000.",
+    )
+    parser.add_argument(
+        "-i",
+        "--protocol",
+        type=str,
+        required=False,
+        default="http",
+        help='Protocol ("http"/"grpc") used to '
+        + 'communicate with inference service. Default is "http".',
+    )
+    parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.")
 
     FLAGS = parser.parse_args()
     if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"):
-        print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format(
-            FLAGS.protocol))
+        print(
+            'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol)
+        )
         exit(1)
 
     client_util = httpclient if FLAGS.protocol == "http" else grpcclient
@@ -83,8 +86,9 @@
     input_data = np.arange(start=42, stop=42 + elements, dtype=np.int32)
 
     inputs = [
-        client_util.InferInput("to_zero", input_data.shape,
-                               np_to_triton_dtype(input_data.dtype))
+        client_util.InferInput(
+            "to_zero", input_data.shape, np_to_triton_dtype(input_data.dtype)
+        )
     ]
     inputs[0].set_data_from_numpy(input_data)
     results = client.infer(model_name, inputs)
@@ -97,8 +101,8 @@
 
     for i in range(elements):
         print(
-            str(i) + ": input " + str(input_data[i]) + ", output " +
-            str(output_data[i]))
+            str(i) + ": input " + str(input_data[i]) + ", output " + str(output_data[i])
+        )
         if (i == 0) and (input_data[i] != output_data[i]):
             print("error: incorrect value")
             sys.exit(1)
diff --git a/qa/L0_data_compression/test.sh b/qa/L0_data_compression/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_data_compression/validation.py b/qa/L0_data_compression/validation.py
old mode 100644
new mode 100755
index 927c863952..a0e5cb1576
--- a/qa/L0_data_compression/validation.py
+++ b/qa/L0_data_compression/validation.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -29,8 +31,9 @@
 
 def generate_compressed_data():
     with open("raw_data", "rb") as f:
-        import zlib
         import gzip
+        import zlib
+
         raw_data = f.read()
         with open("deflate_compressed_data", "wb") as of:
             of.write(zlib.compress(raw_data))
@@ -40,8 +43,9 @@ def generate_compressed_data():
 
 def validate_compressed_data():
     with open("raw_data", "rb") as f:
-        import zlib
         import gzip
+        import zlib
+
         raw_data = f.read()
         with open("generated_deflate_compressed_data", "rb") as cf:
             decompressed_data = zlib.decompress(cf.read())
@@ -53,5 +57,5 @@ def validate_compressed_data():
                 exit(1)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     globals()[sys.argv[1]]()
diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py
old mode 100644
new mode 100755
index 0ce47e5b80..b78170cf63
--- a/qa/L0_decoupled/decoupled_test.py
+++ b/qa/L0_decoupled/decoupled_test.py
@@ -1,4 +1,6 @@
-# Copyright 2020-2023, NVIDIA CORPORATION. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -25,11 +27,12 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import sys
+
 sys.path.append("../common")
 
 import os
-import time
 import queue
+import time
 import unittest
 from functools import partial
 
@@ -41,7 +44,6 @@
 
 
 class UserData:
-
     def __init__(self):
         self._response_queue = queue.Queue()
 
@@ -54,23 +56,25 @@ def callback(user_data, result, error):
 
 
 class DecoupledTest(tu.TestResultCollector):
-
     def setUp(self):
-        self.trials_ = [("repeat_int32", None), ("simple_repeat", None),
-                        ("sequence_repeat", None),
-                        ("fan_repeat", self._fan_validate),
-                        ("repeat_square", self._nested_validate),
-                        ("nested_square", self._nested_validate)]
+        self.trials_ = [
+            ("repeat_int32", None),
+            ("simple_repeat", None),
+            ("sequence_repeat", None),
+            ("fan_repeat", self._fan_validate),
+            ("repeat_square", self._nested_validate),
+            ("nested_square", self._nested_validate),
+        ]
         self.model_name_ = "repeat_int32"
 
         self.inputs_ = []
-        self.inputs_.append(grpcclient.InferInput('IN', [1], "INT32"))
-        self.inputs_.append(grpcclient.InferInput('DELAY', [1], "UINT32"))
-        self.inputs_.append(grpcclient.InferInput('WAIT', [1], "UINT32"))
+        self.inputs_.append(grpcclient.InferInput("IN", [1], "INT32"))
+        self.inputs_.append(grpcclient.InferInput("DELAY", [1], "UINT32"))
+        self.inputs_.append(grpcclient.InferInput("WAIT", [1], "UINT32"))
 
         self.outputs_ = []
-        self.outputs_.append(grpcclient.InferRequestedOutput('OUT'))
-        self.outputs_.append(grpcclient.InferRequestedOutput('IDX'))
+        self.outputs_.append(grpcclient.InferRequestedOutput("OUT"))
+        self.outputs_.append(grpcclient.InferRequestedOutput("IDX"))
         # Some trials only expect a subset of outputs
         self.requested_outputs_ = self.outputs_
 
@@ -95,14 +99,22 @@ def setUp(self):
     # If the decoupled backend/model always sends the final response flag along
     # with a non-null response, no opt-in is needed.
     #
-    # With this behavior, the client can programatically detect when all responses
+    # With this behavior, the client can programmatically detect when all responses
     # for an individual request have been received without knowing the expected
     # number of responses in advance and without closing the stream.
-    def _stream_infer_with_params(self, request_count, request_delay, _,
-                                  delay_data, delay_factor, user_data,
-                                  result_dict):
-        with grpcclient.InferenceServerClient(url="localhost:8001",
-                                              verbose=True) as triton_client:
+    def _stream_infer_with_params(
+        self,
+        request_count,
+        request_delay,
+        _,
+        delay_data,
+        delay_factor,
+        user_data,
+        result_dict,
+    ):
+        with grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        ) as triton_client:
             # Establish stream
             triton_client.start_stream(callback=partial(callback, user_data))
             # Send the specified number of requests in parallel
@@ -116,7 +128,8 @@ def _stream_infer_with_params(self, request_count, request_delay, _,
                     outputs=self.requested_outputs_,
                     # Opt-in to receiving flags-only responses from model/backend
                     # to help detect final responses for decoupled models.
-                    enable_empty_final_response=True)
+                    enable_empty_final_response=True,
+                )
                 # Update delay input in accordance with the scaling factor
                 delay_data = delay_data * delay_factor
                 delay_data = delay_data.astype(np.uint32)
@@ -134,11 +147,11 @@ def _stream_infer_with_params(self, request_count, request_delay, _,
                     # to associate decoupled responses with their requests.
                     if not response.id:
                         raise ValueError(
-                            "No response id found. Was a request_id provided?")
+                            "No response id found. Was a request_id provided?"
+                        )
 
                     # Detect final response. Parameters are oneof and we expect bool_param
-                    if response.parameters.get(
-                            "triton_final_response").bool_param:
+                    if response.parameters.get("triton_final_response").bool_param:
                         completed_requests += 1
 
                     # Only process non-empty response, ignore if empty (no outputs)
@@ -148,10 +161,19 @@ def _stream_infer_with_params(self, request_count, request_delay, _,
                         result_dict[response.id].append((recv_count, data_item))
                         recv_count += 1
 
-    def _stream_infer(self, request_count, request_delay, expected_count,
-                      delay_data, delay_factor, user_data, result_dict):
-        with grpcclient.InferenceServerClient(url="localhost:8001",
-                                              verbose=True) as triton_client:
+    def _stream_infer(
+        self,
+        request_count,
+        request_delay,
+        expected_count,
+        delay_data,
+        delay_factor,
+        user_data,
+        result_dict,
+    ):
+        with grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        ) as triton_client:
             # Establish stream
             triton_client.start_stream(callback=partial(callback, user_data))
             # Send the specified number of requests in parallel
@@ -162,7 +184,8 @@ def _stream_infer(self, request_count, request_delay, expected_count,
                     model_name=self.model_name_,
                     inputs=self.inputs_,
                     request_id=str(i),
-                    outputs=self.requested_outputs_)
+                    outputs=self.requested_outputs_,
+                )
                 # Update delay input in accordance with the scaling factor
                 delay_data = delay_data * delay_factor
                 delay_data = delay_data.astype(np.uint32)
@@ -186,7 +209,7 @@ def _fan_validate(self, result_list, data_offset, repeat_count):
         self.assertEqual(len(result_list), repeat_count)
         expected_data = 2 * data_offset
         for j in range(len(result_list)):
-            this_data = result_list[j][1].as_numpy('OUT')
+            this_data = result_list[j][1].as_numpy("OUT")
             self.assertEqual(len(this_data), 1)
             self.assertEqual(this_data[0], expected_data)
             expected_data += 2
@@ -194,13 +217,12 @@ def _fan_validate(self, result_list, data_offset, repeat_count):
     def _nested_validate(self, result_list, data_offset, repeat_count):
         # if repeat model returns repeat result n, repeat_square-like model
         # will return the same result n times
-        expected_len = sum(
-            x for x in range(data_offset, data_offset + repeat_count))
+        expected_len = sum(x for x in range(data_offset, data_offset + repeat_count))
         self.assertEqual(len(result_list), expected_len)
         expected_data = data_offset
         expected_count = expected_data
         for j in range(len(result_list)):
-            this_data = result_list[j][1].as_numpy('OUT')
+            this_data = result_list[j][1].as_numpy("OUT")
             self.assertEqual(len(this_data), 1)
             self.assertEqual(this_data[0], expected_data)
             expected_count -= 1
@@ -208,20 +230,22 @@ def _nested_validate(self, result_list, data_offset, repeat_count):
                 expected_data += 1
                 expected_count = expected_data
 
-    def _decoupled_infer(self,
-                         request_count,
-                         request_delay=0,
-                         repeat_count=1,
-                         data_offset=100,
-                         delay_time=1000,
-                         delay_factor=1,
-                         wait_time=500,
-                         order_sequence=None,
-                         validate_fn=None):
+    def _decoupled_infer(
+        self,
+        request_count,
+        request_delay=0,
+        repeat_count=1,
+        data_offset=100,
+        delay_time=1000,
+        delay_factor=1,
+        wait_time=500,
+        order_sequence=None,
+        validate_fn=None,
+    ):
         # Initialize data for IN
-        input_data = np.arange(start=data_offset,
-                               stop=data_offset + repeat_count,
-                               dtype=np.int32)
+        input_data = np.arange(
+            start=data_offset, stop=data_offset + repeat_count, dtype=np.int32
+        )
         self.inputs_[0].set_shape([repeat_count])
         self.inputs_[0].set_data_from_numpy(input_data)
 
@@ -234,24 +258,31 @@ def _decoupled_infer(self,
         self.inputs_[2].set_data_from_numpy(wait_data)
 
         # use validate_fn to differentiate requested outputs
-        self.requested_outputs_ = self.outputs_ if validate_fn is None else self.outputs_[
-            0:1]
+        self.requested_outputs_ = (
+            self.outputs_ if validate_fn is None else self.outputs_[0:1]
+        )
 
-        for infer_helper in [
-                self._stream_infer, self._stream_infer_with_params
-        ]:
+        for infer_helper in [self._stream_infer, self._stream_infer_with_params]:
             user_data = UserData()
             result_dict = {}
 
             try:
                 if "square" not in self.model_name_:
-                    expected_count = (repeat_count * request_count)
+                    expected_count = repeat_count * request_count
                 else:
-                    expected_count = sum(
-                        x for x in range(data_offset, data_offset +
-                                         repeat_count)) * request_count
-                infer_helper(request_count, request_delay, expected_count,
-                             delay_data, delay_factor, user_data, result_dict)
+                    expected_count = (
+                        sum(x for x in range(data_offset, data_offset + repeat_count))
+                        * request_count
+                    )
+                infer_helper(
+                    request_count,
+                    request_delay,
+                    expected_count,
+                    delay_data,
+                    delay_factor,
+                    user_data,
+                    result_dict,
+                )
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -260,33 +291,34 @@ def _decoupled_infer(self,
                 this_id = str(i)
                 if repeat_count != 0 and this_id not in result_dict.keys():
                     self.assertTrue(
-                        False, "response for request id {} not received".format(
-                            this_id))
+                        False, "response for request id {} not received".format(this_id)
+                    )
                 elif repeat_count == 0 and this_id in result_dict.keys():
                     self.assertTrue(
                         False,
                         "received unexpected response for request id {}".format(
-                            this_id))
+                            this_id
+                        ),
+                    )
                 if repeat_count != 0:
                     if validate_fn is None:
-                        self.assertEqual(len(result_dict[this_id]),
-                                         repeat_count)
+                        self.assertEqual(len(result_dict[this_id]), repeat_count)
                         expected_data = data_offset
                         result_list = result_dict[this_id]
                         for j in range(len(result_list)):
                             if order_sequence is not None:
-                                self.assertEqual(result_list[j][0],
-                                                 order_sequence[i][j])
-                            this_data = result_list[j][1].as_numpy('OUT')
+                                self.assertEqual(
+                                    result_list[j][0], order_sequence[i][j]
+                                )
+                            this_data = result_list[j][1].as_numpy("OUT")
                             self.assertEqual(len(this_data), 1)
                             self.assertEqual(this_data[0], expected_data)
-                            this_idx = result_list[j][1].as_numpy('IDX')
+                            this_idx = result_list[j][1].as_numpy("IDX")
                             self.assertEqual(len(this_idx), 1)
                             self.assertEqual(this_idx[0], j)
                             expected_data += 1
                     else:
-                        validate_fn(result_dict[this_id], data_offset,
-                                    repeat_count)
+                        validate_fn(result_dict[this_id], data_offset, repeat_count)
 
     def test_one_to_none(self):
         # Test cases where each request generates no response.
@@ -296,13 +328,9 @@ def test_one_to_none(self):
         for trial in self.trials_:
             self.model_name_ = trial[0]
             # Single request case
-            self._decoupled_infer(request_count=1,
-                                  repeat_count=0,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(request_count=1, repeat_count=0, validate_fn=trial[1])
             # Multiple request case
-            self._decoupled_infer(request_count=5,
-                                  repeat_count=0,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(request_count=5, repeat_count=0, validate_fn=trial[1])
 
     def test_one_to_one(self):
         # Test cases where each request generates single response.
@@ -313,23 +341,15 @@ def test_one_to_one(self):
             self.model_name_ = trial[0]
             # Single request case
             # Release request before the response is delivered
-            self._decoupled_infer(request_count=1,
-                                  wait_time=500,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(request_count=1, wait_time=500, validate_fn=trial[1])
             # Release request after the response is delivered
-            self._decoupled_infer(request_count=1,
-                                  wait_time=2000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(request_count=1, wait_time=2000, validate_fn=trial[1])
 
             # Multiple request case
             # Release request before the response is delivered
-            self._decoupled_infer(request_count=5,
-                                  wait_time=500,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(request_count=5, wait_time=500, validate_fn=trial[1])
             # Release request after the response is delivered
-            self._decoupled_infer(request_count=5,
-                                  wait_time=2000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(request_count=5, wait_time=2000, validate_fn=trial[1])
 
     def test_one_to_many(self):
         # Test cases where each request generates multiple responses.
@@ -342,37 +362,31 @@ def test_one_to_many(self):
             self.model_name_ = trial[0]
             # Single request case
             # Release request before the first response is delivered
-            self._decoupled_infer(request_count=1,
-                                  repeat_count=5,
-                                  wait_time=500,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=1, repeat_count=5, wait_time=500, validate_fn=trial[1]
+            )
             # Release request when the responses are getting delivered
-            self._decoupled_infer(request_count=1,
-                                  repeat_count=5,
-                                  wait_time=2000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=1, repeat_count=5, wait_time=2000, validate_fn=trial[1]
+            )
             # Release request after all the responses are delivered
-            self._decoupled_infer(request_count=1,
-                                  repeat_count=5,
-                                  wait_time=10000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=1, repeat_count=5, wait_time=10000, validate_fn=trial[1]
+            )
 
             # Multiple request case
             # Release request before the first response is delivered
-            self._decoupled_infer(request_count=5,
-                                  repeat_count=5,
-                                  wait_time=500,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=5, repeat_count=5, wait_time=500, validate_fn=trial[1]
+            )
             # Release request when the responses are getting delivered
-            self._decoupled_infer(request_count=5,
-                                  repeat_count=5,
-                                  wait_time=2000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=5, repeat_count=5, wait_time=2000, validate_fn=trial[1]
+            )
             # Release request after all the responses are delivered
-            self._decoupled_infer(request_count=5,
-                                  repeat_count=5,
-                                  wait_time=10000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=5, repeat_count=5, wait_time=10000, validate_fn=trial[1]
+            )
 
     def test_one_to_multi_many(self):
         # Test cases where each request generates multiple responses but the
@@ -385,37 +399,31 @@ def test_one_to_multi_many(self):
             self.model_name_ = trial[0]
             # Single request case
             # Release request before the first response is delivered
-            self._decoupled_infer(request_count=1,
-                                  repeat_count=5,
-                                  wait_time=500,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=1, repeat_count=5, wait_time=500, validate_fn=trial[1]
+            )
             # Release request when the responses are getting delivered
-            self._decoupled_infer(request_count=1,
-                                  repeat_count=5,
-                                  wait_time=8000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=1, repeat_count=5, wait_time=8000, validate_fn=trial[1]
+            )
             # Release request after all the responses are delivered
-            self._decoupled_infer(request_count=1,
-                                  repeat_count=5,
-                                  wait_time=20000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=1, repeat_count=5, wait_time=20000, validate_fn=trial[1]
+            )
 
             # Multiple request case
             # Release request before the first response is delivered
-            self._decoupled_infer(request_count=5,
-                                  repeat_count=5,
-                                  wait_time=500,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=5, repeat_count=5, wait_time=500, validate_fn=trial[1]
+            )
             # Release request when the responses are getting delivered
-            self._decoupled_infer(request_count=5,
-                                  repeat_count=5,
-                                  wait_time=3000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=5, repeat_count=5, wait_time=3000, validate_fn=trial[1]
+            )
             # Release request after all the responses are delivered
-            self._decoupled_infer(request_count=5,
-                                  repeat_count=5,
-                                  wait_time=10000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=5, repeat_count=5, wait_time=10000, validate_fn=trial[1]
+            )
 
     def test_response_order(self):
         # Test the expected response order for different cases
@@ -426,51 +434,61 @@ def test_response_order(self):
             self.model_name_ = trial[0]
 
             # Case 1: Interleaved responses
-            self._decoupled_infer(request_count=2,
-                                  request_delay=500,
-                                  repeat_count=4,
-                                  order_sequence=[[0, 2, 4, 6], [1, 3, 5, 7]],
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=2,
+                request_delay=500,
+                repeat_count=4,
+                order_sequence=[[0, 2, 4, 6], [1, 3, 5, 7]],
+                validate_fn=trial[1],
+            )
 
             # Case 2: All responses of second request delivered before any
             # response from the first
-            self._decoupled_infer(request_count=2,
-                                  request_delay=500,
-                                  repeat_count=4,
-                                  delay_time=2000,
-                                  delay_factor=0.1,
-                                  order_sequence=[[4, 5, 6, 7], [0, 1, 2, 3]],
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=2,
+                request_delay=500,
+                repeat_count=4,
+                delay_time=2000,
+                delay_factor=0.1,
+                order_sequence=[[4, 5, 6, 7], [0, 1, 2, 3]],
+                validate_fn=trial[1],
+            )
 
             # Case 3: Similar to Case 2, but the second request is generated
             # after the first response from first request is received
-            self._decoupled_infer(request_count=2,
-                                  request_delay=2500,
-                                  repeat_count=4,
-                                  delay_time=2000,
-                                  delay_factor=0.1,
-                                  order_sequence=[[0, 5, 6, 7], [1, 2, 3, 4]],
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=2,
+                request_delay=2500,
+                repeat_count=4,
+                delay_time=2000,
+                delay_factor=0.1,
+                order_sequence=[[0, 5, 6, 7], [1, 2, 3, 4]],
+                validate_fn=trial[1],
+            )
 
             # Case 4: All the responses of second requests are delivered after
             # all the responses from first requests are received
-            self._decoupled_infer(request_count=2,
-                                  request_delay=100,
-                                  repeat_count=4,
-                                  delay_time=500,
-                                  delay_factor=10,
-                                  order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]],
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=2,
+                request_delay=100,
+                repeat_count=4,
+                delay_time=500,
+                delay_factor=10,
+                order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]],
+                validate_fn=trial[1],
+            )
 
             # Case 5: Similar to Case 4, but the second request is generated
             # after the first response from the first request is received
-            self._decoupled_infer(request_count=2,
-                                  request_delay=750,
-                                  repeat_count=4,
-                                  delay_time=500,
-                                  delay_factor=10,
-                                  order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]],
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=2,
+                request_delay=750,
+                repeat_count=4,
+                delay_time=500,
+                delay_factor=10,
+                order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]],
+                validate_fn=trial[1],
+            )
 
     def _no_streaming_helper(self, protocol):
         data_offset = 100
@@ -478,9 +496,9 @@ def _no_streaming_helper(self, protocol):
         delay_time = 1000
         wait_time = 2000
 
-        input_data = np.arange(start=data_offset,
-                               stop=data_offset + repeat_count,
-                               dtype=np.int32)
+        input_data = np.arange(
+            start=data_offset, stop=data_offset + repeat_count, dtype=np.int32
+        )
         delay_data = (np.ones([repeat_count], dtype=np.uint32)) * delay_time
         wait_data = np.array([wait_time], dtype=np.uint32)
 
@@ -490,12 +508,11 @@ def _no_streaming_helper(self, protocol):
             this_outputs = self.outputs_
         else:
             this_inputs = []
-            this_inputs.append(
-                httpclient.InferInput('IN', [repeat_count], "INT32"))
-            this_inputs.append(httpclient.InferInput('DELAY', [1], "UINT32"))
-            this_inputs.append(httpclient.InferInput('WAIT', [1], "UINT32"))
+            this_inputs.append(httpclient.InferInput("IN", [repeat_count], "INT32"))
+            this_inputs.append(httpclient.InferInput("DELAY", [1], "UINT32"))
+            this_inputs.append(httpclient.InferInput("WAIT", [1], "UINT32"))
             this_outputs = []
-            this_outputs.append(httpclient.InferRequestedOutput('OUT'))
+            this_outputs.append(httpclient.InferRequestedOutput("OUT"))
 
         # Initialize data for IN
         this_inputs[0].set_shape([repeat_count])
@@ -510,19 +527,22 @@ def _no_streaming_helper(self, protocol):
 
         if protocol == "grpc":
             triton_client = grpcclient.InferenceServerClient(
-                url="localhost:8001", verbose=True)
+                url="localhost:8001", verbose=True
+            )
         else:
             triton_client = httpclient.InferenceServerClient(
-                url="localhost:8000", verbose=True)
+                url="localhost:8000", verbose=True
+            )
 
         with self.assertRaises(InferenceServerException) as cm:
-            triton_client.infer(model_name=self.model_name_,
-                                inputs=this_inputs,
-                                outputs=this_outputs)
+            triton_client.infer(
+                model_name=self.model_name_, inputs=this_inputs, outputs=this_outputs
+            )
 
         self.assertIn(
             "doesn't support models with decoupled transaction policy",
-            str(cm.exception))
+            str(cm.exception),
+        )
 
     def test_no_streaming(self):
         # Test cases with no streaming inference. Server should give
@@ -541,9 +561,9 @@ def test_wrong_shape(self):
         delay_time = 1000
         wait_time = 2000
 
-        input_data = np.arange(start=data_offset,
-                               stop=data_offset + repeat_count,
-                               dtype=np.int32)
+        input_data = np.arange(
+            start=data_offset, stop=data_offset + repeat_count, dtype=np.int32
+        )
         delay_data = (np.ones([repeat_count + 1], dtype=np.uint32)) * delay_time
         wait_data = np.array([wait_time], dtype=np.uint32)
 
@@ -562,12 +582,14 @@ def test_wrong_shape(self):
         result_dict = {}
 
         with self.assertRaises(InferenceServerException) as cm:
-            self._stream_infer(1, 0, repeat_count, delay_data, 1, user_data,
-                               result_dict)
+            self._stream_infer(
+                1, 0, repeat_count, delay_data, 1, user_data, result_dict
+            )
 
-        self.assertIn("expected IN and DELAY shape to match, got [1] and [2]",
-                      str(cm.exception))
+        self.assertIn(
+            "expected IN and DELAY shape to match, got [1] and [2]", str(cm.exception)
+        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_decoupled/test.sh b/qa/L0_decoupled/test.sh
old mode 100644
new mode 100755
index 8fb5841997..90bb913b6c
--- a/qa/L0_decoupled/test.sh
+++ b/qa/L0_decoupled/test.sh
@@ -74,7 +74,7 @@ for trial in $TRIALS; do
       cat $SERVER_LOG
       exit 1
   fi
-  
+
   for i in \
               test_one_to_none \
               test_one_to_one \
@@ -82,7 +82,7 @@ for trial in $TRIALS; do
               test_no_streaming \
               test_response_order \
 	      test_wrong_shape; do
-  
+
       echo "Test: $i" >>$CLIENT_LOG
       set +e
       python $DECOUPLED_TEST DecoupledTest.$i >>$CLIENT_LOG 2>&1
@@ -100,11 +100,11 @@ for trial in $TRIALS; do
       fi
       set -e
   done
-  
+
  # Will delay the writing of each response by the specified number of milliseconds.
   # This will ensure that there are multiple responses available to be written.
   export TRITONSERVER_DELAY_GRPC_RESPONSE=2000
-  
+
   echo "Test: test_one_to_multi_many" >>$CLIENT_LOG
   set +e
   python $DECOUPLED_TEST DecoupledTest.test_one_to_multi_many >>$CLIENT_LOG 2>&1
@@ -120,18 +120,18 @@ for trial in $TRIALS; do
           RET=1
       fi
   fi
-  
+
   set -e
-  
+
   unset TRITONSERVER_DELAY_GRPC_RESPONSE
-  
+
   kill $SERVER_PID
   wait $SERVER_PID
 done
 
 if [ $RET -eq 0 ]; then
   echo -e "\n***\n*** Test Passed\n***"
-else 
+else
   echo -e "\n***\n*** Test Failed\n***"
 fi
 
diff --git a/qa/L0_device_memory_tracker/test.py b/qa/L0_device_memory_tracker/test.py
old mode 100644
new mode 100755
index 0265f043d5..1d443d1032
--- a/qa/L0_device_memory_tracker/test.py
+++ b/qa/L0_device_memory_tracker/test.py
@@ -25,18 +25,16 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import unittest
 import time
+import unittest
 from functools import partial
 
-import tritonclient.http as httpclient
-import tritonclient.grpc as grpcclient
-
 import nvidia_smi
+import tritonclient.grpc as grpcclient
+import tritonclient.http as httpclient
 
 
 class UnifiedClientProxy:
-
     def __init__(self, client):
         self.client_ = client
 
@@ -45,21 +43,19 @@ def __getattr__(self, attr):
         if type(self.client_) == grpcclient.InferenceServerClient:
             if attr == "get_model_config":
                 return lambda *args, **kwargs: forward_attr(
-                    *args, **kwargs, as_json=True)["config"]
+                    *args, **kwargs, as_json=True
+                )["config"]
             elif attr == "get_inference_statistics":
                 return partial(forward_attr, as_json=True)
         return forward_attr
 
 
 class MemoryUsageTest(unittest.TestCase):
-
     def setUp(self):
         nvidia_smi.nvmlInit()
         self.gpu_handle_ = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
-        self.http_client_ = httpclient.InferenceServerClient(
-            url="localhost:8000")
-        self.grpc_client_ = grpcclient.InferenceServerClient(
-            url="localhost:8001")
+        self.http_client_ = httpclient.InferenceServerClient(url="localhost:8000")
+        self.grpc_client_ = grpcclient.InferenceServerClient(url="localhost:8001")
 
     def tearDown(self):
         nvidia_smi.nvmlShutdown()
@@ -69,8 +65,7 @@ def report_used_gpu_memory(self):
         return info.used
 
     def is_testing_backend(self, model_name, backend_name):
-        return self.client_.get_model_config(
-            model_name)["backend"] == backend_name
+        return self.client_.get_model_config(model_name)["backend"] == backend_name
 
     def verify_recorded_usage(self, model_stat):
         recorded_gpu_usage = 0
@@ -87,10 +82,13 @@ def verify_recorded_usage(self, model_stat):
         # check with tolerance as gpu usage obtained is overall usage
         self.assertTrue(
             usage_delta * 0.9 <= recorded_gpu_usage <= usage_delta * 1.1,
-            msg=
-            "For model {}, expect recorded usage to be in range [{}, {}], got {}"
-            .format(model_stat["name"], usage_delta * 0.9, usage_delta * 1.1,
-                    recorded_gpu_usage))
+            msg="For model {}, expect recorded usage to be in range [{}, {}], got {}".format(
+                model_stat["name"],
+                usage_delta * 0.9,
+                usage_delta * 1.1,
+                recorded_gpu_usage,
+            ),
+        )
 
     def test_onnx_http(self):
         self.client_ = UnifiedClientProxy(self.http_client_)
diff --git a/qa/L0_device_memory_tracker/test.sh b/qa/L0_device_memory_tracker/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_dlpack_multi_gpu/test.sh b/qa/L0_dlpack_multi_gpu/test.sh
old mode 100644
new mode 100755
index af528a6667..2485bfdb88
--- a/qa/L0_dlpack_multi_gpu/test.sh
+++ b/qa/L0_dlpack_multi_gpu/test.sh
@@ -64,7 +64,7 @@ fi
 
 set +e
 export MODEL_NAME="dlpack_test"
-python3 $CLIENT_PY > $CLIENT_LOG 2>&1 
+python3 $CLIENT_PY > $CLIENT_LOG 2>&1
 
 if [ $? -ne 0 ]; then
     echo -e "\n***\n*** python_unittest.py FAILED. \n***"
diff --git a/qa/L0_doc_links/test.sh b/qa/L0_doc_links/test.sh
old mode 100644
new mode 100755
index 730adee917..be7d291b01
--- a/qa/L0_doc_links/test.sh
+++ b/qa/L0_doc_links/test.sh
@@ -1,4 +1,5 @@
-# Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#!/bin/bash
+# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
diff --git a/qa/L0_dyna_implicit_state/test.sh b/qa/L0_dyna_implicit_state/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_dyna_sequence_batcher/dyna_sequence_batcher_test.py b/qa/L0_dyna_sequence_batcher/dyna_sequence_batcher_test.py
old mode 100644
new mode 100755
index 6fff86948c..f2c709469b
--- a/qa/L0_dyna_sequence_batcher/dyna_sequence_batcher_test.py
+++ b/qa/L0_dyna_sequence_batcher/dyna_sequence_batcher_test.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -28,57 +30,55 @@
 
 sys.path.append("../common")
 
-from builtins import str
 import os
-import time
 import threading
+import time
 import unittest
+from builtins import str
+
 import numpy as np
-import test_util as tu
 import sequence_util as su
+import test_util as tu
 
-_test_system_shared_memory = bool(
-    int(os.environ.get('TEST_SYSTEM_SHARED_MEMORY', 0)))
-_test_cuda_shared_memory = bool(
-    int(os.environ.get('TEST_CUDA_SHARED_MEMORY', 0)))
+_test_system_shared_memory = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0)))
+_test_cuda_shared_memory = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0)))
 
-NO_BATCHING = (int(os.environ.get('NO_BATCHING', 0)) == 1)
+NO_BATCHING = int(os.environ.get("NO_BATCHING", 0)) == 1
 BACKENDS = os.environ.get(
-    'BACKENDS', "graphdef savedmodel libtorch onnx plan custom custom_string")
-IMPLICIT_STATE = (int(os.environ['IMPLICIT_STATE']) == 1)
+    "BACKENDS", "graphdef savedmodel libtorch onnx plan custom custom_string"
+)
+IMPLICIT_STATE = int(os.environ["IMPLICIT_STATE"]) == 1
 
-_trials = BACKENDS.split(' ')
+_trials = BACKENDS.split(" ")
 for backend in BACKENDS.split(" "):
     if NO_BATCHING:
-        if (backend != 'custom') and (backend != 'custom_string'):
+        if (backend != "custom") and (backend != "custom_string"):
             _trials += (backend + "_nobatch",)
 
 _ragged_batch_supported_trials = []
-if 'custom' in BACKENDS.split(' '):
-    _ragged_batch_supported_trials.append('custom')
+if "custom" in BACKENDS.split(" "):
+    _ragged_batch_supported_trials.append("custom")
 
 _protocols = ("http", "grpc")
 _max_sequence_idle_ms = 5000
 
 
 class DynaSequenceBatcherTest(su.SequenceBatcherTestUtil):
-
     def get_datatype(self, trial):
         return np.int32
 
-    def get_expected_result(self,
-                            expected_result,
-                            corrid,
-                            value,
-                            trial,
-                            flag_str=None):
+    def get_expected_result(self, expected_result, corrid, value, trial, flag_str=None):
         # Adjust the expected_result for models that
-        # couldn't implement the full accumulator. See
+        # could not implement the full accumulator. See
         # qa/common/gen_qa_dyna_sequence_models.py for more
         # information.
-        if ((("nobatch" not in trial) and ("custom" not in trial)) or \
-            ("graphdef" in trial) or ("plan" in trial) or ("onnx" in trial) or \
-            ("libtorch" in trial)):
+        if (
+            (("nobatch" not in trial) and ("custom" not in trial))
+            or ("graphdef" in trial)
+            or ("plan" in trial)
+            or ("onnx" in trial)
+            or ("libtorch" in trial)
+        ):
             expected_result = value
             if flag_str is not None:
                 if "start" in flag_str:
@@ -90,12 +90,9 @@ def get_expected_result(self,
                         expected_result += corrid
         return expected_result
 
-    def get_expected_result_implicit(self,
-                                     expected_result,
-                                     corrid,
-                                     value,
-                                     trial,
-                                     flag_str=None):
+    def get_expected_result_implicit(
+        self, expected_result, corrid, value, trial, flag_str=None
+    ):
         return expected_result
 
     def test_simple_sequence(self):
@@ -111,18 +108,22 @@ def test_simple_sequence(self):
 
                     self.check_setup(model_name)
                     self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ)
-                    self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER",
-                                     os.environ)
+                    self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ)
 
                     if "string" in trial:
-                        corrid = '52'
+                        corrid = "52"
                     else:
                         corrid = 52
 
-                    expected_result = self.get_expected_result(
-                        45 + int(corrid), corrid, 9, trial, "end"
-                    ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                        45, corrid, 9, trial, "end")
+                    expected_result = (
+                        self.get_expected_result(
+                            45 + int(corrid), corrid, 9, trial, "end"
+                        )
+                        if not IMPLICIT_STATE
+                        else self.get_expected_result_implicit(
+                            45, corrid, 9, trial, "end"
+                        )
+                    )
 
                     self.check_sequence(
                         trial,
@@ -131,19 +132,26 @@ def test_simple_sequence(self):
                         corrid,
                         (4000, None),
                         # (flag_str, value, (ls_ms, gt_ms), (pre_delay, post_delay))
-                        (("start", 1, None, None), (None, 2, None, None),
-                         (None, 3, None, None), (None, 4, None, None),
-                         (None, 5, None, None), (None, 6, None, None),
-                         (None, 7, None, None), (None, 8, None, None),
-                         ("end", 9, None, None)),
+                        (
+                            ("start", 1, None, None),
+                            (None, 2, None, None),
+                            (None, 3, None, None),
+                            (None, 4, None, None),
+                            (None, 5, None, None),
+                            (None, 6, None, None),
+                            (None, 7, None, None),
+                            (None, 8, None, None),
+                            ("end", 9, None, None),
+                        ),
                         expected_result,
                         protocol,
-                        sequence_name="{}_{}".format(self._testMethodName,
-                                                     protocol))
+                        sequence_name="{}_{}".format(self._testMethodName, protocol),
+                    )
 
                     self.check_deferred_exception()
-                    self.check_status(model_name, {1: 9 * (idx + 1)},
-                                      9 * (idx + 1), 9 * (idx + 1))
+                    self.check_status(
+                        model_name, {1: 9 * (idx + 1)}, 9 * (idx + 1), 9 * (idx + 1)
+                    )
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -160,18 +168,22 @@ def test_length1_sequence(self):
 
                     self.check_setup(model_name)
                     self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ)
-                    self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER",
-                                     os.environ)
+                    self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ)
 
                     if "string" in trial:
-                        corrid = '99'
+                        corrid = "99"
                     else:
                         corrid = 99
 
-                    expected_result = self.get_expected_result(
-                        42 + int(corrid), corrid, 42, trial, "start,end"
-                    ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                        42, corrid, 42, trial, "start,end")
+                    expected_result = (
+                        self.get_expected_result(
+                            42 + int(corrid), corrid, 42, trial, "start,end"
+                        )
+                        if not IMPLICIT_STATE
+                        else self.get_expected_result_implicit(
+                            42, corrid, 42, trial, "start,end"
+                        )
+                    )
 
                     self.check_sequence(
                         trial,
@@ -180,50 +192,60 @@ def test_length1_sequence(self):
                         corrid,
                         (4000, None),
                         # (flag_str, value, (ls_ms, gt_ms), (pre_delay, post_delay))
-                        (
-                            ("start,end", 42, None, None),),
+                        (("start,end", 42, None, None),),
                         expected_result,
                         protocol,
-                        sequence_name="{}_{}".format(self._testMethodName,
-                                                     protocol))
+                        sequence_name="{}_{}".format(self._testMethodName, protocol),
+                    )
 
                     self.check_deferred_exception()
-                    self.check_status(model_name, {1: (idx + 1)}, (idx + 1),
-                                      (idx + 1))
+                    self.check_status(model_name, {1: (idx + 1)}, (idx + 1), (idx + 1))
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
-    def _multi_sequence_impl(self, trials, expected_batch_exec,
-                             expected_exec_cnt, sleep_secs, tensor_shapes):
+    def _multi_sequence_impl(
+        self, trials, expected_batch_exec, expected_exec_cnt, sleep_secs, tensor_shapes
+    ):
         for trial in trials:
             self.clear_deferred_exceptions()
             dtype = self.get_datatype(trial)
             precreated_shm0_handles = self.precreate_register_regions(
-                (1, 3), dtype, 0, tensor_shape=(tensor_shapes[0],))
+                (1, 3), dtype, 0, tensor_shape=(tensor_shapes[0],)
+            )
             precreated_shm1_handles = self.precreate_register_regions(
-                (11, 12, 13), dtype, 1, tensor_shape=(tensor_shapes[1],))
+                (11, 12, 13), dtype, 1, tensor_shape=(tensor_shapes[1],)
+            )
             precreated_shm2_handles = self.precreate_register_regions(
-                (111, 112, 113), dtype, 2, tensor_shape=(tensor_shapes[2],))
+                (111, 112, 113), dtype, 2, tensor_shape=(tensor_shapes[2],)
+            )
             precreated_shm3_handles = self.precreate_register_regions(
-                (1111, 1112, 1113), dtype, 3, tensor_shape=(tensor_shapes[3],))
+                (1111, 1112, 1113), dtype, 3, tensor_shape=(tensor_shapes[3],)
+            )
             try:
                 model_name = tu.get_dyna_sequence_model_name(trial, dtype)
 
                 self.check_setup(model_name)
                 self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ)
-                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER",
-                                 os.environ)
+                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ)
 
                 if "string" in trial:
-                    corrids = ['1001', '1002', '1003', '1004']
+                    corrids = ["1001", "1002", "1003", "1004"]
                 else:
                     corrids = [1001, 1002, 1003, 1004]
 
-                expected_result = self.get_expected_result(
-                    4 * tensor_shapes[0] +
-                    int(corrids[0]), corrids[0], 3, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    4, corrids[0], 3, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        4 * tensor_shapes[0] + int(corrids[0]),
+                        corrids[0],
+                        3,
+                        trial,
+                        "end",
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        4, corrids[0], 3, trial, "end"
+                    )
+                )
 
                 threads = []
                 threads.append(
@@ -238,19 +260,30 @@ def _multi_sequence_impl(self, trials, expected_batch_exec,
                             # (flag_str, value, pre_delay_ms)
                             (("start", 1, None), ("end", 3, None)),
                             expected_result,
-                            precreated_shm0_handles),
+                            precreated_shm0_handles,
+                        ),
                         kwargs={
-                            'sequence_name':
-                                "{}_{}".format(self._testMethodName,
-                                               corrids[0]),
-                            'tensor_shape': (tensor_shapes[0],)
-                        }))
+                            "sequence_name": "{}_{}".format(
+                                self._testMethodName, corrids[0]
+                            ),
+                            "tensor_shape": (tensor_shapes[0],),
+                        },
+                    )
+                )
 
-                expected_result = self.get_expected_result(
-                    36 * tensor_shapes[1] +
-                    int(corrids[1]), corrids[1], 13, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    36, corrids[1], 13, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        36 * tensor_shapes[1] + int(corrids[1]),
+                        corrids[1],
+                        13,
+                        trial,
+                        "end",
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        36, corrids[1], 13, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -261,22 +294,32 @@ def _multi_sequence_impl(self, trials, expected_batch_exec,
                             corrids[1],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 11, None), (None, 12, None), ("end", 13,
-                                                                     None)),
+                            (("start", 11, None), (None, 12, None), ("end", 13, None)),
                             expected_result,
-                            precreated_shm1_handles),
+                            precreated_shm1_handles,
+                        ),
                         kwargs={
-                            'sequence_name':
-                                "{}_{}".format(self._testMethodName,
-                                               corrids[1]),
-                            'tensor_shape': (tensor_shapes[1],)
-                        }))
+                            "sequence_name": "{}_{}".format(
+                                self._testMethodName, corrids[1]
+                            ),
+                            "tensor_shape": (tensor_shapes[1],),
+                        },
+                    )
+                )
 
-                expected_result = self.get_expected_result(
-                    336 * tensor_shapes[2] +
-                    int(corrids[2]), corrids[2], 113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    336, corrids[2], 113, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        336 * tensor_shapes[2] + int(corrids[2]),
+                        corrids[2],
+                        113,
+                        trial,
+                        "end",
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        336, corrids[2], 113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -287,21 +330,35 @@ def _multi_sequence_impl(self, trials, expected_batch_exec,
                             corrids[2],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 111, None), (None, 112, None),
-                             ("end", 113, None)),
+                            (
+                                ("start", 111, None),
+                                (None, 112, None),
+                                ("end", 113, None),
+                            ),
                             expected_result,
-                            precreated_shm2_handles),
+                            precreated_shm2_handles,
+                        ),
                         kwargs={
-                            'sequence_name':
-                                "{}_{}".format(self._testMethodName,
-                                               corrids[2]),
-                            'tensor_shape': (tensor_shapes[2],)
-                        }))
-                expected_result = self.get_expected_result(
-                    3336 * tensor_shapes[3] +
-                    int(corrids[3]), corrids[3], 1113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    3336, corrids[3], 1113, trial, "end")
+                            "sequence_name": "{}_{}".format(
+                                self._testMethodName, corrids[2]
+                            ),
+                            "tensor_shape": (tensor_shapes[2],),
+                        },
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        3336 * tensor_shapes[3] + int(corrids[3]),
+                        corrids[3],
+                        1113,
+                        trial,
+                        "end",
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        3336, corrids[3], 1113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -312,16 +369,22 @@ def _multi_sequence_impl(self, trials, expected_batch_exec,
                             corrids[3],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1111, None), (None, 1112, None),
-                             ("end", 1113, None)),
+                            (
+                                ("start", 1111, None),
+                                (None, 1112, None),
+                                ("end", 1113, None),
+                            ),
                             expected_result,
-                            precreated_shm3_handles),
+                            precreated_shm3_handles,
+                        ),
                         kwargs={
-                            'sequence_name':
-                                "{}_{}".format(self._testMethodName,
-                                               corrids[3]),
-                            'tensor_shape': (tensor_shapes[3],)
-                        }))
+                            "sequence_name": "{}_{}".format(
+                                self._testMethodName, corrids[3]
+                            ),
+                            "tensor_shape": (tensor_shapes[3],),
+                        },
+                    )
+                )
 
                 for t in threads:
                     t.start()
@@ -330,8 +393,9 @@ def _multi_sequence_impl(self, trials, expected_batch_exec,
                 for t in threads:
                     t.join()
                 self.check_deferred_exception()
-                self.check_status(model_name, expected_batch_exec,
-                                  expected_exec_cnt, 11)
+                self.check_status(
+                    model_name, expected_batch_exec, expected_exec_cnt, 11
+                )
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
             finally:
@@ -355,18 +419,18 @@ def test_multi_sequence_different_shape(self):
         # Send four sequences in parallel where the requests in each
         # sequence have different shape. Sequences should not be
         # batched due to input tensor size differences.
-        self._multi_sequence_impl(_ragged_batch_supported_trials, {1: 11}, 11,
-                                  0, (4, 3, 1, 2))
+        self._multi_sequence_impl(
+            _ragged_batch_supported_trials, {1: 11}, 11, 0, (4, 3, 1, 2)
+        )
 
     def test_multi_sequence_different_shape_allow_ragged(self):
         # Send four sequences in parallel where the requests in each
         # sequence have different shape. Input is marked as allowing
         # ragged and so sequences should be batched even with input
         # tensor size differences.
-        self._multi_sequence_impl(_ragged_batch_supported_trials, {
-            4: 2,
-            3: 1
-        }, 3, 1, (4, 3, 1, 2))
+        self._multi_sequence_impl(
+            _ragged_batch_supported_trials, {4: 2, 3: 1}, 3, 1, (4, 3, 1, 2)
+        )
 
     def test_backlog(self):
         # Send 5 equal-length sequences in parallel and make sure they
@@ -376,33 +440,42 @@ def test_backlog(self):
         for trial in _trials:
             self.clear_deferred_exceptions()
             dtype = self.get_datatype(trial)
-            precreated_shm0_handles = self.precreate_register_regions((1, 2, 3),
-                                                                      dtype, 0)
+            precreated_shm0_handles = self.precreate_register_regions(
+                (1, 2, 3), dtype, 0
+            )
             precreated_shm1_handles = self.precreate_register_regions(
-                (11, 12, 13), dtype, 1)
+                (11, 12, 13), dtype, 1
+            )
             precreated_shm2_handles = self.precreate_register_regions(
-                (111, 112, 113), dtype, 2)
+                (111, 112, 113), dtype, 2
+            )
             precreated_shm3_handles = self.precreate_register_regions(
-                (1111, 1112, 1113), dtype, 3)
+                (1111, 1112, 1113), dtype, 3
+            )
             precreated_shm4_handles = self.precreate_register_regions(
-                (11111, 11112, 11113), dtype, 4)
+                (11111, 11112, 11113), dtype, 4
+            )
             try:
                 model_name = tu.get_dyna_sequence_model_name(trial, dtype)
 
                 self.check_setup(model_name)
                 self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ)
-                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER",
-                                 os.environ)
+                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ)
 
                 if "string" in trial:
-                    corrids = ['1001', '1002', '1003', '1004', '1005']
+                    corrids = ["1001", "1002", "1003", "1004", "1005"]
                 else:
                     corrids = [1001, 1002, 1003, 1004, 1005]
 
-                expected_result = self.get_expected_result(
-                    6 + int(corrids[0]), corrids[0], 3, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    6, corrids[0], 3, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        6 + int(corrids[0]), corrids[0], 3, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        6, corrids[0], 3, trial, "end"
+                    )
+                )
 
                 threads = []
                 threads.append(
@@ -415,18 +488,23 @@ def test_backlog(self):
                             corrids[0],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1, None), (None, 2, None), ("end", 3,
-                                                                   None)),
+                            (("start", 1, None), (None, 2, None), ("end", 3, None)),
                             expected_result,
-                            precreated_shm0_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
+                            precreated_shm0_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
 
-                expected_result = self.get_expected_result(
-                    36 + int(corrids[1]), corrids[1], 13, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    36, corrids[1], 13, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        36 + int(corrids[1]), corrids[1], 13, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        36, corrids[1], 13, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -437,18 +515,23 @@ def test_backlog(self):
                             corrids[1],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 11, None), (None, 12, None), ("end", 13,
-                                                                     None)),
+                            (("start", 11, None), (None, 12, None), ("end", 13, None)),
                             expected_result,
-                            precreated_shm1_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
+                            precreated_shm1_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
 
-                expected_result = self.get_expected_result(
-                    336 + int(corrids[2]), corrids[2], 113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    336, corrids[2], 113, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        336 + int(corrids[2]), corrids[2], 113, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        336, corrids[2], 113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -459,18 +542,27 @@ def test_backlog(self):
                             corrids[2],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 111, None), (None, 112, None),
-                             ("end", 113, None)),
+                            (
+                                ("start", 111, None),
+                                (None, 112, None),
+                                ("end", 113, None),
+                            ),
                             expected_result,
-                            precreated_shm2_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
+                            precreated_shm2_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
 
-                expected_result = self.get_expected_result(
-                    3336 + int(corrids[3]), corrids[3], 1113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    3336, corrids[3], 1113, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        3336 + int(corrids[3]), corrids[3], 1113, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        3336, corrids[3], 1113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -481,18 +573,27 @@ def test_backlog(self):
                             corrids[3],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1111, None), (None, 1112, None),
-                             ("end", 1113, None)),
+                            (
+                                ("start", 1111, None),
+                                (None, 1112, None),
+                                ("end", 1113, None),
+                            ),
                             expected_result,
-                            precreated_shm3_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
+                            precreated_shm3_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
 
-                expected_result = self.get_expected_result(
-                    33336 + int(corrids[4]), corrids[4], 11113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    33336, corrids[4], 11113, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        33336 + int(corrids[4]), corrids[4], 11113, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        33336, corrids[4], 11113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -503,13 +604,17 @@ def test_backlog(self):
                             corrids[4],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 11111, None), (None, 11112, None),
-                             ("end", 11113, None)),
+                            (
+                                ("start", 11111, None),
+                                (None, 11112, None),
+                                ("end", 11113, None),
+                            ),
                             expected_result,
-                            precreated_shm4_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
+                            precreated_shm4_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
 
                 for t in threads:
                     t.start()
@@ -534,35 +639,45 @@ def test_backlog_fill(self):
         for trial in _trials:
             self.clear_deferred_exceptions()
             dtype = self.get_datatype(trial)
-            precreated_shm0_handles = self.precreate_register_regions((1, 2, 3),
-                                                                      dtype, 0)
-            precreated_shm1_handles = self.precreate_register_regions((11, 13),
-                                                                      dtype, 1)
+            precreated_shm0_handles = self.precreate_register_regions(
+                (1, 2, 3), dtype, 0
+            )
+            precreated_shm1_handles = self.precreate_register_regions(
+                (11, 13), dtype, 1
+            )
             precreated_shm2_handles = self.precreate_register_regions(
-                (111, 113), dtype, 2)
+                (111, 113), dtype, 2
+            )
             precreated_shm3_handles = self.precreate_register_regions(
-                (1111, 1112, 1113), dtype, 3)
-            precreated_shm4_handles = self.precreate_register_regions((11111,),
-                                                                      dtype, 4)
-            precreated_shm5_handles = self.precreate_register_regions((22222,),
-                                                                      dtype, 5)
+                (1111, 1112, 1113), dtype, 3
+            )
+            precreated_shm4_handles = self.precreate_register_regions(
+                (11111,), dtype, 4
+            )
+            precreated_shm5_handles = self.precreate_register_regions(
+                (22222,), dtype, 5
+            )
             try:
                 model_name = tu.get_dyna_sequence_model_name(trial, dtype)
 
                 self.check_setup(model_name)
                 self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ)
-                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER",
-                                 os.environ)
+                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ)
                 if "string" in trial:
-                    corrids = ['1001', '1002', '1003', '1004', '1005', '1006']
+                    corrids = ["1001", "1002", "1003", "1004", "1005", "1006"]
                 else:
                     corrids = [1001, 1002, 1003, 1004, 1005, 1006]
                 threads = []
 
-                expected_result = self.get_expected_result(
-                    6 + int(corrids[0]), corrids[0], 3, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    6, corrids[0], 3, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        6 + int(corrids[0]), corrids[0], 3, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        6, corrids[0], 3, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -573,17 +688,22 @@ def test_backlog_fill(self):
                             corrids[0],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1, None), (None, 2, None), ("end", 3,
-                                                                   None)),
+                            (("start", 1, None), (None, 2, None), ("end", 3, None)),
                             expected_result,
-                            precreated_shm0_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    24 + int(corrids[1]), corrids[1], 13, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    24, corrids[1], 13, trial, "end")
+                            precreated_shm0_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        24 + int(corrids[1]), corrids[1], 13, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        24, corrids[1], 13, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -596,14 +716,20 @@ def test_backlog_fill(self):
                             # (flag_str, value, pre_delay_ms)
                             (("start", 11, None), ("end", 13, None)),
                             expected_result,
-                            precreated_shm1_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    224 + int(corrids[2]), corrids[2], 113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    224, corrids[2], 113, trial, "end")
+                            precreated_shm1_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        224 + int(corrids[2]), corrids[2], 113, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        224, corrids[2], 113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -616,14 +742,20 @@ def test_backlog_fill(self):
                             # (flag_str, value, pre_delay_ms)
                             (("start", 111, None), ("end", 113, None)),
                             expected_result,
-                            precreated_shm2_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    3336 + int(corrids[3]), corrids[3], 1113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    3336, corrids[3], 1113, trial, "end")
+                            precreated_shm2_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        3336 + int(corrids[3]), corrids[3], 1113, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        3336, corrids[3], 1113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -634,18 +766,26 @@ def test_backlog_fill(self):
                             corrids[3],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1111, None), (None, 1112, 3000),
-                             ("end", 1113, None)),
+                            (
+                                ("start", 1111, None),
+                                (None, 1112, 3000),
+                                ("end", 1113, None),
+                            ),
                             expected_result,
-                            precreated_shm3_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    11111 +
-                    int(corrids[4]), corrids[4], 11111, trial, "start,end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    11111, corrids[4], 11111, trial, "start,end")
+                            precreated_shm3_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        11111 + int(corrids[4]), corrids[4], 11111, trial, "start,end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        11111, corrids[4], 11111, trial, "start,end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -656,18 +796,22 @@ def test_backlog_fill(self):
                             corrids[4],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (
-                                ("start,end", 11111, None),),
+                            (("start,end", 11111, None),),
                             expected_result,
-                            precreated_shm4_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    22222 +
-                    int(corrids[5]), corrids[5], 22222, trial, "start,end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    22222, corrids[5], 22222, trial, "start,end")
+                            precreated_shm4_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        22222 + int(corrids[5]), corrids[5], 22222, trial, "start,end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        22222, corrids[5], 22222, trial, "start,end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -678,13 +822,13 @@ def test_backlog_fill(self):
                             corrids[5],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (
-                                ("start,end", 22222, None),),
+                            (("start,end", 22222, None),),
                             expected_result,
-                            precreated_shm5_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
+                            precreated_shm5_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
 
                 threads[0].start()
                 threads[1].start()
@@ -716,35 +860,45 @@ def test_backlog_fill_no_end(self):
         for trial in _trials:
             self.clear_deferred_exceptions()
             dtype = self.get_datatype(trial)
-            precreated_shm0_handles = self.precreate_register_regions((1, 2, 3),
-                                                                      dtype, 0)
-            precreated_shm1_handles = self.precreate_register_regions((11, 13),
-                                                                      dtype, 1)
+            precreated_shm0_handles = self.precreate_register_regions(
+                (1, 2, 3), dtype, 0
+            )
+            precreated_shm1_handles = self.precreate_register_regions(
+                (11, 13), dtype, 1
+            )
             precreated_shm2_handles = self.precreate_register_regions(
-                (111, 113), dtype, 2)
+                (111, 113), dtype, 2
+            )
             precreated_shm3_handles = self.precreate_register_regions(
-                (1111, 1112, 1113), dtype, 3)
-            precreated_shm4_handles = self.precreate_register_regions((11111,),
-                                                                      dtype, 4)
+                (1111, 1112, 1113), dtype, 3
+            )
+            precreated_shm4_handles = self.precreate_register_regions(
+                (11111,), dtype, 4
+            )
             precreated_shm5_handles = self.precreate_register_regions(
-                (22222, 22223, 22224), dtype, 5)
+                (22222, 22223, 22224), dtype, 5
+            )
             try:
                 model_name = tu.get_dyna_sequence_model_name(trial, dtype)
 
                 self.check_setup(model_name)
                 self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ)
-                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER",
-                                 os.environ)
+                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ)
 
                 if "string" in trial:
-                    corrids = ['1001', '1002', '1003', '1004', '1005', '1006']
+                    corrids = ["1001", "1002", "1003", "1004", "1005", "1006"]
                 else:
                     corrids = [1001, 1002, 1003, 1004, 1005, 1006]
                 threads = []
-                expected_result = self.get_expected_result(
-                    6 + int(corrids[0]), corrids[0], 3, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    6, corrids[0], 3, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        6 + int(corrids[0]), corrids[0], 3, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        6, corrids[0], 3, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -755,17 +909,22 @@ def test_backlog_fill_no_end(self):
                             corrids[0],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1, None), (None, 2, None), ("end", 3,
-                                                                   None)),
+                            (("start", 1, None), (None, 2, None), ("end", 3, None)),
                             expected_result,
-                            precreated_shm0_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    24 + int(corrids[1]), corrids[1], 13, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    24, corrids[1], 13, trial, "end")
+                            precreated_shm0_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        24 + int(corrids[1]), corrids[1], 13, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        24, corrids[1], 13, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -778,14 +937,20 @@ def test_backlog_fill_no_end(self):
                             # (flag_str, value, pre_delay_ms)
                             (("start", 11, None), ("end", 13, None)),
                             expected_result,
-                            precreated_shm1_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    224 + int(corrids[2]), corrids[2], 113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    224, corrids[2], 113, trial, "end")
+                            precreated_shm1_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        224 + int(corrids[2]), corrids[2], 113, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        224, corrids[2], 113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -798,14 +963,20 @@ def test_backlog_fill_no_end(self):
                             # (flag_str, value, pre_delay_ms)
                             (("start", 111, None), ("end", 113, None)),
                             expected_result,
-                            precreated_shm2_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    3336 + int(corrids[3]), corrids[3], 1113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    3336, corrids[3], 1113, trial, "end")
+                            precreated_shm2_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        3336 + int(corrids[3]), corrids[3], 1113, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        3336, corrids[3], 1113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -816,18 +987,26 @@ def test_backlog_fill_no_end(self):
                             corrids[3],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1111, None), (None, 1112, 3000),
-                             ("end", 1113, None)),
+                            (
+                                ("start", 1111, None),
+                                (None, 1112, 3000),
+                                ("end", 1113, None),
+                            ),
                             expected_result,
-                            precreated_shm3_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    11111 +
-                    int(corrids[4]), corrids[4], 11111, trial, "start,end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    11111, corrids[4], 11111, trial, "start,end")
+                            precreated_shm3_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        11111 + int(corrids[4]), corrids[4], 11111, trial, "start,end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        11111, corrids[4], 11111, trial, "start,end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -838,17 +1017,22 @@ def test_backlog_fill_no_end(self):
                             corrids[4],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (
-                                ("start,end", 11111, None),),
+                            (("start,end", 11111, None),),
                             expected_result,
-                            precreated_shm4_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    66669 + int(corrids[5]), corrids[5], 22224, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    66669, corrids[5], 22224, trial, "end")
+                            precreated_shm4_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        66669 + int(corrids[5]), corrids[5], 22224, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        66669, corrids[5], 22224, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -865,10 +1049,11 @@ def test_backlog_fill_no_end(self):
                                 ("end", 22224, 2000),
                             ),
                             expected_result,
-                            precreated_shm5_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
+                            precreated_shm5_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
 
                 threads[0].start()
                 threads[1].start()
@@ -906,33 +1091,40 @@ def test_backlog_sequence_timeout(self):
         for trial in _trials:
             self.clear_deferred_exceptions()
             dtype = self.get_datatype(trial)
-            precreated_shm0_handles = self.precreate_register_regions((1, 3),
-                                                                      dtype, 0)
+            precreated_shm0_handles = self.precreate_register_regions((1, 3), dtype, 0)
             precreated_shm1_handles = self.precreate_register_regions(
-                (11, 12, 12, 13), dtype, 1)
+                (11, 12, 12, 13), dtype, 1
+            )
             precreated_shm2_handles = self.precreate_register_regions(
-                (111, 112, 112, 113), dtype, 2)
+                (111, 112, 112, 113), dtype, 2
+            )
             precreated_shm3_handles = self.precreate_register_regions(
-                (1111, 1112, 1112, 1113), dtype, 3)
+                (1111, 1112, 1112, 1113), dtype, 3
+            )
             precreated_shm4_handles = self.precreate_register_regions(
-                (11111, 11113), dtype, 4)
+                (11111, 11113), dtype, 4
+            )
             try:
                 model_name = tu.get_dyna_sequence_model_name(trial, dtype)
 
                 self.check_setup(model_name)
                 self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ)
-                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER",
-                                 os.environ)
+                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ)
 
                 if "string" in trial:
-                    corrids = ['1001', '1002', '1003', '1004', '1005']
+                    corrids = ["1001", "1002", "1003", "1004", "1005"]
                 else:
                     corrids = [1001, 1002, 1003, 1004, 1005]
                 threads = []
-                expected_result = self.get_expected_result(
-                    4 + int(corrids[0]), corrids[0], 3, trial, None
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    4, corrids[0], 3, trial, None)
+                expected_result = (
+                    self.get_expected_result(
+                        4 + int(corrids[0]), corrids[0], 3, trial, None
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        4, corrids[0], 3, trial, None
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -943,17 +1135,25 @@ def test_backlog_sequence_timeout(self):
                             corrids[0],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1, None),
-                             (None, 3, _max_sequence_idle_ms + 1000)),
+                            (
+                                ("start", 1, None),
+                                (None, 3, _max_sequence_idle_ms + 1000),
+                            ),
                             expected_result,
-                            precreated_shm0_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    48 + int(corrids[1]), corrids[1], 13, trial, None
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    48, corrids[1], 13, trial, None)
+                            precreated_shm0_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        48 + int(corrids[1]), corrids[1], 13, trial, None
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        48, corrids[1], 13, trial, None
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -964,19 +1164,27 @@ def test_backlog_sequence_timeout(self):
                             corrids[1],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 11, None), (None, 12,
-                                                   _max_sequence_idle_ms / 2),
-                             (None, 12, _max_sequence_idle_ms / 2),
-                             ("end", 13, _max_sequence_idle_ms / 2)),
+                            (
+                                ("start", 11, None),
+                                (None, 12, _max_sequence_idle_ms / 2),
+                                (None, 12, _max_sequence_idle_ms / 2),
+                                ("end", 13, _max_sequence_idle_ms / 2),
+                            ),
                             expected_result,
-                            precreated_shm1_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    448 + int(corrids[2]), corrids[2], 113, trial, None
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    448, corrids[2], 113, trial, None)
+                            precreated_shm1_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        448 + int(corrids[2]), corrids[2], 113, trial, None
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        448, corrids[2], 113, trial, None
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -987,19 +1195,27 @@ def test_backlog_sequence_timeout(self):
                             corrids[2],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 111, None), (None, 112,
-                                                    _max_sequence_idle_ms / 2),
-                             (None, 112, _max_sequence_idle_ms / 2),
-                             ("end", 113, _max_sequence_idle_ms / 2)),
+                            (
+                                ("start", 111, None),
+                                (None, 112, _max_sequence_idle_ms / 2),
+                                (None, 112, _max_sequence_idle_ms / 2),
+                                ("end", 113, _max_sequence_idle_ms / 2),
+                            ),
                             expected_result,
-                            precreated_shm2_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    4448 + int(corrids[3]), corrids[3], 1113, trial, None
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    4448, corrids[3], 1113, trial, None)
+                            precreated_shm2_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        4448 + int(corrids[3]), corrids[3], 1113, trial, None
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        4448, corrids[3], 1113, trial, None
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -1010,19 +1226,27 @@ def test_backlog_sequence_timeout(self):
                             corrids[3],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1111, None), (None, 1112,
-                                                     _max_sequence_idle_ms / 2),
-                             (None, 1112, _max_sequence_idle_ms / 2),
-                             ("end", 1113, _max_sequence_idle_ms / 2)),
+                            (
+                                ("start", 1111, None),
+                                (None, 1112, _max_sequence_idle_ms / 2),
+                                (None, 1112, _max_sequence_idle_ms / 2),
+                                ("end", 1113, _max_sequence_idle_ms / 2),
+                            ),
                             expected_result,
-                            precreated_shm3_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    22224 + int(corrids[4]), corrids[4], 11113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    22224, corrids[4], 11113, trial, "end")
+                            precreated_shm3_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        22224 + int(corrids[4]), corrids[4], 11113, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        22224, corrids[4], 11113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -1035,10 +1259,11 @@ def test_backlog_sequence_timeout(self):
                             # (flag_str, value, pre_delay_ms)
                             (("start", 11111, None), ("end", 11113, None)),
                             expected_result,
-                            precreated_shm4_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
+                            precreated_shm4_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
 
                 threads[0].start()
                 threads[1].start()
@@ -1052,10 +1277,15 @@ def test_backlog_sequence_timeout(self):
                 self.check_deferred_exception()
                 self.assertTrue(False, "expected error")
             except Exception as ex:
-                self.assertTrue(ex.message().startswith(
-                    str("inference request for sequence 1001 to " +
-                        "model '{}' must specify the START flag on the first " +
-                        "request of the sequence").format(model_name)))
+                self.assertTrue(
+                    ex.message().startswith(
+                        str(
+                            "inference request for sequence 1001 to "
+                            + "model '{}' must specify the START flag on the first "
+                            + "request of the sequence"
+                        ).format(model_name)
+                    )
+                )
             finally:
                 if _test_system_shared_memory or _test_cuda_shared_memory:
                     self.cleanup_shm_regions(precreated_shm0_handles)
@@ -1065,5 +1295,5 @@ def test_backlog_sequence_timeout(self):
                     self.cleanup_shm_regions(precreated_shm4_handles)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
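Editor's note: the hunks above are a mechanical Black/isort reformat of the dynamic-sequence-batcher tests; no control flow changes. For reviewers skimming the noise, every reformatted test follows the same shape: build one thread per sequence (each wrapping the suite's check_sequence_async helper), start them together, join, then surface any deferred exception. A minimal stand-alone sketch of that pattern, with hypothetical names (run_sequence, deferred) in place of the suite's actual helpers:

    import threading

    deferred = []  # stand-in for the suite's deferred-exception list

    def run_sequence(corr_id, values, expected_sum):
        # Worker threads record failures instead of raising, mirroring how the
        # tests defer exceptions and re-raise them after join().
        try:
            if sum(values) != expected_sum:
                raise AssertionError(
                    f"sequence {corr_id}: got {sum(values)}, want {expected_sum}"
                )
        except Exception as ex:
            deferred.append(ex)

    # One thread per sequence, started together and joined, analogous to the
    # reformatted threads.append(threading.Thread(...)) blocks above.
    threads = [
        threading.Thread(target=run_sequence, args=(corr_id, values, expected))
        for corr_id, values, expected in [
            (1001, (1, 2, 3), 6),
            (1002, (11, 12, 13), 36),
        ]
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    if deferred:
        raise deferred[0]  # analogous to check_deferred_exception()
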
diff --git a/qa/L0_dyna_sequence_batcher/test.sh b/qa/L0_dyna_sequence_batcher/test.sh
index 42f732338c..acac8399af 100755
--- a/qa/L0_dyna_sequence_batcher/test.sh
+++ b/qa/L0_dyna_sequence_batcher/test.sh
@@ -65,7 +65,7 @@ fi
 
 RET=0
 
-rm -fr *.log 
+rm -fr *.log
 
 # models
 rm -fr models && mkdir models
diff --git a/qa/L0_grpc/client_plugin_models/client_plugin_test/1/model.py b/qa/L0_grpc/client_plugin_models/client_plugin_test/1/model.py
old mode 100644
new mode 100755
index 89cbf359a7..e03876f981
--- a/qa/L0_grpc/client_plugin_models/client_plugin_test/1/model.py
+++ b/qa/L0_grpc/client_plugin_models/client_plugin_test/1/model.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -25,22 +27,25 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import json
-import triton_python_backend_utils as pb_utils
+
 import numpy as np
+import triton_python_backend_utils as pb_utils
 
 
 class TritonPythonModel:
-
     def execute(self, requests):
         responses = []
 
         for request in requests:
-            json_string = pb_utils.get_input_tensor_by_name(
-                request, "EXPECTED_HEADERS").as_numpy()[0].decode("utf-8")
+            json_string = (
+                pb_utils.get_input_tensor_by_name(request, "EXPECTED_HEADERS")
+                .as_numpy()[0]
+                .decode("utf-8")
+            )
             expected_headers = json.loads(json_string)
 
             success = True
-            if request.parameters() != '':
+            if request.parameters() != "":
                 parameters = json.loads(request.parameters())
                 for key, value in expected_headers.items():
                     if key in parameters:
@@ -49,10 +54,12 @@ def execute(self, requests):
                     else:
                         success = False
 
-            test_success = pb_utils.Tensor("TEST_SUCCESS",
-                                           np.array([success], dtype=bool))
+            test_success = pb_utils.Tensor(
+                "TEST_SUCCESS", np.array([success], dtype=bool)
+            )
             inference_response = pb_utils.InferenceResponse(
-                output_tensors=[test_success])
+                output_tensors=[test_success]
+            )
             responses.append(inference_response)
 
         return responses
diff --git a/qa/L0_grpc/grpc_basic_auth_test.py b/qa/L0_grpc/grpc_basic_auth_test.py
old mode 100644
new mode 100755
index a6408c442d..07d29ef5b7
--- a/qa/L0_grpc/grpc_basic_auth_test.py
+++ b/qa/L0_grpc/grpc_basic_auth_test.py
@@ -24,26 +24,23 @@
 # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import unittest
 import sys
+import unittest
 
 sys.path.append("../common")
 
 import test_util as tu
 import tritonclient.grpc as tritongrpcclient
 import tritonclient.grpc.aio as asynctritongrpcclient
-
-from tritonclient.grpc.auth import BasicAuth
 from tritonclient.grpc.aio.auth import BasicAuth as AsyncBasicAuth
+from tritonclient.grpc.auth import BasicAuth
 
 
 class GRPCBasicAuthTest(tu.TestResultCollector):
-
     def setUp(self):
         # Use the nginx port
-        self._client = tritongrpcclient.InferenceServerClient(
-            url='localhost:8004')
-        self._client.register_plugin(BasicAuth('username', 'password'))
+        self._client = tritongrpcclient.InferenceServerClient(url="localhost:8004")
+        self._client.register_plugin(BasicAuth("username", "password"))
 
     def test_client_call(self):
         self.assertTrue(self._client.is_server_live())
@@ -53,12 +50,10 @@ def tearDown(self):
 
 
 class GRPCBasicAuthAsyncTest(unittest.IsolatedAsyncioTestCase):
-
     async def asyncSetUp(self):
         # Use the nginx port
-        self._client = asynctritongrpcclient.InferenceServerClient(
-            url='localhost:8004')
-        self._client.register_plugin(AsyncBasicAuth('username', 'password'))
+        self._client = asynctritongrpcclient.InferenceServerClient(url="localhost:8004")
+        self._client.register_plugin(AsyncBasicAuth("username", "password"))
 
     async def test_client_call(self):
         self.assertTrue(await self._client.is_server_live())
@@ -67,5 +62,5 @@ async def asyncTearDown(self):
         await self._client.close()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_grpc/grpc_client_plugin_test.py b/qa/L0_grpc/grpc_client_plugin_test.py
old mode 100644
new mode 100755
index 45b6251e3e..1cc8c474ef
--- a/qa/L0_grpc/grpc_client_plugin_test.py
+++ b/qa/L0_grpc/grpc_client_plugin_test.py
@@ -24,23 +24,23 @@
 # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import sys
 import json
+import sys
 
 sys.path.append("../common")
 
 import unittest
+
 import numpy as np
 import test_util as tu
 import tritonclient.grpc as tritongrpcclient
+import tritonclient.grpc.aio as asynctritongrpcclient
 from tritonclient.grpc import InferenceServerClientPlugin
 from tritonclient.utils import np_to_triton_dtype
-import tritonclient.grpc.aio as asynctritongrpcclient
 
 
 # A simple plugin that adds headers to the inference request.
 class TestPlugin(InferenceServerClientPlugin):
-
     def __init__(self, headers):
         self._headers = headers
 
@@ -52,33 +52,35 @@ def prepare_infer_inputs(headers):
     expected_headers = np.array([json.dumps(headers)], dtype=object)
     inputs = []
     inputs.append(
-        tritongrpcclient.InferInput('EXPECTED_HEADERS', expected_headers.shape,
-                                    np_to_triton_dtype(expected_headers.dtype)))
+        tritongrpcclient.InferInput(
+            "EXPECTED_HEADERS",
+            expected_headers.shape,
+            np_to_triton_dtype(expected_headers.dtype),
+        )
+    )
     inputs[0].set_data_from_numpy(expected_headers)
 
     return inputs
 
 
 class GRPCClientPluginAsyncTest(unittest.IsolatedAsyncioTestCase):
-
     async def asyncSetUp(self):
-        self._headers = {'my-key': 'my-value'}
+        self._headers = {"my-key": "my-value"}
         self._plugin = TestPlugin(self._headers)
-        self._client = asynctritongrpcclient.InferenceServerClient(
-            url='localhost:8001')
+        self._client = asynctritongrpcclient.InferenceServerClient(url="localhost:8001")
 
     async def test_simple_infer(self):
         model = "client_plugin_test"
         inputs = prepare_infer_inputs(self._headers)
         self._client.register_plugin(self._plugin)
         response = await self._client.infer(model_name=model, inputs=inputs)
-        test_success = response.as_numpy('TEST_SUCCESS')
+        test_success = response.as_numpy("TEST_SUCCESS")
         self.assertEqual(test_success, True)
 
         self._client.unregister_plugin()
         inputs = prepare_infer_inputs({})
         response = await self._client.infer(model_name=model, inputs=inputs)
-        test_success = response.as_numpy('TEST_SUCCESS')
+        test_success = response.as_numpy("TEST_SUCCESS")
         self.assertEqual(test_success, True)
 
     async def asyncTearDown(self):
@@ -86,12 +88,10 @@ async def asyncTearDown(self):
 
 
 class GRPCClientPluginTest(tu.TestResultCollector):
-
     def setUp(self):
-        self._headers = {'my-key': 'my-value'}
+        self._headers = {"my-key": "my-value"}
         self._plugin = TestPlugin(self._headers)
-        self._client = tritongrpcclient.InferenceServerClient(
-            url='localhost:8001')
+        self._client = tritongrpcclient.InferenceServerClient(url="localhost:8001")
 
     def test_simple_infer(self):
         # Set the binary data to False so that 'Inference-Header-Length' is not
@@ -101,7 +101,7 @@ def test_simple_infer(self):
         self._client.register_plugin(self._plugin)
         self.assertEqual(self._plugin, self._client.plugin())
         response = self._client.infer(model_name=model, inputs=inputs)
-        test_success = response.as_numpy('TEST_SUCCESS')
+        test_success = response.as_numpy("TEST_SUCCESS")
         self.assertEqual(test_success, True)
 
         # Unregister the plugin
@@ -109,12 +109,12 @@ def test_simple_infer(self):
         self._client.unregister_plugin()
         self.assertEqual(None, self._client.plugin())
         response = self._client.infer(model_name=model, inputs=inputs)
-        test_success = response.as_numpy('TEST_SUCCESS')
+        test_success = response.as_numpy("TEST_SUCCESS")
         self.assertEqual(test_success, True)
 
     def tearDown(self):
         self._client.close()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_grpc/python_grpc_aio_test.py b/qa/L0_grpc/python_grpc_aio_test.py
old mode 100644
new mode 100755
index 88c08b8ab6..f5b3a8f958
--- a/qa/L0_grpc/python_grpc_aio_test.py
+++ b/qa/L0_grpc/python_grpc_aio_test.py
@@ -32,13 +32,10 @@
 
 
 class TestGrpcAioClient(unittest.IsolatedAsyncioTestCase):
-    """Test if aio rpc can reach the server
-
-    """
+    """Test if aio rpc can reach the server"""
 
     def setUp(self):
-        self._triton_client = grpcclient.InferenceServerClient(
-            url="localhost:8001")
+        self._triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
 
     async def asyncTearDown(self):
         await self._triton_client.close()
@@ -73,15 +70,15 @@ async def test_get_model_repository_index(self):
 
     async def test_load_model(self):
         with self.assertRaisesRegex(
-                InferenceServerException,
-                "\[StatusCode\.UNAVAILABLE\] explicit model load / unload is not allowed if polling is enabled"
+            InferenceServerException,
+            "\[StatusCode\.UNAVAILABLE\] explicit model load / unload is not allowed if polling is enabled",
         ):
             await self._triton_client.load_model("simple")
 
     async def test_unload_model(self):
         with self.assertRaisesRegex(
-                InferenceServerException,
-                "\[StatusCode\.UNAVAILABLE\] explicit model load / unload is not allowed if polling is enabled"
+            InferenceServerException,
+            "\[StatusCode\.UNAVAILABLE\] explicit model load / unload is not allowed if polling is enabled",
         ):
             await self._triton_client.load_model("simple")
 
@@ -99,8 +96,8 @@ async def test_get_system_shared_memory_status(self):
 
     async def test_register_system_shared_memory(self):
         with self.assertRaisesRegex(
-                InferenceServerException,
-                "\[StatusCode\.INTERNAL\] Unable to open shared memory region: ''"
+            InferenceServerException,
+            "\[StatusCode\.INTERNAL\] Unable to open shared memory region: ''",
         ):
             await self._triton_client.register_system_shared_memory("", "", 0)
 
@@ -112,8 +109,8 @@ async def test_get_cuda_shared_memory_status(self):
 
     async def test_register_cuda_shared_memory(self):
         with self.assertRaisesRegex(
-                InferenceServerException,
-                "\[StatusCode\.INVALID_ARGUMENT\] failed to register CUDA shared memory region '': failed to open CUDA IPC handle: invalid argument"
+            InferenceServerException,
+            "\[StatusCode\.INVALID_ARGUMENT\] failed to register CUDA shared memory region '': failed to open CUDA IPC handle: invalid argument",
         ):
             await self._triton_client.register_cuda_shared_memory("", b"", 0, 0)
 
diff --git a/qa/L0_grpc/python_unit_test.py b/qa/L0_grpc/python_unit_test.py
old mode 100644
new mode 100755
index db2a63f0a5..0fb6d97554
--- a/qa/L0_grpc/python_unit_test.py
+++ b/qa/L0_grpc/python_unit_test.py
@@ -25,20 +25,19 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import unittest
-import numpy as np
+import queue
 import time
-
-import tritonclient.grpc as grpcclient
-from tritonclient.utils import InferenceServerException
+import unittest
 
 # For stream infer test
 from functools import partial
-import queue
 
+import numpy as np
+import tritonclient.grpc as grpcclient
+from tritonclient.utils import InferenceServerException
 
-class UserData:
 
+class UserData:
     def __init__(self):
         self._completed_requests = queue.Queue()
 
@@ -51,7 +50,6 @@ def callback(user_data, result, error):
 
 
 class RestrictedProtocolTest(unittest.TestCase):
-
     def setUp(self):
         self.client_ = grpcclient.InferenceServerClient(url="localhost:8001")
         self.model_name_ = "simple"
@@ -61,55 +59,61 @@ def setUp(self):
     def test_sanity(self):
         self.client_.get_inference_statistics("simple")
         self.client_.get_inference_statistics(
-            "simple", headers={self.prefix_ + "infer-key": "infer-value"})
+            "simple", headers={self.prefix_ + "infer-key": "infer-value"}
+        )
 
     # health, infer, model repository protocols are restricted.
     # health and infer expects "triton-grpc-restricted-infer-key : infer-value" header,
     # model repository expected "triton-grpc-restricted-admin-key : admin-value".
     def test_model_repository(self):
-        with self.assertRaisesRegex(InferenceServerException,
-                                    "This protocol is restricted"):
+        with self.assertRaisesRegex(
+            InferenceServerException, "This protocol is restricted"
+        ):
             self.client_.unload_model(
-                self.model_name_,
-                headers={self.prefix_ + "infer-key": "infer-value"})
+                self.model_name_, headers={self.prefix_ + "infer-key": "infer-value"}
+            )
         # Request go through and get actual transaction error
         with self.assertRaisesRegex(
-                InferenceServerException,
-                "explicit model load / unload is not allowed"):
+            InferenceServerException, "explicit model load / unload is not allowed"
+        ):
             self.client_.unload_model(
-                self.model_name_,
-                headers={self.prefix_ + "admin-key": "admin-value"})
+                self.model_name_, headers={self.prefix_ + "admin-key": "admin-value"}
+            )
 
     def test_health(self):
-        with self.assertRaisesRegex(InferenceServerException,
-                                    "This protocol is restricted"):
+        with self.assertRaisesRegex(
+            InferenceServerException, "This protocol is restricted"
+        ):
             self.client_.is_server_live()
         self.client_.is_server_live({self.prefix_ + "infer-key": "infer-value"})
 
     def test_infer(self):
         # setup
         inputs = [
-            grpcclient.InferInput('INPUT0', [1, 16], "INT32"),
-            grpcclient.InferInput('INPUT1', [1, 16], "INT32")
+            grpcclient.InferInput("INPUT0", [1, 16], "INT32"),
+            grpcclient.InferInput("INPUT1", [1, 16], "INT32"),
         ]
         inputs[0].set_data_from_numpy(np.ones(shape=(1, 16), dtype=np.int32))
         inputs[1].set_data_from_numpy(np.ones(shape=(1, 16), dtype=np.int32))
 
         # This test only care if the request goes through
-        with self.assertRaisesRegex(InferenceServerException,
-                                    "This protocol is restricted"):
-            results = self.client_.infer(model_name=self.model_name_,
-                                         inputs=inputs,
-                                         headers={'test': '1'})
-        self.client_.infer(model_name=self.model_name_,
-                           inputs=inputs,
-                           headers={self.prefix_ + "infer-key": "infer-value"})
+        with self.assertRaisesRegex(
+            InferenceServerException, "This protocol is restricted"
+        ):
+            results = self.client_.infer(
+                model_name=self.model_name_, inputs=inputs, headers={"test": "1"}
+            )
+        self.client_.infer(
+            model_name=self.model_name_,
+            inputs=inputs,
+            headers={self.prefix_ + "infer-key": "infer-value"},
+        )
 
     def test_stream_infer(self):
         # setup
         inputs = [
-            grpcclient.InferInput('INPUT0', [1, 16], "INT32"),
-            grpcclient.InferInput('INPUT1', [1, 16], "INT32")
+            grpcclient.InferInput("INPUT0", [1, 16], "INT32"),
+            grpcclient.InferInput("INPUT1", [1, 16], "INT32"),
         ]
         inputs[0].set_data_from_numpy(np.ones(shape=(1, 16), dtype=np.int32))
         inputs[1].set_data_from_numpy(np.ones(shape=(1, 16), dtype=np.int32))
@@ -120,18 +124,18 @@ def test_stream_infer(self):
         # the stream.
         # So on client side, it will always perceive that the stream is
         # successfully created and can only check its health at a later time.
-        self.client_.start_stream(partial(callback, user_data),
-                                  headers={'test': '1'})
+        self.client_.start_stream(partial(callback, user_data), headers={"test": "1"})
         # wait for sufficient round-trip time
         time.sleep(1)
-        with self.assertRaisesRegex(InferenceServerException,
-                                    "The stream is no longer in valid state"):
-            self.client_.async_stream_infer(model_name=self.model_name_,
-                                            inputs=inputs)
+        with self.assertRaisesRegex(
+            InferenceServerException, "The stream is no longer in valid state"
+        ):
+            self.client_.async_stream_infer(model_name=self.model_name_, inputs=inputs)
         # callback should record error detail
         self.assertFalse(user_data._completed_requests.empty())
-        with self.assertRaisesRegex(InferenceServerException,
-                                    "This protocol is restricted"):
+        with self.assertRaisesRegex(
+            InferenceServerException, "This protocol is restricted"
+        ):
             raise user_data._completed_requests.get()
 
         self.assertTrue(user_data._completed_requests.empty())
@@ -140,14 +144,15 @@ def test_stream_infer(self):
         self.client_.stop_stream()
         self.client_.start_stream(
             partial(callback, user_data),
-            headers={self.prefix_ + "infer-key": "infer-value"})
-        self.client_.async_stream_infer(model_name=self.model_name_,
-                                        inputs=inputs)
+            headers={self.prefix_ + "infer-key": "infer-value"},
+        )
+        self.client_.async_stream_infer(model_name=self.model_name_, inputs=inputs)
         # wait for response
         time.sleep(1)
         self.assertFalse(user_data._completed_requests.empty())
-        self.assertNotEqual(type(user_data._completed_requests.get()),
-                            InferenceServerException)
+        self.assertNotEqual(
+            type(user_data._completed_requests.get()), InferenceServerException
+        )
 
 
 if __name__ == "__main__":
diff --git a/qa/L0_grpc/test.sh b/qa/L0_grpc/test.sh
old mode 100644
new mode 100755
index 923479836d..90d34a8738
--- a/qa/L0_grpc/test.sh
+++ b/qa/L0_grpc/test.sh
@@ -490,7 +490,7 @@ wait $SERVER_PID
 # Run cpp client unit test
 rm -rf unit_test_models && mkdir unit_test_models
 cp -r $DATADIR/qa_model_repository/onnx_int32_int32_int32 unit_test_models/.
-cp -r ${MODELDIR}/simple unit_test_models/. 
+cp -r ${MODELDIR}/simple unit_test_models/.
 
 SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=unit_test_models
             --trace-file=global_unittest.log --trace-level=TIMESTAMPS --trace-rate=1"
diff --git a/qa/L0_http/http_basic_auth_test.py b/qa/L0_http/http_basic_auth_test.py
old mode 100644
new mode 100755
index 21aa96dc5e..5aa1f71d81
--- a/qa/L0_http/http_basic_auth_test.py
+++ b/qa/L0_http/http_basic_auth_test.py
@@ -24,26 +24,23 @@
 # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import unittest
 import sys
+import unittest
 
 sys.path.append("../common")
 
 import test_util as tu
 import tritonclient.http as tritonhttpclient
 import tritonclient.http.aio as asynctritonhttpclient
-
-from tritonclient.http.auth import BasicAuth
 from tritonclient.http.aio.auth import BasicAuth as AsyncBasicAuth
+from tritonclient.http.auth import BasicAuth
 
 
 class HTTPBasicAuthTest(tu.TestResultCollector):
-
     def setUp(self):
         # Use the nginx port
-        self._client = tritonhttpclient.InferenceServerClient(
-            url='localhost:8004')
-        self._client.register_plugin(BasicAuth('username', 'password'))
+        self._client = tritonhttpclient.InferenceServerClient(url="localhost:8004")
+        self._client.register_plugin(BasicAuth("username", "password"))
 
     def test_client_call(self):
         self.assertTrue(self._client.is_server_live())
@@ -53,12 +50,10 @@ def tearDown(self):
 
 
 class HTTPBasicAuthAsyncTest(unittest.IsolatedAsyncioTestCase):
-
     async def asyncSetUp(self):
         # Use the nginx port
-        self._client = asynctritonhttpclient.InferenceServerClient(
-            url='localhost:8004')
-        self._client.register_plugin(AsyncBasicAuth('username', 'password'))
+        self._client = asynctritonhttpclient.InferenceServerClient(url="localhost:8004")
+        self._client.register_plugin(AsyncBasicAuth("username", "password"))
 
     async def test_client_call(self):
         self.assertTrue(await self._client.is_server_live())
@@ -67,5 +62,5 @@ async def asyncTearDown(self):
         await self._client.close()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_http/http_client_plugin_test.py b/qa/L0_http/http_client_plugin_test.py
old mode 100644
new mode 100755
index e110b9fdea..963ea2a81b
--- a/qa/L0_http/http_client_plugin_test.py
+++ b/qa/L0_http/http_client_plugin_test.py
@@ -30,18 +30,18 @@
 sys.path.append("../common")
 
 import unittest
-from unittest.mock import AsyncMock, patch, MagicMock
+from unittest.mock import AsyncMock, MagicMock, patch
+
 import numpy as np
 import test_util as tu
 import tritonclient.http as tritonhttpclient
+import tritonclient.http.aio as asynctritonhttpclient
 from tritonclient.http import InferenceServerClientPlugin
 from tritonclient.utils import np_to_triton_dtype
-import tritonclient.http.aio as asynctritonhttpclient
 
 
 # A simple plugin that adds headers to the inference request.
 class TestPlugin(InferenceServerClientPlugin):
-
     def __init__(self, headers):
         self._headers = headers
 
@@ -50,12 +50,10 @@ def __call__(self, request):
 
 
 class HTTPClientPluginAsyncTest(unittest.IsolatedAsyncioTestCase):
-
     async def asyncSetUp(self):
-        self._headers = {'MY-KEY': 'MY-VALUE'}
+        self._headers = {"MY-KEY": "MY-VALUE"}
         self._plugin = TestPlugin(self._headers)
-        self._client = asynctritonhttpclient.InferenceServerClient(
-            url='localhost:8001')
+        self._client = asynctritonhttpclient.InferenceServerClient(url="localhost:8001")
 
     async def test_server_is_live(self):
         # We are testing is_server_live as an example API that uses GET method
@@ -65,15 +63,15 @@ async def test_server_is_live(self):
         self._client.register_plugin(self._plugin)
         self.assertEqual(self._plugin, self._client.plugin())
         await self._client.is_server_live()
-        self._client._stub.get.assert_awaited_with(url=unittest.mock.ANY,
-                                                   headers=self._headers)
+        self._client._stub.get.assert_awaited_with(
+            url=unittest.mock.ANY, headers=self._headers
+        )
 
         # Make sure unregistering the plugin would no longer add the headers
         self._client.unregister_plugin()
         self.assertEqual(None, self._client.plugin())
         await self._client.is_server_live()
-        self._client._stub.get.assert_awaited_with(url=unittest.mock.ANY,
-                                                   headers={})
+        self._client._stub.get.assert_awaited_with(url=unittest.mock.ANY, headers={})
 
     async def test_simple_infer(self):
         # Only the read function must return async
@@ -87,21 +85,22 @@ async def test_simple_infer(self):
         # Setup inputs
         inputs = []
         inputs.append(
-            tritonhttpclient.InferInput('INPUT0', np_input.shape,
-                                        np_to_triton_dtype(np_input.dtype)))
+            tritonhttpclient.InferInput(
+                "INPUT0", np_input.shape, np_to_triton_dtype(np_input.dtype)
+            )
+        )
 
         # Set the binary data to False so that 'Inference-Header-Length' is not
         # added to the headers.
         inputs[0].set_data_from_numpy(np_input, binary_data=False)
 
         async def run_infer(headers):
-            with patch('tritonclient.http.aio._raise_if_error'):
-                with patch('tritonclient.http.aio.InferResult'):
+            with patch("tritonclient.http.aio._raise_if_error"):
+                with patch("tritonclient.http.aio.InferResult"):
                     await self._client.infer(model_name=model, inputs=inputs)
                     self._client._stub.post.assert_awaited_with(
-                        url=unittest.mock.ANY,
-                        data=unittest.mock.ANY,
-                        headers=headers)
+                        url=unittest.mock.ANY, data=unittest.mock.ANY, headers=headers
+                    )
 
         self._client.register_plugin(self._plugin)
         await run_infer(self._headers)
@@ -114,12 +113,10 @@ async def asyncTearDown(self):
 
 
 class HTTPClientPluginTest(tu.TestResultCollector):
-
     def setUp(self):
-        self._headers = {'MY-KEY': 'MY-VALUE'}
+        self._headers = {"MY-KEY": "MY-VALUE"}
         self._plugin = TestPlugin(self._headers)
-        self._client = tritonhttpclient.InferenceServerClient(
-            url='localhost:8001')
+        self._client = tritonhttpclient.InferenceServerClient(url="localhost:8001")
 
         # Use magic mock for the client stub
         self._client._client_stub = MagicMock()
@@ -129,14 +126,14 @@ def test_server_is_live(self):
         # for communication with the server.
         self._client.register_plugin(self._plugin)
         self._client.is_server_live()
-        self._client._client_stub.get.assert_called_with(unittest.mock.ANY,
-                                                         headers=self._headers)
+        self._client._client_stub.get.assert_called_with(
+            unittest.mock.ANY, headers=self._headers
+        )
 
         # Make sure unregistering the plugin would no longer add the headers
         self._client.unregister_plugin()
         self._client.is_server_live()
-        self._client._client_stub.get.assert_called_with(unittest.mock.ANY,
-                                                         headers={})
+        self._client._client_stub.get.assert_called_with(unittest.mock.ANY, headers={})
 
     def test_simple_infer(self):
         np_input = np.arange(8, dtype=np.float32).reshape(1, -1)
@@ -145,21 +142,24 @@ def test_simple_infer(self):
         # Setup inputs
         inputs = []
         inputs.append(
-            tritonhttpclient.InferInput('INPUT0', np_input.shape,
-                                        np_to_triton_dtype(np_input.dtype)))
+            tritonhttpclient.InferInput(
+                "INPUT0", np_input.shape, np_to_triton_dtype(np_input.dtype)
+            )
+        )
 
         # Set the binary data to False so that 'Inference-Header-Length' is not
         # added to the headers.
         inputs[0].set_data_from_numpy(np_input, binary_data=False)
 
         def run_infer(headers):
-            with patch('tritonclient.http._client._raise_if_error'):
-                with patch('tritonclient.http._client.InferResult'):
+            with patch("tritonclient.http._client._raise_if_error"):
+                with patch("tritonclient.http._client.InferResult"):
                     self._client.infer(model_name=model, inputs=inputs)
                     self._client._client_stub.post.assert_called_with(
                         request_uri=unittest.mock.ANY,
                         body=unittest.mock.ANY,
-                        headers=headers)
+                        headers=headers,
+                    )
 
         self._client.register_plugin(self._plugin)
         run_infer(self._headers)
@@ -171,5 +171,5 @@ def tearDown(self):
         self._client.close()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
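
For reference, the plugin mechanism exercised above is just a callable invoked before every outgoing request; a minimal sketch follows, not part of the patch, mirroring the TestPlugin pattern in this file. The header values and port are illustrative only, and the assumption is that the request object passed to the plugin exposes a mutable headers dict as the test relies on.

# Minimal sketch of an HTTP client plugin that injects headers on every request.
import tritonclient.http as tritonhttpclient
from tritonclient.http import InferenceServerClientPlugin


class HeaderPlugin(InferenceServerClientPlugin):
    def __init__(self, headers):
        self._headers = headers

    def __call__(self, request):
        # Called before each request; add our headers to the outgoing set.
        request.headers.update(self._headers)


client = tritonhttpclient.InferenceServerClient(url="localhost:8000")
client.register_plugin(HeaderPlugin({"MY-KEY": "MY-VALUE"}))
client.is_server_live()  # request carries "MY-KEY: MY-VALUE"
client.unregister_plugin()
client.close()
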
diff --git a/qa/L0_http/http_test.py b/qa/L0_http/http_test.py
old mode 100644
new mode 100755
index 2482227a8f..a6fa0bcccd
--- a/qa/L0_http/http_test.py
+++ b/qa/L0_http/http_test.py
@@ -29,40 +29,39 @@
 
 sys.path.append("../common")
 
-import requests
 import unittest
+
 import numpy as np
+import requests
 import test_util as tu
 import tritonclient.http as tritonhttpclient
-from tritonclient.utils import np_to_triton_dtype, InferenceServerException
+from tritonclient.utils import InferenceServerException, np_to_triton_dtype
 
 
 class HttpTest(tu.TestResultCollector):
-
     def _get_infer_url(self, model_name):
         return "http://localhost:8000/v2/models/{}/infer".format(model_name)
 
-    def _raw_binary_helper(self,
-                           model,
-                           input_bytes,
-                           expected_output_bytes,
-                           extra_headers={}):
+    def _raw_binary_helper(
+        self, model, input_bytes, expected_output_bytes, extra_headers={}
+    ):
         # Select model that satisfies constraints for raw binary request
-        headers = {'Inference-Header-Content-Length': '0'}
+        headers = {"Inference-Header-Content-Length": "0"}
         # Add extra headers (if any) before sending request
         headers.update(extra_headers)
-        r = requests.post(self._get_infer_url(model),
-                          data=input_bytes,
-                          headers=headers)
+        r = requests.post(self._get_infer_url(model), data=input_bytes, headers=headers)
         r.raise_for_status()
 
         # Get the inference header size so we can locate the output binary data
         header_size = int(r.headers["Inference-Header-Content-Length"])
         # Assert input == output since this tests an identity model
         self.assertEqual(
-            expected_output_bytes, r.content[header_size:],
-            "Expected response body contains correct output binary data: {}; got: {}"
-            .format(expected_output_bytes, r.content[header_size:]))
+            expected_output_bytes,
+            r.content[header_size:],
+            "Expected response body contains correct output binary data: {}; got: {}".format(
+                expected_output_bytes, r.content[header_size:]
+            ),
+        )
 
     def test_raw_binary(self):
         model = "onnx_zero_1_float32"
@@ -80,54 +79,61 @@ def test_byte(self):
         # i.e. BYTE type the element count must be 1
         model = "onnx_zero_1_object_1_element"
         input = "427"
-        headers = {'Inference-Header-Content-Length': '0'}
-        r = requests.post(self._get_infer_url(model),
-                          data=input,
-                          headers=headers)
+        headers = {"Inference-Header-Content-Length": "0"}
+        r = requests.post(self._get_infer_url(model), data=input, headers=headers)
         r.raise_for_status()
 
         # Get the inference header size so we can locate the output binary data
         header_size = int(r.headers["Inference-Header-Content-Length"])
         # Triton returns BYTES tensor with byte size prepended
-        output = r.content[header_size + 4:].decode()
+        output = r.content[header_size + 4 :].decode()
         self.assertEqual(
-            input, output,
-            "Expected response body contains correct output binary data: {}; got: {}"
-            .format(input, output))
+            input,
+            output,
+            "Expected response body contains correct output binary data: {}; got: {}".format(
+                input, output
+            ),
+        )
 
     def test_byte_too_many_elements(self):
         # Select model that doesn't satisfy constraints for raw binary request
         # i.e. BYTE type the element count must be 1
         model = "onnx_zero_1_object"
         input = "427"
-        headers = {'Inference-Header-Content-Length': '0'}
-        r = requests.post(self._get_infer_url(model),
-                          data=input,
-                          headers=headers)
+        headers = {"Inference-Header-Content-Length": "0"}
+        r = requests.post(self._get_infer_url(model), data=input, headers=headers)
         self.assertEqual(
-            400, r.status_code,
+            400,
+            r.status_code,
             "Expected error code {} returned for the request; got: {}".format(
-                400, r.status_code))
+                400, r.status_code
+            ),
+        )
         self.assertIn(
             "For BYTE datatype raw input, the model must have input shape [1]",
-            r.content.decode())
+            r.content.decode(),
+        )
 
     def test_multi_variable_dimensions(self):
         # Select model that doesn't satisfy constraints for raw binary request
         # i.e. this model has multiple variable-sized dimensions
         model = "onnx_zero_1_float16"
         input = np.ones([2, 2], dtype=np.float16)
-        headers = {'Inference-Header-Content-Length': '0'}
-        r = requests.post(self._get_infer_url(model),
-                          data=input.tobytes(),
-                          headers=headers)
+        headers = {"Inference-Header-Content-Length": "0"}
+        r = requests.post(
+            self._get_infer_url(model), data=input.tobytes(), headers=headers
+        )
         self.assertEqual(
-            400, r.status_code,
+            400,
+            r.status_code,
             "Expected error code {} returned for the request; got: {}".format(
-                400, r.status_code))
+                400, r.status_code
+            ),
+        )
         self.assertIn(
             "The shape of the raw input 'INPUT0' can not be deduced because there are more than one variable-sized dimension",
-            r.content.decode())
+            r.content.decode(),
+        )
 
     def test_multi_inputs(self):
         # Select model that doesn't satisfy constraints for raw binary request
@@ -136,21 +142,25 @@ def test_multi_inputs(self):
         # Use one numpy array, after tobytes() it can be seen as three inputs
         # each with 8 elements (this ambiguity is why this is not allowed)
         input = np.arange(24, dtype=np.float32)
-        headers = {'Inference-Header-Content-Length': '0'}
-        r = requests.post(self._get_infer_url(model),
-                          data=input.tobytes(),
-                          headers=headers)
+        headers = {"Inference-Header-Content-Length": "0"}
+        r = requests.post(
+            self._get_infer_url(model), data=input.tobytes(), headers=headers
+        )
         self.assertEqual(
-            400, r.status_code,
+            400,
+            r.status_code,
             "Expected error code {} returned for the request; got: {}".format(
-                400, r.status_code))
+                400, r.status_code
+            ),
+        )
         self.assertIn(
             "Raw request must only have 1 input (found 1) to be deduced but got 3 inputs in",
-            r.content.decode())
+            r.content.decode(),
+        )
 
     # This is to test that a properly chunk-encoded request by the caller works,
     # though Triton does not specifically do any special chunk handling outside
-    # of underlying HTTP libaries used
+    # of underlying HTTP libraries used
     # Future Enhancement: Test other encodings as they come up
     def test_content_encoding_chunked_manually(self):
         # Similar to test_raw_binary but test with extra headers
@@ -165,9 +175,8 @@ def test_content_encoding_chunked_manually(self):
         # Chunk bytes and line separator
         chunk_encoded_input += input_bytes + b"\r\n"
         # Final byte (0) and end message
-        chunk_encoded_input += b'0\r\n\r\n'
-        self._raw_binary_helper(model, chunk_encoded_input, input_bytes,
-                                extra_headers)
+        chunk_encoded_input += b"0\r\n\r\n"
+        self._raw_binary_helper(model, chunk_encoded_input, input_bytes, extra_headers)
 
     # Test that Python client rejects any "Transfer-Encoding" HTTP headers
     # as we don't specially handle encoding requests for the user through
@@ -183,20 +192,19 @@ def test_content_encoding_unsupported_client(self):
                 inputs = []
                 inputs.append(
                     tritonhttpclient.InferInput(
-                        'INPUT0', np_input.shape,
-                        np_to_triton_dtype(np_input.dtype)))
+                        "INPUT0", np_input.shape, np_to_triton_dtype(np_input.dtype)
+                    )
+                )
                 inputs[0].set_data_from_numpy(np_input)
 
-                with tritonhttpclient.InferenceServerClient(
-                        "localhost:8000") as client:
+                with tritonhttpclient.InferenceServerClient("localhost:8000") as client:
                     # Python client is expected to raise an exception to reject
                     # 'content-encoding' HTTP headers.
-                    with self.assertRaisesRegex(InferenceServerException,
-                                                "Unsupported HTTP header"):
-                        client.infer(model_name=model,
-                                     inputs=inputs,
-                                     headers=headers)
+                    with self.assertRaisesRegex(
+                        InferenceServerException, "Unsupported HTTP header"
+                    ):
+                        client.infer(model_name=model, inputs=inputs, headers=headers)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
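
For reference, the raw-binary path driven by _raw_binary_helper above can be reproduced with plain requests: send the tensor bytes with "Inference-Header-Content-Length: 0", then use the same header on the response to skip the JSON portion of the reply. A minimal sketch, not part of the patch; the model name comes from the test, while the input shape is an assumption.

# Minimal sketch of a raw binary inference request against an identity model.
import numpy as np
import requests

model = "onnx_zero_1_float32"
url = "http://localhost:8000/v2/models/{}/infer".format(model)
input_bytes = np.arange(8, dtype=np.float32).tobytes()

# Zero header length tells Triton the body is raw tensor data only.
r = requests.post(
    url, data=input_bytes, headers={"Inference-Header-Content-Length": "0"}
)
r.raise_for_status()

# The response prepends a JSON inference header; its length is reported back.
header_size = int(r.headers["Inference-Header-Content-Length"])
output_bytes = r.content[header_size:]
assert output_bytes == input_bytes  # identity model echoes the input
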
diff --git a/qa/L0_http/python_http_aio_test.py b/qa/L0_http/python_http_aio_test.py
old mode 100644
new mode 100755
index d31b9c71f2..bd8d342bb1
--- a/qa/L0_http/python_http_aio_test.py
+++ b/qa/L0_http/python_http_aio_test.py
@@ -32,12 +32,10 @@
 
 
 class TestHttpAioClient(unittest.IsolatedAsyncioTestCase):
-    """Test if aio rpc can reach the server
-    """
+    """Test if aio rpc can reach the server"""
 
     async def asyncSetUp(self):
-        self._triton_client = httpclient.InferenceServerClient(
-            url="localhost:8000")
+        self._triton_client = httpclient.InferenceServerClient(url="localhost:8000")
 
     async def asyncTearDown(self):
         await self._triton_client.close()
@@ -72,15 +70,15 @@ async def test_get_model_repository_index(self):
 
     async def test_load_model(self):
         with self.assertRaisesRegex(
-                InferenceServerException,
-                "explicit model load / unload is not allowed if polling is enabled"
+            InferenceServerException,
+            "explicit model load / unload is not allowed if polling is enabled",
         ):
             await self._triton_client.load_model("simple")
 
     async def test_unload_model(self):
         with self.assertRaisesRegex(
-                InferenceServerException,
-                "explicit model load / unload is not allowed if polling is enabled"
+            InferenceServerException,
+            "explicit model load / unload is not allowed if polling is enabled",
         ):
             await self._triton_client.load_model("simple")
 
diff --git a/qa/L0_http/test.sh b/qa/L0_http/test.sh
old mode 100644
new mode 100755
index 56c1782879..c08a5fba74
--- a/qa/L0_http/test.sh
+++ b/qa/L0_http/test.sh
@@ -251,7 +251,7 @@ fi
 
 # Create a password file with username:password
 echo -n 'username:' > pswd
-echo "password" | openssl passwd -stdin -apr1 >> pswd  
+echo "password" | openssl passwd -stdin -apr1 >> pswd
 nginx -c `pwd`/$NGINX_CONF
 
 python3 $BASIC_AUTH_TEST
@@ -504,7 +504,7 @@ wait $SERVER_PID
 # Run cpp client unit test
 rm -rf unit_test_models && mkdir unit_test_models
 cp -r $DATADIR/qa_model_repository/onnx_int32_int32_int32 unit_test_models/.
-cp -r ${MODELDIR}/simple unit_test_models/. 
+cp -r ${MODELDIR}/simple unit_test_models/.
 
 SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=unit_test_models
             --trace-file=global_unittest.log --trace-level=TIMESTAMPS --trace-rate=1"
diff --git a/qa/L0_http_fuzz/fuzztest.py b/qa/L0_http_fuzz/fuzztest.py
old mode 100644
new mode 100755
index 4c2704ec40..8e84ffffc7
--- a/qa/L0_http_fuzz/fuzztest.py
+++ b/qa/L0_http_fuzz/fuzztest.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -28,28 +30,29 @@
 
 sys.path.append("../common")
 
+import glob
+import os
+import sqlite3
 import unittest
+
 import test_util as tu
-import sqlite3
 from boofuzz import *
-import glob
-import os
 
 
 class FuzzTest(tu.TestResultCollector):
-
     def _run_fuzz(self, url, logger):
         session = Session(
             target=Target(connection=TCPSocketConnection("127.0.0.1", 8000)),
             fuzz_loggers=logger,
-            keep_web_open=False)
+            keep_web_open=False,
+        )
 
         s_initialize(name="Request" + url)
         with s_block("Request-Line"):
-            s_group("Method", [
-                "GET", "HEAD", "POST", "PUT", "DELETE", "CONNECT", "OPTIONS",
-                "TRACE"
-            ])
+            s_group(
+                "Method",
+                ["GET", "HEAD", "POST", "PUT", "DELETE", "CONNECT", "OPTIONS", "TRACE"],
+            )
             s_delim(" ", name="space-1")
             s_string(url, name="Request-URI")
             s_delim(" ", name="space-2")
@@ -62,28 +65,36 @@ def _run_fuzz(self, url, logger):
 
     def test_failures_from_db(self):
         url_list = [
-            "/v2", "/v2/models/simple", "/v2/models/simple/infer",
-            "/v2/models/simple/versions/v1", "/v2/models/simple/config",
-            "/v2/models/simple/stats", "/v2/models/simple/ready",
-            "/v2/health/ready", "/v2/health/live", "/v2/repository/index",
+            "/v2",
+            "/v2/models/simple",
+            "/v2/models/simple/infer",
+            "/v2/models/simple/versions/v1",
+            "/v2/models/simple/config",
+            "/v2/models/simple/stats",
+            "/v2/models/simple/ready",
+            "/v2/health/ready",
+            "/v2/health/live",
+            "/v2/repository/index",
             "/v2/repository/models/simple/unload",
             "/v2/repository/models/simple/load",
-            "/v2/systemsharedmemory/status", "/v2/systemsharedmemory/register",
+            "/v2/systemsharedmemory/status",
+            "/v2/systemsharedmemory/register",
             "/v2/systemsharedmemory/unregister",
             "/v2/systemsharedmemory/region/xx/status",
-            "/v2/cudasharedmemory/status", "/v2/cudasharedmemory/register",
+            "/v2/cudasharedmemory/status",
+            "/v2/cudasharedmemory/register",
             "/v2/cudasharedmemory/unregister",
-            "/v2/cudasharedmemory/region/xx/status"
+            "/v2/cudasharedmemory/region/xx/status",
         ]
 
-        csv_log = open('fuzz_results.csv', 'w')
+        csv_log = open("fuzz_results.csv", "w")
         logger = [FuzzLoggerCsv(file_handle=csv_log)]
 
         for url in url_list:
             self._run_fuzz(url, logger)
 
             # Get latest db file
-            files = glob.glob('boofuzz-results/*')
+            files = glob.glob("boofuzz-results/*")
             dbfile = max(files, key=os.path.getctime)
 
             conn = sqlite3.connect(dbfile)
@@ -91,10 +102,8 @@ def test_failures_from_db(self):
 
             # Get number of failures, should be 0
             self.assertEqual(
-                len([
-                    x for x in c.execute(
-                        "SELECT * FROM steps WHERE type=\"fail\"")
-                ]), 0)
+                len([x for x in c.execute('SELECT * FROM steps WHERE type="fail"')]), 0
+            )
 
 
 if __name__ == "__main__":
diff --git a/qa/L0_http_fuzz/test.sh b/qa/L0_http_fuzz/test.sh
old mode 100644
new mode 100755
index 372fe5a242..f721135698
--- a/qa/L0_http_fuzz/test.sh
+++ b/qa/L0_http_fuzz/test.sh
@@ -53,15 +53,15 @@ FUZZ_LOG=`pwd`/fuzz.log
 DATADIR=`pwd`/models
 SERVER=/opt/tritonserver/bin/tritonserver
 SERVER_ARGS="--model-repository=$DATADIR"
-source ../common/util.sh 
+source ../common/util.sh
 
 # Remove this once foobuzz and tornado packages upgrade to work with python 3.10
-# This test tests the server's ability to handle poor input and not the compatibility 
+# This test tests the server's ability to handle poor input and not the compatibility
 # with python 3.10. Python 3.8 is ok to use here.
 function_install_python38() {
     source ../L0_backend_python/common.sh
     install_conda
-    create_conda_env "3.8" "python-3-8" 
+    create_conda_env "3.8" "python-3-8"
 
     # Install test script dependencies
     pip3 install --upgrade wheel setuptools boofuzz==0.3.0 numpy pillow attrdict future grpcio requests gsutil \
diff --git a/qa/L0_https/test.sh b/qa/L0_https/test.sh
old mode 100644
new mode 100755
index 7e3f4696d1..7fe03b843e
--- a/qa/L0_https/test.sh
+++ b/qa/L0_https/test.sh
@@ -57,23 +57,23 @@ rm -f *.key *.crt ${CLIENT_LOG}.* server.log
 
 # Generate valid CA
 openssl genrsa -passout pass:1234 -des3 -out ca.key 4096
-openssl req -passin pass:1234 -new -x509 -days 365 -key ca.key -out ca.crt -subj  "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Test/CN=Root CA"
+openssl req -passin pass:1234 -new -x509 -days 365 -key ca.key -out ca.crt -subj  "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Test/CN=Root CA"
 
 # Generate valid Server Key/Cert
 openssl genrsa -passout pass:1234 -des3 -out server.key 4096
-openssl req -passin pass:1234 -new -key server.key -out server.csr -subj  "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Server/CN=localhost"
-openssl x509 -req -passin pass:1234 -days 365 -in server.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out server.crt
+openssl req -passin pass:1234 -new -key server.key -out server.csr -subj  "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Server/CN=localhost"
+openssl x509 -req -passin pass:1234 -days 365 -in server.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out server.crt
 
 # Remove passphrase from the Server Key
-openssl rsa -passin pass:1234 -in server.key -out server.key
+openssl rsa -passin pass:1234 -in server.key -out server.key
 
 # Generate valid Client Key/Cert
 openssl genrsa -passout pass:1234 -des3 -out client.key 4096
-openssl req -passin pass:1234 -new -key client.key -out client.csr -subj  "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Client/CN=localhost"
-openssl x509 -passin pass:1234 -req -days 365 -in client.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out client.crt
+openssl req -passin pass:1234 -new -key client.key -out client.csr -subj  "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Client/CN=localhost"
+openssl x509 -passin pass:1234 -req -days 365 -in client.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out client.crt
 
 # Remove passphrase from Client Key
-openssl rsa -passin pass:1234 -in client.key -out client.key
+openssl rsa -passin pass:1234 -in client.key -out client.key
 
 # Create mutated client key (Make first char of each like capital)
 cp client.key client2.key && sed -i "s/\b\(.\)/\u\1/g" client2.key
@@ -135,7 +135,7 @@ if [ $? -ne 0 ]; then
 fi
 
 # Test failure cases for SSL
-# Try without SSL 
+# Try without SSL
 $SIMPLE_INFER_CLIENT_PY -v -u localhost >> ${CLIENT_LOG}.no_ssl_fail_infer 2>&1
 if [ $? -ne 0 ]; then
     cat ${CLIENT_LOG}.no_ssl_fail_infer
diff --git a/qa/L0_implicit_state/implicit_state.py b/qa/L0_implicit_state/implicit_state.py
old mode 100644
new mode 100755
index 147697cf16..db8053dcb1
--- a/qa/L0_implicit_state/implicit_state.py
+++ b/qa/L0_implicit_state/implicit_state.py
@@ -187,4 +187,4 @@ def test_request_output(self):
 
 
 if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/qa/L0_implicit_state/test.sh b/qa/L0_implicit_state/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_infer/infer_test.py b/qa/L0_infer/infer_test.py
old mode 100644
new mode 100755
index 1e0e172a13..d97803b17d
--- a/qa/L0_infer/infer_test.py
+++ b/qa/L0_infer/infer_test.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -28,67 +30,66 @@
 
 sys.path.append("../common")
 
+import os
 import unittest
-import numpy as np
+
 import infer_util as iu
+import numpy as np
 import test_util as tu
-import os
-
 from tritonclient.utils import *
 
-TEST_SYSTEM_SHARED_MEMORY = bool(
-    int(os.environ.get('TEST_SYSTEM_SHARED_MEMORY', 0)))
-TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get('TEST_CUDA_SHARED_MEMORY',
-                                                  0)))
-CPU_ONLY = (os.environ.get('TRITON_SERVER_CPU_ONLY') is not None)
-TEST_VALGRIND = bool(int(os.environ.get('TEST_VALGRIND', 0)))
+TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0)))
+TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0)))
+CPU_ONLY = os.environ.get("TRITON_SERVER_CPU_ONLY") is not None
+TEST_VALGRIND = bool(int(os.environ.get("TEST_VALGRIND", 0)))
 
-USE_GRPC = (os.environ.get('USE_GRPC', 1) != "0")
-USE_HTTP = (os.environ.get('USE_HTTP', 1) != "0")
+USE_GRPC = os.environ.get("USE_GRPC", 1) != "0"
+USE_HTTP = os.environ.get("USE_HTTP", 1) != "0"
 assert USE_GRPC or USE_HTTP, "USE_GRPC or USE_HTTP must be non-zero"
 
 BACKENDS = os.environ.get(
-    'BACKENDS',
-    "graphdef savedmodel onnx libtorch plan python python_dlpack openvino")
-ENSEMBLES = bool(int(os.environ.get('ENSEMBLES', 1)))
-NOBATCH = bool(int(os.environ.get('NOBATCH', 1)))
-BATCH = bool(int(os.environ.get('BATCH', 1)))
+    "BACKENDS", "graphdef savedmodel onnx libtorch plan python python_dlpack openvino"
+)
+ENSEMBLES = bool(int(os.environ.get("ENSEMBLES", 1)))
+NOBATCH = bool(int(os.environ.get("NOBATCH", 1)))
+BATCH = bool(int(os.environ.get("BATCH", 1)))
 
 np_dtype_string = np.dtype(object)
 
 
 class InferTest(tu.TestResultCollector):
-
     def _full_exact(
-            self,
+        self,
+        input_dtype,
+        output0_dtype,
+        output1_dtype,
+        output0_raw,
+        output1_raw,
+        swap,
+        # 60 sec is the default value
+        network_timeout=60.0,
+    ):
+        def _infer_exact_helper(
+            tester,
+            pf,
+            tensor_shape,
+            batch_size,
             input_dtype,
             output0_dtype,
             output1_dtype,
-            output0_raw,
-            output1_raw,
-            swap,
-            # 60 sec is the default value
-            network_timeout=60.0):
-
-        def _infer_exact_helper(tester,
-                                pf,
-                                tensor_shape,
-                                batch_size,
-                                input_dtype,
-                                output0_dtype,
-                                output1_dtype,
-                                output0_raw=True,
-                                output1_raw=True,
-                                model_version=None,
-                                swap=False,
-                                outputs=("OUTPUT0", "OUTPUT1"),
-                                use_http=USE_HTTP,
-                                use_grpc=USE_GRPC,
-                                use_http_json_tensors=True,
-                                skip_request_id_check=True,
-                                use_streaming=True,
-                                correlation_id=0,
-                                network_timeout=60.0):
+            output0_raw=True,
+            output1_raw=True,
+            model_version=None,
+            swap=False,
+            outputs=("OUTPUT0", "OUTPUT1"),
+            use_http=USE_HTTP,
+            use_grpc=USE_GRPC,
+            use_http_json_tensors=True,
+            skip_request_id_check=True,
+            use_streaming=True,
+            correlation_id=0,
+            network_timeout=60.0,
+        ):
             for bs in (1, batch_size):
                 # model that does not support batching
                 if NOBATCH:
@@ -114,13 +115,15 @@ def _infer_exact_helper(tester,
                             correlation_id=correlation_id,
                             use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                             use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
-                            network_timeout=network_timeout)
+                            network_timeout=network_timeout,
+                        )
 
                 if BATCH:
                     # model that supports batching.
                     iu.infer_exact(
                         tester,
-                        pf, (bs,) + tensor_shape,
+                        pf,
+                        (bs,) + tensor_shape,
                         bs,
                         input_dtype,
                         output0_dtype,
@@ -138,7 +141,8 @@ def _infer_exact_helper(tester,
                         correlation_id=correlation_id,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                         use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
-                        network_timeout=network_timeout)
+                        network_timeout=network_timeout,
+                    )
 
         input_size = 16
 
@@ -146,89 +150,131 @@ def _infer_exact_helper(tester,
         ensemble_prefix = [""]
         if ENSEMBLES:
             for prefix in all_ensemble_prefix:
-                if tu.validate_for_ensemble_model(prefix, input_dtype,
-                                                  output0_dtype, output1_dtype,
-                                                  (input_size,), (input_size,),
-                                                  (input_size,)):
+                if tu.validate_for_ensemble_model(
+                    prefix,
+                    input_dtype,
+                    output0_dtype,
+                    output1_dtype,
+                    (input_size,),
+                    (input_size,),
+                    (input_size,),
+                ):
                     ensemble_prefix.append(prefix)
 
-        if tu.validate_for_tf_model(input_dtype, output0_dtype, output1_dtype,
-                                    (input_size,), (input_size,),
-                                    (input_size,)):
+        if tu.validate_for_tf_model(
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            (input_size,),
+            (input_size,),
+            (input_size,),
+        ):
             for prefix in ensemble_prefix:
                 for pf in ["graphdef", "savedmodel"]:
                     if pf in BACKENDS:
-                        _infer_exact_helper(self,
-                                            prefix + pf, (input_size,),
-                                            8,
-                                            input_dtype,
-                                            output0_dtype,
-                                            output1_dtype,
-                                            output0_raw=output0_raw,
-                                            output1_raw=output1_raw,
-                                            swap=swap,
-                                            network_timeout=network_timeout)
+                        _infer_exact_helper(
+                            self,
+                            prefix + pf,
+                            (input_size,),
+                            8,
+                            input_dtype,
+                            output0_dtype,
+                            output1_dtype,
+                            output0_raw=output0_raw,
+                            output1_raw=output1_raw,
+                            swap=swap,
+                            network_timeout=network_timeout,
+                        )
 
         if not CPU_ONLY and tu.validate_for_trt_model(
-                input_dtype, output0_dtype, output1_dtype, (input_size, 1, 1),
-            (input_size, 1, 1), (input_size, 1, 1)):
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            (input_size, 1, 1),
+            (input_size, 1, 1),
+            (input_size, 1, 1),
+        ):
             for prefix in ensemble_prefix:
-                if 'plan' in BACKENDS:
+                if "plan" in BACKENDS:
                     if input_dtype == np.int8:
-                        _infer_exact_helper(self,
-                                            prefix + 'plan', (input_size, 1, 1),
-                                            8,
-                                            input_dtype,
-                                            output0_dtype,
-                                            output1_dtype,
-                                            output0_raw=output0_raw,
-                                            output1_raw=output1_raw,
-                                            swap=swap)
+                        _infer_exact_helper(
+                            self,
+                            prefix + "plan",
+                            (input_size, 1, 1),
+                            8,
+                            input_dtype,
+                            output0_dtype,
+                            output1_dtype,
+                            output0_raw=output0_raw,
+                            output1_raw=output1_raw,
+                            swap=swap,
+                        )
                     else:
-                        _infer_exact_helper(self,
-                                            prefix + 'plan', (input_size,),
-                                            8,
-                                            input_dtype,
-                                            output0_dtype,
-                                            output1_dtype,
-                                            output0_raw=output0_raw,
-                                            output1_raw=output1_raw,
-                                            swap=swap)
-
-        if tu.validate_for_onnx_model(input_dtype, output0_dtype, output1_dtype,
-                                      (input_size,), (input_size,),
-                                      (input_size,)):
+                        _infer_exact_helper(
+                            self,
+                            prefix + "plan",
+                            (input_size,),
+                            8,
+                            input_dtype,
+                            output0_dtype,
+                            output1_dtype,
+                            output0_raw=output0_raw,
+                            output1_raw=output1_raw,
+                            swap=swap,
+                        )
+
+        if tu.validate_for_onnx_model(
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            (input_size,),
+            (input_size,),
+            (input_size,),
+        ):
             for prefix in ensemble_prefix:
-                if 'onnx' in BACKENDS:
-                    _infer_exact_helper(self,
-                                        prefix + 'onnx', (input_size,),
-                                        8,
-                                        input_dtype,
-                                        output0_dtype,
-                                        output1_dtype,
-                                        output0_raw=output0_raw,
-                                        output1_raw=output1_raw,
-                                        swap=swap)
-
-        if tu.validate_for_libtorch_model(input_dtype, output0_dtype,
-                                          output1_dtype, (input_size,),
-                                          (input_size,), (input_size,)):
+                if "onnx" in BACKENDS:
+                    _infer_exact_helper(
+                        self,
+                        prefix + "onnx",
+                        (input_size,),
+                        8,
+                        input_dtype,
+                        output0_dtype,
+                        output1_dtype,
+                        output0_raw=output0_raw,
+                        output1_raw=output1_raw,
+                        swap=swap,
+                    )
+
+        if tu.validate_for_libtorch_model(
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            (input_size,),
+            (input_size,),
+            (input_size,),
+        ):
             # Due to PyTorch bug
             # https://github.com/pytorch/pytorch/issues/66930 we can't
             # run this test with int8 input and int32 outputs.
-            if ((input_dtype == np.int8) and (output0_dtype == np.int32) and
-                (output1_dtype == np.int32)):
-                print('skipping pytorch test for int8_int32_int32')
+            if (
+                (input_dtype == np.int8)
+                and (output0_dtype == np.int32)
+                and (output1_dtype == np.int32)
+            ):
+                print("skipping pytorch test for int8_int32_int32")
             else:
                 for prefix in ensemble_prefix:
-                    if 'libtorch' in BACKENDS:
+                    if "libtorch" in BACKENDS:
                         # Skip batching for PyTorch String I/O
-                        if ((input_dtype == np_dtype_string) or
-                            (output0_dtype == np_dtype_string) or
-                            (output1_dtype == np_dtype_string)):
+                        if (
+                            (input_dtype == np_dtype_string)
+                            or (output0_dtype == np_dtype_string)
+                            or (output1_dtype == np_dtype_string)
+                        ):
                             iu.infer_exact(
                                 self,
-                                prefix + 'libtorch_nobatch',
+                                prefix + "libtorch_nobatch",
                                 (input_size,),
                                 1,  # batch_size
                                 input_dtype,
@@ -239,239 +285,259 @@ def _infer_exact_helper(tester,
                                 swap=swap,
                                 use_http=USE_HTTP,
                                 use_grpc=USE_GRPC,
-                                use_system_shared_memory=
-                                TEST_SYSTEM_SHARED_MEMORY,
-                                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                            )
                         else:
-                            _infer_exact_helper(self,
-                                                prefix + 'libtorch',
-                                                (input_size,),
-                                                8,
-                                                input_dtype,
-                                                output0_dtype,
-                                                output1_dtype,
-                                                output0_raw=output0_raw,
-                                                output1_raw=output1_raw,
-                                                swap=swap)
+                            _infer_exact_helper(
+                                self,
+                                prefix + "libtorch",
+                                (input_size,),
+                                8,
+                                input_dtype,
+                                output0_dtype,
+                                output1_dtype,
+                                output0_raw=output0_raw,
+                                output1_raw=output1_raw,
+                                swap=swap,
+                            )
 
         for prefix in ensemble_prefix:
             if prefix != "":
                 continue
-            if input_dtype == np.uint8 or output0_dtype == np.uint8 or output1_dtype == np.uint8:
+            if (
+                input_dtype == np.uint8
+                or output0_dtype == np.uint8
+                or output1_dtype == np.uint8
+            ):
                 continue
 
-            if 'python_dlpack' in BACKENDS:
-                _infer_exact_helper(self,
-                                    prefix + 'python_dlpack', (input_size,),
-                                    8,
-                                    input_dtype,
-                                    output0_dtype,
-                                    output1_dtype,
-                                    output0_raw=output0_raw,
-                                    output1_raw=output1_raw,
-                                    swap=swap)
-            elif 'python' in BACKENDS:
-                _infer_exact_helper(self,
-                                    prefix + 'python', (input_size,),
-                                    8,
-                                    input_dtype,
-                                    output0_dtype,
-                                    output1_dtype,
-                                    output0_raw=output0_raw,
-                                    output1_raw=output1_raw,
-                                    swap=swap)
+            if "python_dlpack" in BACKENDS:
+                _infer_exact_helper(
+                    self,
+                    prefix + "python_dlpack",
+                    (input_size,),
+                    8,
+                    input_dtype,
+                    output0_dtype,
+                    output1_dtype,
+                    output0_raw=output0_raw,
+                    output1_raw=output1_raw,
+                    swap=swap,
+                )
+            elif "python" in BACKENDS:
+                _infer_exact_helper(
+                    self,
+                    prefix + "python",
+                    (input_size,),
+                    8,
+                    input_dtype,
+                    output0_dtype,
+                    output1_dtype,
+                    output0_raw=output0_raw,
+                    output1_raw=output1_raw,
+                    swap=swap,
+                )
 
     def test_raw_uuu(self):
-        self._full_exact(np.uint8,
-                         np.uint8,
-                         np.uint8,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=True)
+        self._full_exact(
+            np.uint8, np.uint8, np.uint8, output0_raw=True, output1_raw=True, swap=True
+        )
 
     def test_raw_bbb(self):
-        self._full_exact(np.int8,
-                         np.int8,
-                         np.int8,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=True)
+        self._full_exact(
+            np.int8, np.int8, np.int8, output0_raw=True, output1_raw=True, swap=True
+        )
 
     def test_raw_sss(self):
-        self._full_exact(np.int16,
-                         np.int16,
-                         np.int16,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=True)
+        self._full_exact(
+            np.int16, np.int16, np.int16, output0_raw=True, output1_raw=True, swap=True
+        )
 
     def test_raw_iii(self):
-        self._full_exact(np.int32,
-                         np.int32,
-                         np.int32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=True)
+        self._full_exact(
+            np.int32, np.int32, np.int32, output0_raw=True, output1_raw=True, swap=True
+        )
 
     def test_raw_lll(self):
-        self._full_exact(np.int64,
-                         np.int64,
-                         np.int64,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int64, np.int64, np.int64, output0_raw=True, output1_raw=True, swap=False
+        )
 
     def test_raw_hhh(self):
-        self._full_exact(np.float16,
-                         np.float16,
-                         np.float16,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.float16,
+            np.float16,
+            np.float16,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_fff(self):
-        self._full_exact(np.float32,
-                         np.float32,
-                         np.float32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=True)
+        self._full_exact(
+            np.float32,
+            np.float32,
+            np.float32,
+            output0_raw=True,
+            output1_raw=True,
+            swap=True,
+        )
 
     def test_raw_hff(self):
-        self._full_exact(np.float16,
-                         np.float32,
-                         np.float32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.float16,
+            np.float32,
+            np.float32,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_bii(self):
-        self._full_exact(np.int8,
-                         np.int32,
-                         np.int32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int8, np.int32, np.int32, output0_raw=True, output1_raw=True, swap=False
+        )
 
     def test_raw_ibb(self):
-        self._full_exact(np.int32,
-                         np.int8,
-                         np.int8,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int32, np.int8, np.int8, output0_raw=True, output1_raw=True, swap=False
+        )
 
     def test_raw_ibs(self):
-        self._full_exact(np.int32,
-                         np.int8,
-                         np.int16,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int32, np.int8, np.int16, output0_raw=True, output1_raw=True, swap=False
+        )
 
     def test_raw_fuu(self):
-        self._full_exact(np.float32,
-                         np.uint8,
-                         np.uint8,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.float32,
+            np.uint8,
+            np.uint8,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_uff(self):
-        self._full_exact(np.uint8,
-                         np.float32,
-                         np.float32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.uint8,
+            np.float32,
+            np.float32,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_fuh(self):
-        self._full_exact(np.float32,
-                         np.uint8,
-                         np.float16,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.float32,
+            np.uint8,
+            np.float16,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_iff(self):
-        self._full_exact(np.int32,
-                         np.float32,
-                         np.float32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int32,
+            np.float32,
+            np.float32,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_fii(self):
-        self._full_exact(np.float32,
-                         np.int32,
-                         np.int32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.float32,
+            np.int32,
+            np.int32,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_ihs(self):
-        self._full_exact(np.int32,
-                         np.float16,
-                         np.int16,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int32,
+            np.float16,
+            np.int16,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_ooo(self):
-        self._full_exact(np_dtype_string,
-                         np_dtype_string,
-                         np_dtype_string,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np_dtype_string,
+            np_dtype_string,
+            np_dtype_string,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_oii(self):
-        self._full_exact(np_dtype_string,
-                         np.int32,
-                         np.int32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np_dtype_string,
+            np.int32,
+            np.int32,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_oio(self):
-        self._full_exact(np_dtype_string,
-                         np.int32,
-                         np_dtype_string,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np_dtype_string,
+            np.int32,
+            np_dtype_string,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_ooi(self):
-        self._full_exact(np_dtype_string,
-                         np_dtype_string,
-                         np.int32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np_dtype_string,
+            np_dtype_string,
+            np.int32,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_ioo(self):
-        self._full_exact(np.int32,
-                         np_dtype_string,
-                         np_dtype_string,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int32,
+            np_dtype_string,
+            np_dtype_string,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_iio(self):
-        self._full_exact(np.int32,
-                         np.int32,
-                         np_dtype_string,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int32,
+            np.int32,
+            np_dtype_string,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_ioi(self):
-        self._full_exact(np.int32,
-                         np_dtype_string,
-                         np.int32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int32,
+            np_dtype_string,
+            np.int32,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     # shared memory does not support class output
     if not (TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY):
@@ -486,95 +552,118 @@ def test_class_bbb(self):
                 swap=True,
                 # Increase network_timeout for TensorFlow models for
                 # valgrind test.
-                network_timeout=100.0 if TEST_VALGRIND else 60.0)
+                network_timeout=100.0 if TEST_VALGRIND else 60.0,
+            )
 
         def test_class_sss(self):
-            self._full_exact(np.int16,
-                             np.int16,
-                             np.int16,
-                             output0_raw=False,
-                             output1_raw=False,
-                             swap=True)
+            self._full_exact(
+                np.int16,
+                np.int16,
+                np.int16,
+                output0_raw=False,
+                output1_raw=False,
+                swap=True,
+            )
 
         def test_class_iii(self):
-            self._full_exact(np.int32,
-                             np.int32,
-                             np.int32,
-                             output0_raw=False,
-                             output1_raw=False,
-                             swap=True)
+            self._full_exact(
+                np.int32,
+                np.int32,
+                np.int32,
+                output0_raw=False,
+                output1_raw=False,
+                swap=True,
+            )
 
         def test_class_lll(self):
-            self._full_exact(np.int64,
-                             np.int64,
-                             np.int64,
-                             output0_raw=False,
-                             output1_raw=False,
-                             swap=False)
+            self._full_exact(
+                np.int64,
+                np.int64,
+                np.int64,
+                output0_raw=False,
+                output1_raw=False,
+                swap=False,
+            )
 
         def test_class_fff(self):
-            self._full_exact(np.float32,
-                             np.float32,
-                             np.float32,
-                             output0_raw=False,
-                             output1_raw=False,
-                             swap=True)
+            self._full_exact(
+                np.float32,
+                np.float32,
+                np.float32,
+                output0_raw=False,
+                output1_raw=False,
+                swap=True,
+            )
 
         def test_class_iff(self):
-            self._full_exact(np.int32,
-                             np.float32,
-                             np.float32,
-                             output0_raw=False,
-                             output1_raw=False,
-                             swap=False)
+            self._full_exact(
+                np.int32,
+                np.float32,
+                np.float32,
+                output0_raw=False,
+                output1_raw=False,
+                swap=False,
+            )
 
         def test_mix_bbb(self):
-            self._full_exact(np.int8,
-                             np.int8,
-                             np.int8,
-                             output0_raw=True,
-                             output1_raw=False,
-                             swap=True)
+            self._full_exact(
+                np.int8,
+                np.int8,
+                np.int8,
+                output0_raw=True,
+                output1_raw=False,
+                swap=True,
+            )
 
         def test_mix_sss(self):
-            self._full_exact(np.int16,
-                             np.int16,
-                             np.int16,
-                             output0_raw=False,
-                             output1_raw=True,
-                             swap=True)
+            self._full_exact(
+                np.int16,
+                np.int16,
+                np.int16,
+                output0_raw=False,
+                output1_raw=True,
+                swap=True,
+            )
 
         def test_mix_iii(self):
-            self._full_exact(np.int32,
-                             np.int32,
-                             np.int32,
-                             output0_raw=True,
-                             output1_raw=False,
-                             swap=True)
+            self._full_exact(
+                np.int32,
+                np.int32,
+                np.int32,
+                output0_raw=True,
+                output1_raw=False,
+                swap=True,
+            )
 
         def test_mix_lll(self):
-            self._full_exact(np.int64,
-                             np.int64,
-                             np.int64,
-                             output0_raw=False,
-                             output1_raw=True,
-                             swap=False)
+            self._full_exact(
+                np.int64,
+                np.int64,
+                np.int64,
+                output0_raw=False,
+                output1_raw=True,
+                swap=False,
+            )
 
         def test_mix_fff(self):
-            self._full_exact(np.float32,
-                             np.float32,
-                             np.float32,
-                             output0_raw=True,
-                             output1_raw=False,
-                             swap=True)
+            self._full_exact(
+                np.float32,
+                np.float32,
+                np.float32,
+                output0_raw=True,
+                output1_raw=False,
+                swap=True,
+            )
 
         def test_mix_iff(self):
-            self._full_exact(np.int32,
-                             np.float32,
-                             np.float32,
-                             output0_raw=False,
-                             output1_raw=True,
-                             swap=False)
+            self._full_exact(
+                np.int32,
+                np.float32,
+                np.float32,
+                output0_raw=False,
+                output1_raw=True,
+                swap=False,
+            )
 
     def test_raw_version_latest_1(self):
         input_size = 16
@@ -582,7 +671,7 @@ def test_raw_version_latest_1(self):
 
         # There are 3 versions of graphdef_int8_int8_int8 but
         # only version 3 should be available
-        for platform in ('graphdef', 'savedmodel'):
+        for platform in ("graphdef", "savedmodel"):
             if platform not in BACKENDS:
                 continue
             try:
@@ -599,10 +688,10 @@ def test_raw_version_latest_1(self):
                     use_http=USE_HTTP,
                     use_grpc=USE_GRPC,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             except InferenceServerException as ex:
-                self.assertTrue(
-                    ex.message().startswith("Request for unknown model"))
+                self.assertTrue(ex.message().startswith("Request for unknown model"))
 
             try:
                 iu.infer_exact(
@@ -618,24 +707,26 @@ def test_raw_version_latest_1(self):
                     use_http=USE_HTTP,
                     use_grpc=USE_GRPC,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             except InferenceServerException as ex:
-                self.assertTrue(
-                    ex.message().startswith("Request for unknown model"))
-
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.int8,
-                           np.int8,
-                           np.int8,
-                           model_version=3,
-                           swap=True,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                self.assertTrue(ex.message().startswith("Request for unknown model"))
+
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.int8,
+                np.int8,
+                np.int8,
+                model_version=3,
+                swap=True,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
 
     def test_raw_version_latest_2(self):
         input_size = 16
@@ -643,7 +734,7 @@ def test_raw_version_latest_2(self):
 
         # There are 3 versions of graphdef_int16_int16_int16 but only
         # versions 2 and 3 should be available
-        for platform in ('graphdef', 'savedmodel'):
+        for platform in ("graphdef", "savedmodel"):
             if platform not in BACKENDS:
                 continue
             try:
@@ -660,37 +751,41 @@ def test_raw_version_latest_2(self):
                     use_http=USE_HTTP,
                     use_grpc=USE_GRPC,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             except InferenceServerException as ex:
-                self.assertTrue(
-                    ex.message().startswith("Request for unknown model"))
-
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.int16,
-                           np.int16,
-                           np.int16,
-                           model_version=2,
-                           swap=True,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.int16,
-                           np.int16,
-                           np.int16,
-                           model_version=3,
-                           swap=True,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                self.assertTrue(ex.message().startswith("Request for unknown model"))
+
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.int16,
+                np.int16,
+                np.int16,
+                model_version=2,
+                swap=True,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.int16,
+                np.int16,
+                np.int16,
+                model_version=3,
+                swap=True,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
 
     def test_raw_version_all(self):
         input_size = 16
@@ -698,48 +793,54 @@ def test_raw_version_all(self):
 
         # There are 3 versions of *_int32_int32_int32 and all should
         # be available.
-        for platform in ('graphdef', 'savedmodel'):
+        for platform in ("graphdef", "savedmodel"):
             if platform not in BACKENDS:
                 continue
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.int32,
-                           np.int32,
-                           np.int32,
-                           model_version=1,
-                           swap=False,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.int32,
-                           np.int32,
-                           np.int32,
-                           model_version=2,
-                           swap=True,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.int32,
-                           np.int32,
-                           np.int32,
-                           model_version=3,
-                           swap=True,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.int32,
+                np.int32,
+                np.int32,
+                model_version=1,
+                swap=False,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.int32,
+                np.int32,
+                np.int32,
+                model_version=2,
+                swap=True,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.int32,
+                np.int32,
+                np.int32,
+                model_version=3,
+                swap=True,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
 
     def test_raw_version_specific_1(self):
         input_size = 16
@@ -747,22 +848,24 @@ def test_raw_version_specific_1(self):
 
         # There are 3 versions of *_float16_float16_float16 but only
         # version 1 should be available.
-        for platform in ('graphdef', 'savedmodel'):
+        for platform in ("graphdef", "savedmodel"):
             if platform not in BACKENDS:
                 continue
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.float16,
-                           np.float16,
-                           np.float16,
-                           model_version=1,
-                           swap=False,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.float16,
+                np.float16,
+                np.float16,
+                model_version=1,
+                swap=False,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
 
             try:
                 iu.infer_exact(
@@ -778,10 +881,10 @@ def test_raw_version_specific_1(self):
                     use_http=USE_HTTP,
                     use_grpc=USE_GRPC,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             except InferenceServerException as ex:
-                self.assertTrue(
-                    ex.message().startswith("Request for unknown model"))
+                self.assertTrue(ex.message().startswith("Request for unknown model"))
 
             try:
                 iu.infer_exact(
@@ -797,35 +900,37 @@ def test_raw_version_specific_1(self):
                     use_http=USE_HTTP,
                     use_grpc=USE_GRPC,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             except InferenceServerException as ex:
-                self.assertTrue(
-                    ex.message().startswith("Request for unknown model"))
+                self.assertTrue(ex.message().startswith("Request for unknown model"))
 
     def test_raw_version_specific_1_3(self):
         input_size = 16
 
         # There are 3 versions of *_float32_float32_float32 but only
         # versions 1 and 3 should be available.
-        for platform in ('graphdef', 'savedmodel', 'plan'):
-            if platform == 'plan' and CPU_ONLY:
+        for platform in ("graphdef", "savedmodel", "plan"):
+            if platform == "plan" and CPU_ONLY:
                 continue
             if platform not in BACKENDS:
                 continue
             tensor_shape = (1, input_size)
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           model_version=1,
-                           swap=False,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                model_version=1,
+                swap=False,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
 
             try:
                 iu.infer_exact(
@@ -841,27 +946,29 @@ def test_raw_version_specific_1_3(self):
                     use_http=USE_HTTP,
                     use_grpc=USE_GRPC,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             except InferenceServerException as ex:
-                self.assertTrue(
-                    ex.message().startswith("Request for unknown model"))
-
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           model_version=3,
-                           swap=True,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                self.assertTrue(ex.message().startswith("Request for unknown model"))
+
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                model_version=3,
+                swap=True,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
 
     if ENSEMBLES:
-        if all(x in BACKENDS for x in ['graphdef', 'savedmodel']):
+        if all(x in BACKENDS for x in ["graphdef", "savedmodel"]):
 
             def test_ensemble_mix_platform(self):
                 # Skip on CPU only machine as TensorRT model is used in this ensemble
@@ -870,7 +977,8 @@ def test_ensemble_mix_platform(self):
                 for bs in (1, 8):
                     iu.infer_exact(
                         self,
-                        "mix_platform", (bs, 16),
+                        "mix_platform",
+                        (bs, 16),
                         bs,
                         np.float32,
                         np.float32,
@@ -878,7 +986,8 @@ def test_ensemble_mix_platform(self):
                         use_http=USE_HTTP,
                         use_grpc=USE_GRPC,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
         if "graphdef" in BACKENDS:
 
@@ -886,7 +995,8 @@ def test_ensemble_mix_type(self):
                 for bs in (1, 8):
                     iu.infer_exact(
                         self,
-                        "mix_type", (bs, 16),
+                        "mix_type",
+                        (bs, 16),
                         bs,
                         np.int32,
                         np.float32,
@@ -894,15 +1004,17 @@ def test_ensemble_mix_type(self):
                         use_http=USE_HTTP,
                         use_grpc=USE_GRPC,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
-        if all(x in BACKENDS for x in ['graphdef', 'savedmodel']):
+        if all(x in BACKENDS for x in ["graphdef", "savedmodel"]):
 
             def test_ensemble_mix_ensemble(self):
                 for bs in (1, 8):
                     iu.infer_exact(
                         self,
-                        "mix_ensemble", (bs, 16),
+                        "mix_ensemble",
+                        (bs, 16),
                         bs,
                         np.int32,
                         np.float32,
@@ -910,11 +1022,15 @@ def test_ensemble_mix_ensemble(self):
                         use_http=USE_HTTP,
                         use_grpc=USE_GRPC,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
-        if all(x in BACKENDS for x in [
-                'graphdef',
-        ]):
+        if all(
+            x in BACKENDS
+            for x in [
+                "graphdef",
+            ]
+        ):
 
             def test_ensemble_mix_batch_nobatch(self):
                 base_names = ["batch_to_nobatch", "nobatch_to_batch"]
@@ -922,7 +1038,8 @@ def test_ensemble_mix_batch_nobatch(self):
                     for bs in (1, 8):
                         iu.infer_exact(
                             self,
-                            name, (bs, 16),
+                            name,
+                            (bs, 16),
                             bs,
                             np.float32,
                             np.float32,
@@ -930,10 +1047,12 @@ def test_ensemble_mix_batch_nobatch(self):
                             use_http=USE_HTTP,
                             use_grpc=USE_GRPC,
                             use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                        )
                     iu.infer_exact(
                         self,
-                        name + "_nobatch", (8, 16),
+                        name + "_nobatch",
+                        (8, 16),
                         1,
                         np.float32,
                         np.float32,
@@ -941,13 +1060,15 @@ def test_ensemble_mix_batch_nobatch(self):
                         use_http=USE_HTTP,
                         use_grpc=USE_GRPC,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
                 # batch -> nobatch -> batch
                 for bs in (1, 8):
                     iu.infer_exact(
                         self,
-                        "mix_nobatch_batch", (bs, 16),
+                        "mix_nobatch_batch",
+                        (bs, 16),
                         bs,
                         np.float32,
                         np.float32,
@@ -955,17 +1076,19 @@ def test_ensemble_mix_batch_nobatch(self):
                         use_http=USE_HTTP,
                         use_grpc=USE_GRPC,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
         if not (TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY):
 
             def test_ensemble_label_lookup(self):
-                if all(x in BACKENDS for x in ['graphdef', 'savedmodel']):
+                if all(x in BACKENDS for x in ["graphdef", "savedmodel"]):
                     # Ensemble needs to look up the label from the actual model
                     for bs in (1, 8):
                         iu.infer_exact(
                             self,
-                            "mix_platform", (bs, 16),
+                            "mix_platform",
+                            (bs, 16),
                             bs,
                             np.float32,
                             np.float32,
@@ -975,14 +1098,16 @@ def test_ensemble_label_lookup(self):
                             use_http=USE_HTTP,
                             use_grpc=USE_GRPC,
                             use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                        )
 
-                if all(x in BACKENDS for x in ['graphdef', 'savedmodel']):
+                if all(x in BACKENDS for x in ["graphdef", "savedmodel"]):
                     # Label from the actual model will be passed along through the nested ensemble
                     for bs in (1, 8):
                         iu.infer_exact(
                             self,
-                            "mix_ensemble", (bs, 16),
+                            "mix_ensemble",
+                            (bs, 16),
                             bs,
                             np.int32,
                             np.float32,
@@ -992,14 +1117,16 @@ def test_ensemble_label_lookup(self):
                             use_http=USE_HTTP,
                             use_grpc=USE_GRPC,
                             use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                        )
 
                 if "graphdef" in BACKENDS:
                     # If a label file is provided, it will be used directly
                     try:
                         iu.infer_exact(
                             self,
-                            "wrong_label", (1, 16),
+                            "wrong_label",
+                            (1, 16),
                             1,
                             np.int32,
                             np.float32,
@@ -1009,7 +1136,8 @@ def test_ensemble_label_lookup(self):
                             use_http=USE_HTTP,
                             use_grpc=USE_GRPC,
                             use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                        )
                     except AssertionError:
                         # Sanity check that infer_exact failed since this ensemble is provided
                         # with unexpected labels
@@ -1019,7 +1147,8 @@ def test_ensemble_label_lookup(self):
                     for bs in (1, 8):
                         iu.infer_exact(
                             self,
-                            "label_override", (bs, 16),
+                            "label_override",
+                            (bs, 16),
                             bs,
                             np.int32,
                             np.float32,
@@ -1029,8 +1158,9 @@ def test_ensemble_label_lookup(self):
                             use_http=USE_HTTP,
                             use_grpc=USE_GRPC,
                             use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_infer/install_and_test.sh b/qa/L0_infer/install_and_test.sh
index f488f510f4..28e5dad52e 100755
--- a/qa/L0_infer/install_and_test.sh
+++ b/qa/L0_infer/install_and_test.sh
@@ -25,7 +25,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-# Note: This script is to be used with customized triton containers that need 
+# Note: This script is to be used with customized triton containers that need
 # dependencies to run L0_infer tests
 apt-get update && \
     apt-get install -y --no-install-recommends \
diff --git a/qa/L0_infer_reshape/infer_reshape_test.py b/qa/L0_infer_reshape/infer_reshape_test.py
old mode 100644
new mode 100755
index 0c3117131e..e77dcbecaf
--- a/qa/L0_infer_reshape/infer_reshape_test.py
+++ b/qa/L0_infer_reshape/infer_reshape_test.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -28,119 +30,139 @@
 
 sys.path.append("../common")
 
+import os
 import unittest
-import numpy as np
+
 import infer_util as iu
+import numpy as np
 import test_util as tu
-import os
 
 np_dtype_string = np.dtype(object)
 
-TEST_SYSTEM_SHARED_MEMORY = bool(
-    int(os.environ.get('TEST_SYSTEM_SHARED_MEMORY', 0)))
-TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get('TEST_CUDA_SHARED_MEMORY',
-                                                  0)))
+TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0)))
+TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0)))
 
 
 class InferReshapeTest(tu.TestResultCollector):
-
-    def _full_reshape(self,
-                      dtype,
-                      input_shapes,
-                      output_shapes=None,
-                      no_batch=True):
+    def _full_reshape(self, dtype, input_shapes, output_shapes=None, no_batch=True):
         # 'shapes' is list of shapes, one for each input.
         if output_shapes is None:
             output_shapes = input_shapes
 
         # For validation assume any shape can be used...
-        if tu.validate_for_tf_model(dtype, dtype, dtype, input_shapes[0],
-                                    input_shapes[0], input_shapes[0]):
+        if tu.validate_for_tf_model(
+            dtype, dtype, dtype, input_shapes[0], input_shapes[0], input_shapes[0]
+        ):
             # model that supports batching
             for bs in (1, 8):
-                full_shapes = [[
-                    bs,
-                ] + input_shape for input_shape in input_shapes]
-                full_output_shapes = [[
-                    bs,
-                ] + output_shape for output_shape in output_shapes]
+                full_shapes = [
+                    [
+                        bs,
+                    ]
+                    + input_shape
+                    for input_shape in input_shapes
+                ]
+                full_output_shapes = [
+                    [
+                        bs,
+                    ]
+                    + output_shape
+                    for output_shape in output_shapes
+                ]
                 iu.infer_zero(
                     self,
-                    'graphdef',
+                    "graphdef",
                     bs,
                     dtype,
                     full_shapes,
                     full_output_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
                 iu.infer_zero(
                     self,
-                    'savedmodel',
+                    "savedmodel",
                     bs,
                     dtype,
                     full_shapes,
                     full_output_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             # model that does not support batching
             if no_batch:
                 iu.infer_zero(
                     self,
-                    'graphdef_nobatch',
+                    "graphdef_nobatch",
                     1,
                     dtype,
                     input_shapes,
                     output_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
                 iu.infer_zero(
                     self,
-                    'savedmodel_nobatch',
+                    "savedmodel_nobatch",
                     1,
                     dtype,
                     input_shapes,
                     output_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
 
-        if tu.validate_for_onnx_model(dtype, dtype, dtype, input_shapes[0],
-                                      input_shapes[0], input_shapes[0]):
+        if tu.validate_for_onnx_model(
+            dtype, dtype, dtype, input_shapes[0], input_shapes[0], input_shapes[0]
+        ):
             # model that supports batching
             for bs in (1, 8):
-                full_shapes = [[
-                    bs,
-                ] + input_shape for input_shape in input_shapes]
-                full_output_shapes = [[
-                    bs,
-                ] + output_shape for output_shape in output_shapes]
+                full_shapes = [
+                    [
+                        bs,
+                    ]
+                    + input_shape
+                    for input_shape in input_shapes
+                ]
+                full_output_shapes = [
+                    [
+                        bs,
+                    ]
+                    + output_shape
+                    for output_shape in output_shapes
+                ]
                 iu.infer_zero(
                     self,
-                    'onnx',
+                    "onnx",
                     bs,
                     dtype,
                     full_shapes,
                     full_output_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             # model that does not support batching
             if no_batch:
                 iu.infer_zero(
                     self,
-                    'onnx_nobatch',
+                    "onnx_nobatch",
                     1,
                     dtype,
                     input_shapes,
                     output_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
 
-        if tu.validate_for_libtorch_model(dtype,
-                                          dtype,
-                                          dtype,
-                                          input_shapes[0],
-                                          input_shapes[0],
-                                          input_shapes[0],
-                                          reshape=True):
+        if tu.validate_for_libtorch_model(
+            dtype,
+            dtype,
+            dtype,
+            input_shapes[0],
+            input_shapes[0],
+            input_shapes[0],
+            reshape=True,
+        ):
             # Skip variable-size reshape on libtorch for now;
             # see "gen_qa_reshape_model.py" for details.
             if dtype != np.int32:
@@ -149,48 +171,72 @@ def _full_reshape(self,
                 if no_batch and (dtype != np_dtype_string):
                     iu.infer_zero(
                         self,
-                        'libtorch_nobatch',
+                        "libtorch_nobatch",
                         1,
                         dtype,
                         input_shapes,
                         output_shapes,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
                 # model that supports batching
                 for bs in (1, 8):
-                    full_shapes = [[
-                        bs,
-                    ] + input_shape for input_shape in input_shapes]
-                    full_output_shapes = [[
-                        bs,
-                    ] + output_shape for output_shape in output_shapes]
+                    full_shapes = [
+                        [
+                            bs,
+                        ]
+                        + input_shape
+                        for input_shape in input_shapes
+                    ]
+                    full_output_shapes = [
+                        [
+                            bs,
+                        ]
+                        + output_shape
+                        for output_shape in output_shapes
+                    ]
                     iu.infer_zero(
                         self,
-                        'libtorch',
+                        "libtorch",
                         bs,
                         dtype,
                         full_shapes,
                         full_output_shapes,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
         for name in ["simple_reshape", "sequence_reshape", "fan_reshape"]:
             # [TODO] Skip variable-size reshape on ensemble for now.
             # Needs rework on how ensembles for reshape are generated.
             if dtype == np.int32:
                 break
-            if tu.validate_for_ensemble_model(name, dtype, dtype, dtype,
-                                              input_shapes[0], input_shapes[0],
-                                              input_shapes[0]):
+            if tu.validate_for_ensemble_model(
+                name,
+                dtype,
+                dtype,
+                dtype,
+                input_shapes[0],
+                input_shapes[0],
+                input_shapes[0],
+            ):
                 # model that supports batching
                 for bs in (1, 8):
-                    full_shapes = [[
-                        bs,
-                    ] + input_shape for input_shape in input_shapes]
-                    full_output_shapes = [[
-                        bs,
-                    ] + output_shape for output_shape in output_shapes]
+                    full_shapes = [
+                        [
+                            bs,
+                        ]
+                        + input_shape
+                        for input_shape in input_shapes
+                    ]
+                    full_output_shapes = [
+                        [
+                            bs,
+                        ]
+                        + output_shape
+                        for output_shape in output_shapes
+                    ]
                     iu.infer_zero(
                         self,
                         name,
@@ -199,58 +245,67 @@ def _full_reshape(self,
                         full_shapes,
                         full_output_shapes,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
                 # model that does not support batching
                 if no_batch:
                     iu.infer_zero(
                         self,
-                        name + '_nobatch',
+                        name + "_nobatch",
                         1,
                         dtype,
                         input_shapes,
                         output_shapes,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
-    def _trt_reshape(self,
-                     dtype,
-                     input_shapes,
-                     output_shapes=None,
-                     no_batch=True):
+    def _trt_reshape(self, dtype, input_shapes, output_shapes=None, no_batch=True):
         # 'input_shapes' is a list of shapes, one for each input.
         if output_shapes is None:
             output_shapes = input_shapes
 
-        if tu.validate_for_trt_model(dtype, dtype, dtype, input_shapes[0],
-                                     input_shapes[0], input_shapes[0]):
+        if tu.validate_for_trt_model(
+            dtype, dtype, dtype, input_shapes[0], input_shapes[0], input_shapes[0]
+        ):
             # model that supports batching
             for bs in (1, 8):
-                full_shapes = [[
-                    bs,
-                ] + input_shape for input_shape in input_shapes]
-                full_output_shapes = [[
-                    bs,
-                ] + output_shape for output_shape in output_shapes]
+                full_shapes = [
+                    [
+                        bs,
+                    ]
+                    + input_shape
+                    for input_shape in input_shapes
+                ]
+                full_output_shapes = [
+                    [
+                        bs,
+                    ]
+                    + output_shape
+                    for output_shape in output_shapes
+                ]
                 iu.infer_zero(
                     self,
-                    'plan',
+                    "plan",
                     bs,
                     dtype,
                     full_shapes,
                     full_output_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             # model that does not support batching
             if no_batch:
                 iu.infer_zero(
                     self,
-                    'plan_nobatch',
+                    "plan_nobatch",
                     1,
                     dtype,
                     input_shapes,
                     output_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
 
     def test_ff1(self):
         self._full_reshape(np.float32, input_shapes=([1],), no_batch=False)
@@ -263,21 +318,24 @@ def test_ff3(self):
         self._full_reshape(np.float32, input_shapes=([4, 4], [2], [2, 2, 3]))
 
     def test_ff4(self):
-        self._full_reshape(np.float32,
-                           input_shapes=([4, 4], [2], [2, 2, 3], [1]),
-                           output_shapes=([16], [1, 2], [3, 2, 2], [1]))
-        self._trt_reshape(np.float32,
-                          input_shapes=([4, 4], [2], [2, 2, 3], [1]),
-                          output_shapes=([2, 2, 4], [1, 2, 1], [3, 2,
-                                                                2], [1, 1, 1]))
+        self._full_reshape(
+            np.float32,
+            input_shapes=([4, 4], [2], [2, 2, 3], [1]),
+            output_shapes=([16], [1, 2], [3, 2, 2], [1]),
+        )
+        self._trt_reshape(
+            np.float32,
+            input_shapes=([4, 4], [2], [2, 2, 3], [1]),
+            output_shapes=([2, 2, 4], [1, 2, 1], [3, 2, 2], [1, 1, 1]),
+        )
 
     def test_ii1(self):
         self._full_reshape(np.int32, input_shapes=([2, 4, 5, 6],))
 
     def test_ii2(self):
-        self._full_reshape(np.int32,
-                           input_shapes=([4, 1], [2]),
-                           output_shapes=([1, 4], [1, 2]))
+        self._full_reshape(
+            np.int32, input_shapes=([4, 1], [2]), output_shapes=([1, 4], [1, 2])
+        )
 
     def test_ii3(self):
         self._full_reshape(np.int32, input_shapes=([1, 4, 1], [8], [2, 2, 3]))
@@ -286,5 +344,5 @@ def test_oo1(self):
         self._full_reshape(np.object_, input_shapes=([1],), no_batch=False)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
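
Throughout infer_reshape_test.py the batched cases build their request shapes by prepending the batch size to every per-input shape; the reformatted list comprehensions above are just a multi-line spelling of that. A minimal standalone sketch of the pattern (shape values chosen for illustration):

    # Prepend a batch dimension to each per-input shape, as the comprehensions
    # above do with `[bs,] + input_shape`.
    input_shapes = [[4, 4], [2], [2, 2, 3]]

    for bs in (1, 8):
        full_shapes = [[bs] + shape for shape in input_shapes]
        print(bs, full_shapes)
    # bs=1 -> [[1, 4, 4], [1, 2], [1, 2, 2, 3]]
    # bs=8 -> [[8, 4, 4], [8, 2], [8, 2, 2, 3]]
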
diff --git a/qa/L0_infer_variable/infer_variable_test.py b/qa/L0_infer_variable/infer_variable_test.py
old mode 100644
new mode 100755
index 3769e30d4e..e5e6470a3c
--- a/qa/L0_infer_variable/infer_variable_test.py
+++ b/qa/L0_infer_variable/infer_variable_test.py
@@ -1,4 +1,6 @@
-# Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -30,48 +32,49 @@
 
 import os
 import unittest
-import numpy as np
+
 import infer_util as iu
+import numpy as np
 import test_util as tu
 
 np_dtype_string = np.dtype(object)
 
-TEST_SYSTEM_SHARED_MEMORY = bool(
-    int(os.environ.get('TEST_SYSTEM_SHARED_MEMORY', 0)))
-TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get('TEST_CUDA_SHARED_MEMORY',
-                                                  0)))
+TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0)))
+TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0)))
 
 
 class InferVariableTest(tu.TestResultCollector):
-
-    def _full_exact(self,
-                    input_dtype,
-                    output0_dtype,
-                    output1_dtype,
-                    input_shape,
-                    output0_shape,
-                    output1_shape,
-                    output0_raw=True,
-                    output1_raw=True,
-                    swap=False):
-
-        def _infer_exact_helper(tester,
-                                pf,
-                                tensor_shape,
-                                batch_size,
-                                input_dtype,
-                                output0_dtype,
-                                output1_dtype,
-                                output0_raw=True,
-                                output1_raw=True,
-                                model_version=None,
-                                swap=False,
-                                outputs=("OUTPUT0", "OUTPUT1"),
-                                use_http=True,
-                                use_grpc=True,
-                                skip_request_id_check=False,
-                                use_streaming=True,
-                                correlation_id=0):
+    def _full_exact(
+        self,
+        input_dtype,
+        output0_dtype,
+        output1_dtype,
+        input_shape,
+        output0_shape,
+        output1_shape,
+        output0_raw=True,
+        output1_raw=True,
+        swap=False,
+    ):
+        def _infer_exact_helper(
+            tester,
+            pf,
+            tensor_shape,
+            batch_size,
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            output0_raw=True,
+            output1_raw=True,
+            model_version=None,
+            swap=False,
+            outputs=("OUTPUT0", "OUTPUT1"),
+            use_http=True,
+            use_grpc=True,
+            skip_request_id_check=False,
+            use_streaming=True,
+            correlation_id=0,
+        ):
             for bs in (1, batch_size):
                 # model that does not support batching
                 if bs == 1:
@@ -94,15 +97,23 @@ def _infer_exact_helper(tester,
                         use_streaming=use_streaming,
                         correlation_id=correlation_id,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
                 # model that supports batching. Skip for libtorch string I/O
-                elif pf == 'libtorch' and tu.validate_for_libtorch_model(
-                        input_dtype, output0_dtype, output1_dtype, tensor_shape,
-                        tensor_shape, tensor_shape, bs):
+                elif pf == "libtorch" and tu.validate_for_libtorch_model(
+                    input_dtype,
+                    output0_dtype,
+                    output1_dtype,
+                    tensor_shape,
+                    tensor_shape,
+                    tensor_shape,
+                    bs,
+                ):
                     iu.infer_exact(
                         tester,
-                        pf, (bs,) + tensor_shape,
+                        pf,
+                        (bs,) + tensor_shape,
                         bs,
                         input_dtype,
                         output0_dtype,
@@ -118,91 +129,128 @@ def _infer_exact_helper(tester,
                         use_streaming=use_streaming,
                         correlation_id=correlation_id,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
         all_ensemble_prefix = ["simple_", "sequence_", "fan_"]
         ensemble_prefix = [""]
         for prefix in all_ensemble_prefix:
-            if tu.validate_for_ensemble_model(prefix, input_dtype,
-                                              output0_dtype, output1_dtype,
-                                              input_shape, input_shape,
-                                              input_shape):
+            if tu.validate_for_ensemble_model(
+                prefix,
+                input_dtype,
+                output0_dtype,
+                output1_dtype,
+                input_shape,
+                input_shape,
+                input_shape,
+            ):
                 ensemble_prefix.append(prefix)
 
-        if tu.validate_for_tf_model(input_dtype, output0_dtype, output1_dtype,
-                                    input_shape, output0_shape, output1_shape):
+        if tu.validate_for_tf_model(
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            input_shape,
+            output0_shape,
+            output1_shape,
+        ):
             for prefix in ensemble_prefix:
                 for pf in ["graphdef", "savedmodel"]:
-                    _infer_exact_helper(self,
-                                        prefix + pf,
-                                        input_shape,
-                                        8,
-                                        input_dtype,
-                                        output0_dtype,
-                                        output1_dtype,
-                                        output0_raw=output0_raw,
-                                        output1_raw=output1_raw,
-                                        swap=swap)
-
-        if tu.validate_for_trt_model(input_dtype, output0_dtype, output1_dtype,
-                                     input_shape, output0_shape, output1_shape):
+                    _infer_exact_helper(
+                        self,
+                        prefix + pf,
+                        input_shape,
+                        8,
+                        input_dtype,
+                        output0_dtype,
+                        output1_dtype,
+                        output0_raw=output0_raw,
+                        output1_raw=output1_raw,
+                        swap=swap,
+                    )
+
+        if tu.validate_for_trt_model(
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            input_shape,
+            output0_shape,
+            output1_shape,
+        ):
             for prefix in ensemble_prefix:
                 if input_dtype == np.int8:
-                    _infer_exact_helper(self,
-                                        prefix + 'plan',
-                                        input_shape + (1, 1),
-                                        8,
-                                        input_dtype,
-                                        output0_dtype,
-                                        output1_dtype,
-                                        output0_raw=output0_raw,
-                                        output1_raw=output1_raw,
-                                        swap=swap)
+                    _infer_exact_helper(
+                        self,
+                        prefix + "plan",
+                        input_shape + (1, 1),
+                        8,
+                        input_dtype,
+                        output0_dtype,
+                        output1_dtype,
+                        output0_raw=output0_raw,
+                        output1_raw=output1_raw,
+                        swap=swap,
+                    )
                 else:
-                    _infer_exact_helper(self,
-                                        prefix + 'plan',
-                                        input_shape,
-                                        8,
-                                        input_dtype,
-                                        output0_dtype,
-                                        output1_dtype,
-                                        output0_raw=output0_raw,
-                                        output1_raw=output1_raw,
-                                        swap=swap)
-
-        if tu.validate_for_onnx_model(input_dtype, output0_dtype, output1_dtype,
-                                      input_shape, output0_shape,
-                                      output1_shape):
+                    _infer_exact_helper(
+                        self,
+                        prefix + "plan",
+                        input_shape,
+                        8,
+                        input_dtype,
+                        output0_dtype,
+                        output1_dtype,
+                        output0_raw=output0_raw,
+                        output1_raw=output1_raw,
+                        swap=swap,
+                    )
+
+        if tu.validate_for_onnx_model(
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            input_shape,
+            output0_shape,
+            output1_shape,
+        ):
             # No basic ensemble models are created against custom models [TODO]
-            _infer_exact_helper(self,
-                                'onnx',
-                                input_shape,
-                                8,
-                                input_dtype,
-                                output0_dtype,
-                                output1_dtype,
-                                output0_raw=output0_raw,
-                                output1_raw=output1_raw,
-                                swap=swap)
-
-        if tu.validate_for_libtorch_model(input_dtype, output0_dtype,
-                                          output1_dtype, input_shape,
-                                          output0_shape, output1_shape):
+            _infer_exact_helper(
+                self,
+                "onnx",
+                input_shape,
+                8,
+                input_dtype,
+                output0_dtype,
+                output1_dtype,
+                output0_raw=output0_raw,
+                output1_raw=output1_raw,
+                swap=swap,
+            )
+
+        if tu.validate_for_libtorch_model(
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            input_shape,
+            output0_shape,
+            output1_shape,
+        ):
             # No basic ensemble models are created against custom models [TODO]
-            _infer_exact_helper(self,
-                                'libtorch',
-                                input_shape,
-                                8,
-                                input_dtype,
-                                output0_dtype,
-                                output1_dtype,
-                                output0_raw=output0_raw,
-                                output1_raw=output1_raw,
-                                swap=swap)
+            _infer_exact_helper(
+                self,
+                "libtorch",
+                input_shape,
+                8,
+                input_dtype,
+                output0_dtype,
+                output1_dtype,
+                output0_raw=output0_raw,
+                output1_raw=output1_raw,
+                swap=swap,
+            )
 
     def test_raw_fff(self):
-        self._full_exact(np.float32, np.float32, np.float32, (16,), (16,),
-                         (16,))
+        self._full_exact(np.float32, np.float32, np.float32, (16,), (16,), (16,))
 
     def test_raw_fii(self):
         self._full_exact(np.float32, np.int32, np.int32, (2, 8), (2, 8), (2, 8))
@@ -211,8 +259,9 @@ def test_raw_fll(self):
         self._full_exact(np.float32, np.int64, np.int64, (8, 4), (8, 4), (8, 4))
 
     def test_raw_fil(self):
-        self._full_exact(np.float32, np.int32, np.int64, (2, 8, 2), (2, 8, 2),
-                         (2, 8, 2))
+        self._full_exact(
+            np.float32, np.int32, np.int64, (2, 8, 2), (2, 8, 2), (2, 8, 2)
+        )
 
     def test_raw_ffi(self):
         self._full_exact(np.float32, np.float32, np.int32, (16,), (16,), (16,))
@@ -221,95 +270,148 @@ def test_raw_iii(self):
         self._full_exact(np.int32, np.int32, np.int32, (2, 8), (2, 8), (2, 8))
 
     def test_faw_iif(self):
-        self._full_exact(np.int32, np.int32, np.float32, (2, 8, 2), (2, 8, 2),
-                         (2, 8, 2))
+        self._full_exact(
+            np.int32, np.int32, np.float32, (2, 8, 2), (2, 8, 2), (2, 8, 2)
+        )
 
     def test_raw_ooo(self):
-        self._full_exact(np_dtype_string, np_dtype_string, np_dtype_string,
-                         (16,), (16,), (16,))
+        self._full_exact(
+            np_dtype_string, np_dtype_string, np_dtype_string, (16,), (16,), (16,)
+        )
 
     def test_raw_oii(self):
-        self._full_exact(np_dtype_string, np.int32, np.int32, (2, 8), (2, 8),
-                         (2, 8))
+        self._full_exact(np_dtype_string, np.int32, np.int32, (2, 8), (2, 8), (2, 8))
 
     def test_raw_ooi(self):
-        self._full_exact(np_dtype_string, np_dtype_string, np.int32, (8, 4),
-                         (8, 4), (8, 4))
+        self._full_exact(
+            np_dtype_string, np_dtype_string, np.int32, (8, 4), (8, 4), (8, 4)
+        )
 
     def test_raw_oio(self):
-        self._full_exact(np_dtype_string, np.int32, np_dtype_string, (2, 8, 2),
-                         (2, 8, 2), (2, 8, 2))
+        self._full_exact(
+            np_dtype_string, np.int32, np_dtype_string, (2, 8, 2), (2, 8, 2), (2, 8, 2)
+        )
 
     def test_class_fff(self):
-        self._full_exact(np.float32,
-                         np.float32,
-                         np.float32, (16,), (16,), (16,),
-                         output0_raw=False,
-                         output1_raw=False)
+        self._full_exact(
+            np.float32,
+            np.float32,
+            np.float32,
+            (16,),
+            (16,),
+            (16,),
+            output0_raw=False,
+            output1_raw=False,
+        )
 
     def test_class_fii(self):
-        self._full_exact(np.float32,
-                         np.int32,
-                         np.int32, (2, 8), (2, 8), (2, 8),
-                         output0_raw=False,
-                         output1_raw=False)
+        self._full_exact(
+            np.float32,
+            np.int32,
+            np.int32,
+            (2, 8),
+            (2, 8),
+            (2, 8),
+            output0_raw=False,
+            output1_raw=False,
+        )
 
     def test_class_fll(self):
-        self._full_exact(np.float32,
-                         np.int64,
-                         np.int64, (8, 4), (8, 4), (8, 4),
-                         output0_raw=False,
-                         output1_raw=False)
+        self._full_exact(
+            np.float32,
+            np.int64,
+            np.int64,
+            (8, 4),
+            (8, 4),
+            (8, 4),
+            output0_raw=False,
+            output1_raw=False,
+        )
 
     def test_class_fil(self):
-        self._full_exact(np.float32,
-                         np.int32,
-                         np.int64, (2, 8, 2), (2, 8, 2), (2, 8, 2),
-                         output0_raw=False,
-                         output1_raw=False)
+        self._full_exact(
+            np.float32,
+            np.int32,
+            np.int64,
+            (2, 8, 2),
+            (2, 8, 2),
+            (2, 8, 2),
+            output0_raw=False,
+            output1_raw=False,
+        )
 
     def test_class_ffi(self):
-        self._full_exact(np.float32,
-                         np.float32,
-                         np.int32, (16,), (16,), (16,),
-                         output0_raw=False,
-                         output1_raw=False)
+        self._full_exact(
+            np.float32,
+            np.float32,
+            np.int32,
+            (16,),
+            (16,),
+            (16,),
+            output0_raw=False,
+            output1_raw=False,
+        )
 
     def test_class_iii(self):
-        self._full_exact(np.int32,
-                         np.int32,
-                         np.int32, (2, 8), (2, 8), (2, 8),
-                         output0_raw=False,
-                         output1_raw=False)
+        self._full_exact(
+            np.int32,
+            np.int32,
+            np.int32,
+            (2, 8),
+            (2, 8),
+            (2, 8),
+            output0_raw=False,
+            output1_raw=False,
+        )
 
     def test_class_iif(self):
-        self._full_exact(np.int32,
-                         np.int32,
-                         np.float32, (2, 8, 2), (2, 8, 2), (2, 8, 2),
-                         output0_raw=False,
-                         output1_raw=False)
+        self._full_exact(
+            np.int32,
+            np.int32,
+            np.float32,
+            (2, 8, 2),
+            (2, 8, 2),
+            (2, 8, 2),
+            output0_raw=False,
+            output1_raw=False,
+        )
 
     def test_mix_ffi(self):
-        self._full_exact(np.float32,
-                         np.float32,
-                         np.int32, (16,), (16,), (16,),
-                         output0_raw=True,
-                         output1_raw=False)
+        self._full_exact(
+            np.float32,
+            np.float32,
+            np.int32,
+            (16,),
+            (16,),
+            (16,),
+            output0_raw=True,
+            output1_raw=False,
+        )
 
     def test_mix_iii(self):
-        self._full_exact(np.int32,
-                         np.int32,
-                         np.int32, (2, 8), (2, 8), (2, 8),
-                         output0_raw=False,
-                         output1_raw=True)
+        self._full_exact(
+            np.int32,
+            np.int32,
+            np.int32,
+            (2, 8),
+            (2, 8),
+            (2, 8),
+            output0_raw=False,
+            output1_raw=True,
+        )
 
     def test_mix_iif(self):
-        self._full_exact(np.int32,
-                         np.int32,
-                         np.float32, (2, 8, 2), (2, 8, 2), (2, 8, 2),
-                         output0_raw=True,
-                         output1_raw=False)
-
-
-if __name__ == '__main__':
+        self._full_exact(
+            np.int32,
+            np.int32,
+            np.float32,
+            (2, 8, 2),
+            (2, 8, 2),
+            (2, 8, 2),
+            output0_raw=True,
+            output1_raw=False,
+        )
+
+
+if __name__ == "__main__":
     unittest.main()
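
The shared-memory switches at the top of these tests all follow the same pattern: an environment variable holding "0" or "1" is converted to int and then to bool. A small sketch of how that conversion behaves, run outside the test harness (the variable name is reused only for illustration):

    import os

    # bool(int(...)) maps "0" -> False and "1" -> True; an unset variable falls
    # back to the default 0, and a non-numeric value raises ValueError.
    os.environ["TEST_SYSTEM_SHARED_MEMORY"] = "1"
    flag = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0)))
    print(flag)  # True

    os.environ.pop("TEST_SYSTEM_SHARED_MEMORY", None)
    flag = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0)))
    print(flag)  # False
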
diff --git a/qa/L0_infer_zero/infer_zero_test.py b/qa/L0_infer_zero/infer_zero_test.py
old mode 100644
new mode 100755
index de00635450..9e9b0f4625
--- a/qa/L0_infer_zero/infer_zero_test.py
+++ b/qa/L0_infer_zero/infer_zero_test.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -28,103 +30,125 @@
 
 sys.path.append("../common")
 
+import os
 import unittest
-import numpy as np
+
 import infer_util as iu
+import numpy as np
 import test_util as tu
-import os
 
 np_dtype_string = np.dtype(object)
 
-TEST_SYSTEM_SHARED_MEMORY = bool(
-    int(os.environ.get('TEST_SYSTEM_SHARED_MEMORY', 0)))
-TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get('TEST_CUDA_SHARED_MEMORY',
-                                                  0)))
+TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0)))
+TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0)))
 
 
 class InferZeroTest(tu.TestResultCollector):
-
     def _full_zero(self, dtype, shapes):
         # 'shapes' is a list of shapes, one for each input.
 
         # For validation assume any shape can be used...
-        if tu.validate_for_tf_model(dtype, dtype, dtype, shapes[0], shapes[0],
-                                    shapes[0]):
+        if tu.validate_for_tf_model(
+            dtype, dtype, dtype, shapes[0], shapes[0], shapes[0]
+        ):
             # model that supports batching
             for bs in (1, 8):
-                batch_shapes = [[
-                    bs,
-                ] + shape for shape in shapes]
+                batch_shapes = [
+                    [
+                        bs,
+                    ]
+                    + shape
+                    for shape in shapes
+                ]
                 iu.infer_zero(
                     self,
-                    'graphdef',
+                    "graphdef",
                     bs,
                     dtype,
                     batch_shapes,
                     batch_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
                 iu.infer_zero(
                     self,
-                    'savedmodel',
+                    "savedmodel",
                     bs,
                     dtype,
                     batch_shapes,
                     batch_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             # model that does not support batching
-            iu.infer_zero(self,
-                          'graphdef_nobatch',
-                          1,
-                          dtype,
-                          shapes,
-                          shapes,
-                          use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                          use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
-            iu.infer_zero(self,
-                          'savedmodel_nobatch',
-                          1,
-                          dtype,
-                          shapes,
-                          shapes,
-                          use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                          use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
-
-        if tu.validate_for_onnx_model(dtype, dtype, dtype, shapes[0], shapes[0],
-                                      shapes[0]):
+            iu.infer_zero(
+                self,
+                "graphdef_nobatch",
+                1,
+                dtype,
+                shapes,
+                shapes,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
+            iu.infer_zero(
+                self,
+                "savedmodel_nobatch",
+                1,
+                dtype,
+                shapes,
+                shapes,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
+
+        if tu.validate_for_onnx_model(
+            dtype, dtype, dtype, shapes[0], shapes[0], shapes[0]
+        ):
             # model that supports batching
             for bs in (1, 8):
-                batch_shapes = [[
-                    bs,
-                ] + shape for shape in shapes]
+                batch_shapes = [
+                    [
+                        bs,
+                    ]
+                    + shape
+                    for shape in shapes
+                ]
                 iu.infer_zero(
                     self,
-                    'onnx',
+                    "onnx",
                     bs,
                     dtype,
                     batch_shapes,
                     batch_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             # model that does not support batching
-            iu.infer_zero(self,
-                          'onnx_nobatch',
-                          1,
-                          dtype,
-                          shapes,
-                          shapes,
-                          use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                          use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+            iu.infer_zero(
+                self,
+                "onnx_nobatch",
+                1,
+                dtype,
+                shapes,
+                shapes,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
 
         for name in ["simple_zero", "sequence_zero", "fan_zero"]:
-            if tu.validate_for_ensemble_model(name, dtype, dtype, dtype,
-                                              shapes[0], shapes[0], shapes[0]):
+            if tu.validate_for_ensemble_model(
+                name, dtype, dtype, dtype, shapes[0], shapes[0], shapes[0]
+            ):
                 # model that supports batching
                 for bs in (1, 8):
-                    batch_shapes = [[
-                        bs,
-                    ] + shape for shape in shapes]
+                    batch_shapes = [
+                        [
+                            bs,
+                        ]
+                        + shape
+                        for shape in shapes
+                    ]
                     iu.infer_zero(
                         self,
                         name,
@@ -133,81 +157,135 @@ def _full_zero(self, dtype, shapes):
                         batch_shapes,
                         batch_shapes,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
                 # model that does not support batching
                 iu.infer_zero(
                     self,
-                    name + '_nobatch',
+                    name + "_nobatch",
                     1,
                     dtype,
                     shapes,
                     shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
 
     def test_ff1_sanity(self):
-        self._full_zero(np.float32, ([
-            1,
-        ],))
+        self._full_zero(
+            np.float32,
+            (
+                [
+                    1,
+                ],
+            ),
+        )
 
     def test_ff1(self):
-        self._full_zero(np.float32, ([
-            0,
-        ],))
+        self._full_zero(
+            np.float32,
+            (
+                [
+                    0,
+                ],
+            ),
+        )
 
     def test_ff3_sanity(self):
-        self._full_zero(np.float32, ([
-            1,
-        ], [
-            2,
-        ], [
-            1,
-        ]))
+        self._full_zero(
+            np.float32,
+            (
+                [
+                    1,
+                ],
+                [
+                    2,
+                ],
+                [
+                    1,
+                ],
+            ),
+        )
 
     def test_ff3_0(self):
-        self._full_zero(np.float32, ([
-            0,
-        ], [
-            0,
-        ], [
-            0,
-        ]))
+        self._full_zero(
+            np.float32,
+            (
+                [
+                    0,
+                ],
+                [
+                    0,
+                ],
+                [
+                    0,
+                ],
+            ),
+        )
 
     def test_ff3_1(self):
-        self._full_zero(np.float32, ([
-            0,
-        ], [
-            0,
-        ], [
-            1,
-        ]))
+        self._full_zero(
+            np.float32,
+            (
+                [
+                    0,
+                ],
+                [
+                    0,
+                ],
+                [
+                    1,
+                ],
+            ),
+        )
 
     def test_ff3_2(self):
-        self._full_zero(np.float32, ([
-            0,
-        ], [
-            1,
-        ], [
-            0,
-        ]))
+        self._full_zero(
+            np.float32,
+            (
+                [
+                    0,
+                ],
+                [
+                    1,
+                ],
+                [
+                    0,
+                ],
+            ),
+        )
 
     def test_ff3_3(self):
-        self._full_zero(np.float32, ([
-            1,
-        ], [
-            0,
-        ], [
-            0,
-        ]))
+        self._full_zero(
+            np.float32,
+            (
+                [
+                    1,
+                ],
+                [
+                    0,
+                ],
+                [
+                    0,
+                ],
+            ),
+        )
 
     def test_ff3_4(self):
-        self._full_zero(np.float32, ([
-            1,
-        ], [
-            0,
-        ], [
-            1,
-        ]))
+        self._full_zero(
+            np.float32,
+            (
+                [
+                    1,
+                ],
+                [
+                    0,
+                ],
+                [
+                    1,
+                ],
+            ),
+        )
 
     def test_hh1_sanity(self):
         self._full_zero(np.float16, ([2, 2],))
@@ -240,14 +318,24 @@ def test_hh3_4(self):
         self._full_zero(np.float16, ([1, 1], [0, 6], [2, 2]))
 
     def test_oo1_sanity(self):
-        self._full_zero(np_dtype_string, ([
-            2,
-        ],))
+        self._full_zero(
+            np_dtype_string,
+            (
+                [
+                    2,
+                ],
+            ),
+        )
 
     def test_oo1(self):
-        self._full_zero(np_dtype_string, ([
-            0,
-        ],))
+        self._full_zero(
+            np_dtype_string,
+            (
+                [
+                    0,
+                ],
+            ),
+        )
 
     def test_oo3_sanity(self):
         self._full_zero(np_dtype_string, ([2, 2], [2, 2], [1, 1]))
@@ -268,15 +356,25 @@ def test_oo3_4(self):
         self._full_zero(np_dtype_string, ([1, 1], [0, 6], [2, 2]))
 
     def test_bb1_sanity(self):
-        self._full_zero(bool, ([
-            10,
-        ],))
+        self._full_zero(
+            bool,
+            (
+                [
+                    10,
+                ],
+            ),
+        )
 
     def test_bb1_0(self):
-        self._full_zero(bool, ([
-            0,
-        ],))
+        self._full_zero(
+            bool,
+            (
+                [
+                    0,
+                ],
+            ),
+        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
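
infer_zero_test.py exercises tensors in which one dimension is 0, so the request and response carry no data. A short numpy sketch of what such shapes look like, independent of the Triton client code:

    import numpy as np

    # A shape containing 0 yields an array with zero elements but a fixed dtype,
    # which is what the *_zero models above are asked to echo back.
    empty = np.zeros((0,), dtype=np.float32)
    print(empty.shape, empty.size, empty.nbytes)   # (0,) 0 0

    batched = np.zeros((8, 0), dtype=np.float32)   # batch of 8 zero-length rows
    print(batched.shape, batched.size)             # (8, 0) 0
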
diff --git a/qa/L0_inferentia_perf_analyzer/test.sh b/qa/L0_inferentia_perf_analyzer/test.sh
old mode 100644
new mode 100755
index 21e361ee6c..1881e07f87
--- a/qa/L0_inferentia_perf_analyzer/test.sh
+++ b/qa/L0_inferentia_perf_analyzer/test.sh
@@ -25,21 +25,21 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-# First need to set up enviroment
+# First need to set up environment
 if [ ${USE_TENSORFLOW} == "1" ] && [ ${USE_PYTORCH} == "1" ] ; then
     echo " Unsupported test configuration. Only one of USE_TENSORFLOW and USE_PYTORCH can be set to 1."
     exit 0
 elif [ ${USE_TENSORFLOW} == "1" ] ; then
-    echo "Setting up enviroment with tensorflow 1"
+    echo "Setting up environment with tensorflow 1"
     source ${TRITON_PATH}/python_backend/inferentia/scripts/setup.sh -t --tensorflow-version 1
 elif [ ${USE_PYTORCH} == "1" ] ; then
-    echo "Setting up enviroment with pytorch"
+    echo "Setting up environment with pytorch"
     source ${TRITON_PATH}/python_backend/inferentia/scripts/setup.sh -p
-else 
+else
     echo " Unsupported test configuration. USE_TENSORFLOW flag is: ${USE_TENSORFLOW} and USE_PYTORCH flag is: ${USE_PYTORCH}. Only one of them can be set to 1."
     exit 0
 fi
-echo "done setting up enviroment"
+echo "done setting up environment"
 
 REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION}
 if [ "$#" -ge 1 ]; then
@@ -80,32 +80,32 @@ function create_inferentia_models () {
     for DISABLE_DEFAULT_BATCHING_FLAG in ${DISABLE_DEFAULT_BATCHING_FLAGS}; do
         for BATCHED_FLAG in ${BATCHED_FLAGS}; do
             for TEST_TYPE in ${TEST_TYPES}; do
-                CURR_GEN_SCRIPT="${GEN_SCRIPT} --model_type ${MODEL_TYPE}  
-                --triton_model_dir ${TRITON_PATH}/models_${TEST_TYPE}${BATCHED_FLAG}${TEST_FRAMEWORK}${DISABLE_DEFAULT_BATCHING_FLAG}/add-sub-1x4 
+                CURR_GEN_SCRIPT="${GEN_SCRIPT} --model_type ${MODEL_TYPE}
+                --triton_model_dir ${TRITON_PATH}/models_${TEST_TYPE}${BATCHED_FLAG}${TEST_FRAMEWORK}${DISABLE_DEFAULT_BATCHING_FLAG}/add-sub-1x4
                 --compiled_model ${COMPILED_MODEL}"
                 if [ ${DISABLE_DEFAULT_BATCHING_FLAG} == "_no_batch" ]; then
-                    CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT} 
+                    CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT}
                     --disable_batch_requests_to_neuron"
                 fi
                 if [ ${BATCHED_FLAG} == "_batched_" ]; then
                     CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT}
-                    --triton_input INPUT__0,INT64,4 INPUT__1,INT64,4 
-                    --triton_output OUTPUT__0,INT64,4 OUTPUT__1,INT64,4          
-                    --enable_dynamic_batching 
-                    --max_batch_size 1000 
-                    --preferred_batch_size 8 
+                    --triton_input INPUT__0,INT64,4 INPUT__1,INT64,4
+                    --triton_output OUTPUT__0,INT64,4 OUTPUT__1,INT64,4
+                    --enable_dynamic_batching
+                    --max_batch_size 1000
+                    --preferred_batch_size 8
                     --max_queue_delay_microseconds 100"
                 else
                     CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT}
-                    --triton_input INPUT__0,INT64,-1x4 INPUT__1,INT64,-1x4 
+                    --triton_input INPUT__0,INT64,-1x4 INPUT__1,INT64,-1x4
                     --triton_output OUTPUT__0,INT64,-1x4 OUTPUT__1,INT64,-1x4"
                 fi
                 if [ ${TEST_TYPE} == "single" ]; then
-                    CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT}   
+                    CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT}
                     --neuron_core_range 0:0"
                 elif [ ${TEST_TYPE} == "multiple" ]; then
-                    CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT} 
-                    --triton_model_instance_count 3 
+                    CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT}
+                    --triton_model_instance_count 3
                     --neuron_core_range 0:7"
                 fi
                 echo ${CURR_GEN_SCRIPT}
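
The only change to this script is the removal of trailing whitespace inside the multi-line command strings, which the newly added whitespace hook flags. A minimal sketch of doing the same cleanup by hand, assuming a single file and default newline handling ("test.sh" is a placeholder path):

    # Strip trailing whitespace from every line of a file, roughly what the
    # trailing-whitespace hook does.
    from pathlib import Path

    path = Path("test.sh")
    lines = path.read_text().splitlines()
    path.write_text("\n".join(line.rstrip() for line in lines) + "\n")
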
diff --git a/qa/L0_io/test.sh b/qa/L0_io/test.sh
index fc49a4d537..1f7d77ffcc 100755
--- a/qa/L0_io/test.sh
+++ b/qa/L0_io/test.sh
@@ -156,7 +156,7 @@ cp $MODELSDIR/libtorch_multi_device/config.pbtxt $MODELSDIR/libtorch_multi_gpu/.
     sed -i "s/name: \"libtorch_multi_device\"/name: \"libtorch_multi_gpu\"/" config.pbtxt)
 
 set +e
-python3 gen_libtorch_model.py >> $CLIENT_LOG 2>&1 
+python3 gen_libtorch_model.py >> $CLIENT_LOG 2>&1
 if [ $? -ne 0 ]; then
     echo -e "\n***\n*** Error when generating libtorch models. \n***"
     cat $CLIENT_LOG
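
The shell snippet above appends the generator's output to $CLIENT_LOG and then checks $? for failure. For comparison, an equivalent check written in Python with the standard library; the script name mirrors the shell snippet and "client.log" stands in for $CLIENT_LOG:

    # Run the generator, append stdout/stderr to the log, and fail on a
    # non-zero exit code, mirroring the shell logic above.
    import subprocess
    import sys

    with open("client.log", "a") as log:
        result = subprocess.run(
            ["python3", "gen_libtorch_model.py"], stdout=log, stderr=subprocess.STDOUT
        )
    if result.returncode != 0:
        print("Error when generating libtorch models.", file=sys.stderr)
        sys.exit(1)
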
diff --git a/qa/L0_java_memory_growth/MemoryGrowthTest.java b/qa/L0_java_memory_growth/MemoryGrowthTest.java
index d5a8092872..3060b6542c 100644
--- a/qa/L0_java_memory_growth/MemoryGrowthTest.java
+++ b/qa/L0_java_memory_growth/MemoryGrowthTest.java
@@ -24,880 +24,833 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import static org.bytedeco.tritonserver.global.tritonserver.*;
+
+import com.google.gson.*;
 import java.io.*;
 import java.util.*;
 import java.util.concurrent.*;
-import com.google.gson.*;
 import org.bytedeco.javacpp.*;
 import org.bytedeco.tritonserver.tritonserver.*;
-import static org.bytedeco.tritonserver.global.tritonserver.*;
 
 public class MemoryGrowthTest {
-    static final double TRITON_MIN_COMPUTE_CAPABILITY = 6.0;
-    private static boolean done = false;
-    static float max_growth_allowed = .10f;
-    static int max_mem_allowed = 30;
-
-    static void FAIL(String MSG) {
-        System.err.println("failure: " + MSG);
-        System.exit(1);
-    }
-
-    static void FAIL_IF_ERR(TRITONSERVER_Error err__, String MSG) {
-        if (err__ != null) {
-            System.err.println("error: " + MSG + ":"
-                             + TRITONSERVER_ErrorCodeString(err__) + " - "
-                             + TRITONSERVER_ErrorMessage(err__));
-            TRITONSERVER_ErrorDelete(err__);
-            System.exit(1);
-        }
-    }
-
-    static boolean enforce_memory_type = false;
-    static int requested_memory_type;
-    // Parameters for percentile range to include (exclude outliers)
-    static final int max_percentile = 90;
-    static final int min_percentile = 10;
-
-    static class TRITONSERVER_ServerDeleter extends TRITONSERVER_Server {
-        public TRITONSERVER_ServerDeleter(TRITONSERVER_Server p) { super(p); deallocator(new DeleteDeallocator(this)); }
-        protected static class DeleteDeallocator extends TRITONSERVER_Server implements Deallocator {
-            DeleteDeallocator(Pointer p) { super(p); }
-            @Override public void deallocate() { TRITONSERVER_ServerDelete(this); }
-        }
-    }
-
-    static void
-    Usage(String msg)
-    {
-      if (msg != null) {
-        System.err.println(msg);
-      }
-
-      System.err.println("Usage: java " + MemoryGrowthTest.class.getSimpleName() + " [options]");
-      System.err.println("\t-i Set number of iterations");
-      System.err.println("\t-m <\"system\"|\"pinned\"|gpu>"
-                       + " Enforce the memory type for input and output tensors."
-                       + " If not specified, inputs will be in system memory and outputs"
-                       + " will be based on the model's preferred type.");
-      System.err.println("\t-v Enable verbose logging");
-      System.err.println("\t-r [model repository absolute path]");
-      System.err.println("\t--max-growth Specify maximum allowed memory growth (%)");
-      System.err.println("\t--max-memory Specify maximum allowed memory (MB)");
-
+  static final double TRITON_MIN_COMPUTE_CAPABILITY = 6.0;
+  private static boolean done = false;
+  static float max_growth_allowed = .10f;
+  static int max_mem_allowed = 30;
+
+  static void FAIL(String MSG)
+  {
+    System.err.println("failure: " + MSG);
+    System.exit(1);
+  }
+
+  static void FAIL_IF_ERR(TRITONSERVER_Error err__, String MSG)
+  {
+    if (err__ != null) {
+      System.err.println(
+          "error: " + MSG + ":" + TRITONSERVER_ErrorCodeString(err__) + " - "
+          + TRITONSERVER_ErrorMessage(err__));
+      TRITONSERVER_ErrorDelete(err__);
       System.exit(1);
     }
+  }
 
-    static class ResponseAlloc extends TRITONSERVER_ResponseAllocatorAllocFn_t {
-        @Override public TRITONSERVER_Error call (
-            TRITONSERVER_ResponseAllocator allocator, String tensor_name,
-            long byte_size, int preferred_memory_type,
-            long preferred_memory_type_id, Pointer userp, PointerPointer buffer,
-            PointerPointer buffer_userp, IntPointer actual_memory_type,
-            LongPointer actual_memory_type_id)
-        {
-          // Initially attempt to make the actual memory type and id that we
-          // allocate be the same as preferred memory type
-          actual_memory_type.put(0, preferred_memory_type);
-          actual_memory_type_id.put(0, preferred_memory_type_id);
-
-          // If 'byte_size' is zero just return 'buffer' == nullptr, we don't
-          // need to do any other book-keeping.
-          if (byte_size == 0) {
-            buffer.put(0, null);
-            buffer_userp.put(0, null);
-          } else {
-            Pointer allocated_ptr = new Pointer();
-            if (enforce_memory_type) {
-              actual_memory_type.put(0, requested_memory_type);
-            }
-
-            actual_memory_type.put(0, TRITONSERVER_MEMORY_CPU);
-            allocated_ptr = Pointer.malloc(byte_size);
-
-            // Pass the tensor name with buffer_userp so we can show it when
-            // releasing the buffer.
-            if (!allocated_ptr.isNull()) {
-              buffer.put(0, allocated_ptr);
-              buffer_userp.put(0, Loader.newGlobalRef(tensor_name));
-            }
-          }
+  static boolean enforce_memory_type = false;
+  static int requested_memory_type;
+  // Parameters for percentile range to include (exclude outliers)
+  static final int max_percentile = 90;
+  static final int min_percentile = 10;
 
-          return null;  // Success
-        }
+  static class TRITONSERVER_ServerDeleter extends TRITONSERVER_Server {
+    public TRITONSERVER_ServerDeleter(TRITONSERVER_Server p)
+    {
+      super(p);
+      deallocator(new DeleteDeallocator(this));
     }
-
-    static class ResponseRelease extends TRITONSERVER_ResponseAllocatorReleaseFn_t {
-        @Override public TRITONSERVER_Error call (
-            TRITONSERVER_ResponseAllocator allocator, Pointer buffer, Pointer buffer_userp,
-            long byte_size, int memory_type, long memory_type_id)
-        {
-          String name = null;
-          if (buffer_userp != null) {
-            name = (String)Loader.accessGlobalRef(buffer_userp);
-          } else {
-            name = "";
-          }
-          Pointer.free(buffer);
-          Loader.deleteGlobalRef(buffer_userp);
-
-          return null;  // Success
-        }
+    protected static class DeleteDeallocator extends TRITONSERVER_Server implements Deallocator {
+      DeleteDeallocator(Pointer p) { super(p); }
+      @Override public void deallocate() { TRITONSERVER_ServerDelete(this); }
     }
+  }
 
-    static class InferRequestComplete extends TRITONSERVER_InferenceRequestReleaseFn_t {
-        @Override public void call (
-            TRITONSERVER_InferenceRequest request, int flags, Pointer userp)
-        {
-          // We reuse the request so we don't delete it here.
-        }
+  static void Usage(String msg)
+  {
+    if (msg != null) {
+      System.err.println(msg);
     }
 
-    static class InferResponseComplete extends TRITONSERVER_InferenceResponseCompleteFn_t {
-        @Override public void call (
-            TRITONSERVER_InferenceResponse response, int flags, Pointer userp)
-        {
-          if (response != null) {
-            // Send 'response' to the future.
-            futures.get(userp).complete(response);
-          }
+    System.err.println("Usage: java " + MemoryGrowthTest.class.getSimpleName() + " [options]");
+    System.err.println("\t-i Set number of iterations");
+    System.err.println(
+        "\t-m <\"system\"|\"pinned\"|gpu>"
+        + " Enforce the memory type for input and output tensors."
+        + " If not specified, inputs will be in system memory and outputs"
+        + " will be based on the model's preferred type.");
+    System.err.println("\t-v Enable verbose logging");
+    System.err.println("\t-r [model repository absolute path]");
+    System.err.println("\t--max-growth Specify maximum allowed memory growth (%)");
+    System.err.println("\t--max-memory Specify maximum allowed memory (MB)");
+
+    System.exit(1);
+  }
+
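For reference, the options listed by Usage() can be exercised with an invocation along these lines (the classpath, model repository path, and limits are placeholders chosen for illustration, not values taken from this change):

java -cp <triton-java-bindings-and-gson-classpath> MemoryGrowthTest -r /workspace/models -i 100000 -c --max-growth 10 --max-memory 30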
+  static class ResponseAlloc extends TRITONSERVER_ResponseAllocatorAllocFn_t {
+    @Override
+    public TRITONSERVER_Error call(
+        TRITONSERVER_ResponseAllocator allocator, String tensor_name, long byte_size,
+        int preferred_memory_type, long preferred_memory_type_id, Pointer userp,
+        PointerPointer buffer, PointerPointer buffer_userp, IntPointer actual_memory_type,
+        LongPointer actual_memory_type_id)
+    {
+      // Initially attempt to make the actual memory type and id that we
+      // allocate be the same as preferred memory type
+      actual_memory_type.put(0, preferred_memory_type);
+      actual_memory_type_id.put(0, preferred_memory_type_id);
+
+      // If 'byte_size' is zero just return 'buffer' == nullptr, we don't
+      // need to do any other book-keeping.
+      if (byte_size == 0) {
+        buffer.put(0, null);
+        buffer_userp.put(0, null);
+      } else {
+        Pointer allocated_ptr = new Pointer();
+        if (enforce_memory_type) {
+          actual_memory_type.put(0, requested_memory_type);
         }
-    }
 
-    static ConcurrentHashMap<Pointer, CompletableFuture<TRITONSERVER_InferenceResponse>> futures = new ConcurrentHashMap<>();
-    static ResponseAlloc responseAlloc = new ResponseAlloc();
-    static ResponseRelease responseRelease = new ResponseRelease();
-    static InferRequestComplete inferRequestComplete = new InferRequestComplete();
-    static InferResponseComplete inferResponseComplete = new InferResponseComplete();
+        actual_memory_type.put(0, TRITONSERVER_MEMORY_CPU);
+        allocated_ptr = Pointer.malloc(byte_size);
 
-    static TRITONSERVER_Error
-    ParseModelMetadata(
-        JsonObject model_metadata, boolean[] is_int,
-        boolean[] is_torch_model)
-    {
-      String seen_data_type = null;
-      for (JsonElement input_element : model_metadata.get("inputs").getAsJsonArray()) {
-        JsonObject input = input_element.getAsJsonObject();
-        if (!input.get("datatype").getAsString().equals("INT32") &&
-            !input.get("datatype").getAsString().equals("FP32")) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_UNSUPPORTED,
-              "simple lib example only supports model with data type INT32 or " +
-              "FP32");
-        }
-        if (seen_data_type == null) {
-          seen_data_type = input.get("datatype").getAsString();
-        } else if (!seen_data_type.equals(input.get("datatype").getAsString())) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_INVALID_ARG,
-              "the inputs and outputs of 'simple' model must have the data type");
-        }
-      }
-      for (JsonElement output_element : model_metadata.get("outputs").getAsJsonArray()) {
-        JsonObject output = output_element.getAsJsonObject();
-        if (!output.get("datatype").getAsString().equals("INT32") &&
-            !output.get("datatype").getAsString().equals("FP32")) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_UNSUPPORTED,
-              "simple lib example only supports model with data type INT32 or " +
-              "FP32");
-        } else if (!seen_data_type.equals(output.get("datatype").getAsString())) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_INVALID_ARG,
-              "the inputs and outputs of 'simple' model must have the data type");
+        // Pass the tensor name with buffer_userp so we can show it when
+        // releasing the buffer.
+        if (!allocated_ptr.isNull()) {
+          buffer.put(0, allocated_ptr);
+          buffer_userp.put(0, Loader.newGlobalRef(tensor_name));
         }
       }
 
-      is_int[0] = seen_data_type.equals("INT32");
-      is_torch_model[0] =
-          model_metadata.get("platform").getAsString().equals("pytorch_libtorch");
-      return null;
+      return null; // Success
     }
+  }
 
-    static void
-    GenerateInputData(
-        IntPointer[] input0_data, IntPointer[] input1_data)
+  static class ResponseRelease extends TRITONSERVER_ResponseAllocatorReleaseFn_t {
+    @Override
+    public TRITONSERVER_Error call(
+        TRITONSERVER_ResponseAllocator allocator, Pointer buffer, Pointer buffer_userp,
+        long byte_size, int memory_type, long memory_type_id)
     {
-      input0_data[0] = new IntPointer(16);
-      input1_data[0] = new IntPointer(16);
-      for (int i = 0; i < 16; ++i) {
-        input0_data[0].put(i, i);
-        input1_data[0].put(i, 1);
+      String name = null;
+      if (buffer_userp != null) {
+        name = (String) Loader.accessGlobalRef(buffer_userp);
+      } else {
+        name = "";
       }
+      Pointer.free(buffer);
+      Loader.deleteGlobalRef(buffer_userp);
+
+      return null; // Success
     }
+  }
 
-    static void
-    GenerateInputData(
-        FloatPointer[] input0_data, FloatPointer[] input1_data)
+  static class InferRequestComplete extends TRITONSERVER_InferenceRequestReleaseFn_t {
+    @Override public void call(TRITONSERVER_InferenceRequest request, int flags, Pointer userp)
     {
-      input0_data[0] = new FloatPointer(16);
-      input1_data[0] = new FloatPointer(16);
-      for (int i = 0; i < 16; ++i) {
-        input0_data[0].put(i, i);
-        input1_data[0].put(i, 1);
-      }
+      // We reuse the request so we don't delete it here.
     }
+  }
 
-    static void
-    CompareResult(
-        String output0_name, String output1_name,
-        IntPointer input0, IntPointer input1, IntPointer output0,
-        IntPointer output1)
+  static class InferResponseComplete extends TRITONSERVER_InferenceResponseCompleteFn_t {
+    @Override public void call(TRITONSERVER_InferenceResponse response, int flags, Pointer userp)
     {
-      for (int i = 0; i < 16; ++i) {
-        if ((input0.get(i) + input1.get(i)) != output0.get(i)) {
-          FAIL("incorrect sum in " + output0_name);
-        }
-        if ((input0.get(i) - input1.get(i)) != output1.get(i)) {
-          FAIL("incorrect difference in " + output1_name);
-        }
+      if (response != null) {
+        // Send 'response' to the future.
+        futures.get(userp).complete(response);
+      }
+    }
+  }
+
+  static ConcurrentHashMap<Pointer, CompletableFuture<TRITONSERVER_InferenceResponse>> futures =
+      new ConcurrentHashMap<>();
+  static ResponseAlloc responseAlloc = new ResponseAlloc();
+  static ResponseRelease responseRelease = new ResponseRelease();
+  static InferRequestComplete inferRequestComplete = new InferRequestComplete();
+  static InferResponseComplete inferResponseComplete = new InferResponseComplete();
+
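The futures map and the callback instances above implement a promise-per-request wait for asynchronous inference: the request pointer is the map key, the same pointer is passed as userp so InferResponseComplete can complete the matching promise, and the caller blocks on the future. A hypothetical helper distilling that pattern (the Triton calls are the same ones used in RunInference below; the helper itself is not part of this change):

  static TRITONSERVER_InferenceResponse inferAndWait(
      TRITONSERVER_ServerDeleter server, TRITONSERVER_InferenceRequest irequest,
      TRITONSERVER_ResponseAllocator allocator) throws Exception
  {
    CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
    futures.put(irequest, completed); // keyed by the request pointer, looked up via userp
    FAIL_IF_ERR(
        TRITONSERVER_InferenceRequestSetResponseCallback(
            irequest, allocator, null /* response_allocator_userp */, inferResponseComplete,
            irequest),
        "setting response callback");
    FAIL_IF_ERR(
        TRITONSERVER_ServerInferAsync(server, irequest, null /* trace */), "running inference");
    TRITONSERVER_InferenceResponse completed_response = completed.get(); // blocks until callback
    futures.remove(irequest);
    return completed_response;
  }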
+  static TRITONSERVER_Error ParseModelMetadata(
+      JsonObject model_metadata, boolean[] is_int, boolean[] is_torch_model)
+  {
+    String seen_data_type = null;
+    for (JsonElement input_element : model_metadata.get("inputs").getAsJsonArray()) {
+      JsonObject input = input_element.getAsJsonObject();
+      if (!input.get("datatype").getAsString().equals("INT32")
+          && !input.get("datatype").getAsString().equals("FP32")) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_UNSUPPORTED,
+            "simple lib example only supports model with data type INT32 or "
+                + "FP32");
+      }
+      if (seen_data_type == null) {
+        seen_data_type = input.get("datatype").getAsString();
+      } else if (!seen_data_type.equals(input.get("datatype").getAsString())) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INVALID_ARG,
+            "the inputs and outputs of 'simple' model must have the data type");
+      }
+    }
+    for (JsonElement output_element : model_metadata.get("outputs").getAsJsonArray()) {
+      JsonObject output = output_element.getAsJsonObject();
+      if (!output.get("datatype").getAsString().equals("INT32")
+          && !output.get("datatype").getAsString().equals("FP32")) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_UNSUPPORTED,
+            "simple lib example only supports model with data type INT32 or "
+                + "FP32");
+      } else if (!seen_data_type.equals(output.get("datatype").getAsString())) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INVALID_ARG,
+            "the inputs and outputs of 'simple' model must have the data type");
       }
     }
 
-    static void
-    CompareResult(
-        String output0_name, String output1_name,
-        FloatPointer input0, FloatPointer input1, FloatPointer output0,
-        FloatPointer output1)
-    {
-      for (int i = 0; i < 16; ++i) {
-        if ((input0.get(i) + input1.get(i)) != output0.get(i)) {
-          FAIL("incorrect sum in " + output0_name);
-        }
-        if ((input0.get(i) - input1.get(i)) != output1.get(i)) {
-          FAIL("incorrect difference in " + output1_name);
-        }
+    is_int[0] = seen_data_type.equals("INT32");
+    is_torch_model[0] = model_metadata.get("platform").getAsString().equals("pytorch_libtorch");
+    return null;
+  }
+
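ParseModelMetadata above consumes Triton's standard model-metadata JSON: a name, a platform, and inputs/outputs arrays with a datatype per tensor. As a rough illustration of the shape it expects, a hand-written metadata string for an INT32 "simple" model can be fed through it as below (the literal is invented for illustration; real metadata comes from TRITONSERVER_ServerModelMetadata, as in main() further down):

  String example_metadata =
      "{\"name\":\"simple\",\"platform\":\"tensorflow_graphdef\","
      + "\"inputs\":[{\"name\":\"INPUT0\",\"datatype\":\"INT32\",\"shape\":[1,16]},"
      + "{\"name\":\"INPUT1\",\"datatype\":\"INT32\",\"shape\":[1,16]}],"
      + "\"outputs\":[{\"name\":\"OUTPUT0\",\"datatype\":\"INT32\",\"shape\":[1,16]},"
      + "{\"name\":\"OUTPUT1\",\"datatype\":\"INT32\",\"shape\":[1,16]}]}";
  boolean[] is_int = {false}, is_torch_model = {false};
  TRITONSERVER_Error err = ParseModelMetadata(
      new JsonParser().parse(example_metadata).getAsJsonObject(), is_int, is_torch_model);
  // Expected for this input: err == null, is_int[0] == true, is_torch_model[0] == false.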
+  static void GenerateInputData(IntPointer[] input0_data, IntPointer[] input1_data)
+  {
+    input0_data[0] = new IntPointer(16);
+    input1_data[0] = new IntPointer(16);
+    for (int i = 0; i < 16; ++i) {
+      input0_data[0].put(i, i);
+      input1_data[0].put(i, 1);
+    }
+  }
+
+  static void GenerateInputData(FloatPointer[] input0_data, FloatPointer[] input1_data)
+  {
+    input0_data[0] = new FloatPointer(16);
+    input1_data[0] = new FloatPointer(16);
+    for (int i = 0; i < 16; ++i) {
+      input0_data[0].put(i, i);
+      input1_data[0].put(i, 1);
+    }
+  }
+
+  static void CompareResult(
+      String output0_name, String output1_name, IntPointer input0, IntPointer input1,
+      IntPointer output0, IntPointer output1)
+  {
+    for (int i = 0; i < 16; ++i) {
+      if ((input0.get(i) + input1.get(i)) != output0.get(i)) {
+        FAIL("incorrect sum in " + output0_name);
+      }
+      if ((input0.get(i) - input1.get(i)) != output1.get(i)) {
+        FAIL("incorrect difference in " + output1_name);
+      }
+    }
+  }
+
+  static void CompareResult(
+      String output0_name, String output1_name, FloatPointer input0, FloatPointer input1,
+      FloatPointer output0, FloatPointer output1)
+  {
+    for (int i = 0; i < 16; ++i) {
+      if ((input0.get(i) + input1.get(i)) != output0.get(i)) {
+        FAIL("incorrect sum in " + output0_name);
+      }
+      if ((input0.get(i) - input1.get(i)) != output1.get(i)) {
+        FAIL("incorrect difference in " + output1_name);
       }
     }
+  }
+
+  static void Check(
+      TRITONSERVER_InferenceResponse response, Pointer input0_data, Pointer input1_data,
+      String output0, String output1, long expected_byte_size, int expected_datatype,
+      boolean is_int)
+  {
+    HashMap<String, BytePointer> output_data = new HashMap<>();
+
+    int[] output_count = {0};
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceResponseOutputCount(response, output_count),
+        "getting number of response outputs");
+    if (output_count[0] != 2) {
+      FAIL("expecting 2 response outputs, got " + output_count[0]);
+    }
 
-    static void
-    Check(
-        TRITONSERVER_InferenceResponse response,
-        Pointer input0_data, Pointer input1_data,
-        String output0, String output1,
-        long expected_byte_size,
-        int expected_datatype, boolean is_int)
-    {
-      HashMap<String, BytePointer> output_data = new HashMap<>();
+    for (int idx = 0; idx < output_count[0]; ++idx) {
+      BytePointer cname = new BytePointer((Pointer) null);
+      IntPointer datatype = new IntPointer(1);
+      LongPointer shape = new LongPointer((Pointer) null);
+      LongPointer dim_count = new LongPointer(1);
+      Pointer base = new Pointer();
+      SizeTPointer byte_size = new SizeTPointer(1);
+      IntPointer memory_type = new IntPointer(1);
+      LongPointer memory_type_id = new LongPointer(1);
+      Pointer userp = new Pointer();
 
-      int[] output_count = {0};
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceResponseOutputCount(response, output_count),
-          "getting number of response outputs");
-      if (output_count[0] != 2) {
-        FAIL("expecting 2 response outputs, got " + output_count[0]);
-      }
-
-      for (int idx = 0; idx < output_count[0]; ++idx) {
-        BytePointer cname = new BytePointer((Pointer)null);
-        IntPointer datatype = new IntPointer(1);
-        LongPointer shape = new LongPointer((Pointer)null);
-        LongPointer dim_count = new LongPointer(1);
-        Pointer base = new Pointer();
-        SizeTPointer byte_size = new SizeTPointer(1);
-        IntPointer memory_type = new IntPointer(1);
-        LongPointer memory_type_id = new LongPointer(1);
-        Pointer userp = new Pointer();
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseOutput(
-                response, idx, cname, datatype, shape, dim_count, base,
-                byte_size, memory_type, memory_type_id, userp),
-            "getting output info");
-
-        if (cname.isNull()) {
-          FAIL("unable to get output name");
-        }
-
-        String name = cname.getString();
-        if ((!name.equals(output0)) && (!name.equals(output1))) {
-          FAIL("unexpected output '" + name + "'");
-        }
-
-        if ((dim_count.get() != 2) || (shape.get(0) != 1) || (shape.get(1) != 16)) {
-          FAIL("unexpected shape for '" + name + "'");
-        }
+          TRITONSERVER_InferenceResponseOutput(
+              response, idx, cname, datatype, shape, dim_count, base, byte_size, memory_type,
+              memory_type_id, userp),
+          "getting output info");
 
-        if (datatype.get() != expected_datatype) {
-          FAIL(
-              "unexpected datatype '" +
-              TRITONSERVER_DataTypeString(datatype.get()) + "' for '" +
-              name + "'");
-        }
-
-        if (byte_size.get() != expected_byte_size) {
-          FAIL(
-              "unexpected byte-size, expected " +
-              expected_byte_size + ", got " +
-              byte_size.get() + " for " + name);
-        }
-
-        if (enforce_memory_type && (memory_type.get() != requested_memory_type)) {
-          FAIL(
-              "unexpected memory type, expected to be allocated in " +
-              TRITONSERVER_MemoryTypeString(requested_memory_type) +
-              ", got " + TRITONSERVER_MemoryTypeString(memory_type.get()) +
-              ", id " + memory_type_id.get() + " for " + name);
-        }
+      if (cname.isNull()) {
+        FAIL("unable to get output name");
+      }
 
-        // We make a copy of the data here... which we could avoid for
-        // performance reasons but ok for this simple example.
-        BytePointer odata = new BytePointer(byte_size.get());
-        output_data.put(name, odata);
-        odata.put(base.limit(byte_size.get()));
+      String name = cname.getString();
+      if ((!name.equals(output0)) && (!name.equals(output1))) {
+        FAIL("unexpected output '" + name + "'");
       }
 
-      if (is_int) {
-        CompareResult(
-            output0, output1, new IntPointer(input0_data), new IntPointer(input1_data),
-            new IntPointer(output_data.get(output0)), new IntPointer(output_data.get(output1)));
-      } else {
-        CompareResult(
-            output0, output1, new FloatPointer(input0_data), new FloatPointer(input1_data),
-            new FloatPointer(output_data.get(output0)), new FloatPointer(output_data.get(output1)));
-      }
-    }
-
-    /**
-    Returns whether the memory growth is within the acceptable range
-    @param  max_float_allowed     Maximum allowed memory growth (%)
-    @param  max_mem_allowed       Maximum allowed memory (MB)
-     */
-    static boolean
-    ValidateMemoryGrowth(float max_growth_allowed, int max_mem_allowed){
-      // Allocate list starting capacity to hold up to 24 hours worth of snapshots.
-      List<Double> memory_snapshots = new ArrayList<Double>(20000);
-      while(!done){
-        try {
-          Thread.sleep(5000);
-        } catch (InterruptedException e){
-          System.out.println("Memory growth validation interrupted.");
-        }
-        System.gc();
-        double snapshot = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory();
-        memory_snapshots.add(snapshot);
-        System.out.println("Memory allocated (MB):" + snapshot/1E6);
+      if ((dim_count.get() != 2) || (shape.get(0) != 1) || (shape.get(1) != 16)) {
+        FAIL("unexpected shape for '" + name + "'");
       }
-      if(memory_snapshots.size() < 5){
-        System.out.println("Error: Not enough snapshots, found " + memory_snapshots.size()
-        + " snapshots");
-        return false;
+
+      if (datatype.get() != expected_datatype) {
+        FAIL(
+            "unexpected datatype '" + TRITONSERVER_DataTypeString(datatype.get()) + "' for '" + name
+            + "'");
       }
 
-      // Measure memory growth without outliers by taking difference
-      // between 90th percentile and 10th percentile memory usage.
-      final double bytes_in_mb = 1E6;
-      Collections.sort(memory_snapshots);
-      int index_max = ((int) Math.ceil(max_percentile / 100.0 * memory_snapshots.size())) - 1;
-      int index_min = ((int) Math.ceil(min_percentile / 100.0 * memory_snapshots.size())) - 1;
-      double memory_allocation_delta = memory_snapshots.get(index_max) - memory_snapshots.get(index_min);
-      double memory_allocation_delta_mb = memory_allocation_delta / bytes_in_mb;
-      double memory_allocation_delta_percent = memory_allocation_delta / memory_snapshots.get(index_max);
+      if (byte_size.get() != expected_byte_size) {
+        FAIL(
+            "unexpected byte-size, expected " + expected_byte_size + ", got " + byte_size.get()
+            + " for " + name);
+      }
 
-      System.out.println("Change in memory allocation (MB): " +
-          memory_allocation_delta_mb + ", " +
-          (memory_allocation_delta_percent * 100) + "%");
+      if (enforce_memory_type && (memory_type.get() != requested_memory_type)) {
+        FAIL(
+            "unexpected memory type, expected to be allocated in "
+            + TRITONSERVER_MemoryTypeString(requested_memory_type) + ", got "
+            + TRITONSERVER_MemoryTypeString(memory_type.get()) + ", id " + memory_type_id.get()
+            + " for " + name);
+      }
 
-      boolean passed = true;
+      // We make a copy of the data here... which we could avoid for
+      // performance reasons but ok for this simple example.
+      BytePointer odata = new BytePointer(byte_size.get());
+      output_data.put(name, odata);
+      odata.put(base.limit(byte_size.get()));
+    }
 
-      if(memory_allocation_delta_percent >= max_growth_allowed){
-        passed = false;
-        System.out.println("Exceeded allowed memory growth (" +
-          (max_growth_allowed * 100) + "%)");
+    if (is_int) {
+      CompareResult(
+          output0, output1, new IntPointer(input0_data), new IntPointer(input1_data),
+          new IntPointer(output_data.get(output0)), new IntPointer(output_data.get(output1)));
+    } else {
+      CompareResult(
+          output0, output1, new FloatPointer(input0_data), new FloatPointer(input1_data),
+          new FloatPointer(output_data.get(output0)), new FloatPointer(output_data.get(output1)));
+    }
+  }
+
+  /**
+  Returns whether the memory growth is within the acceptable range
+  @param  max_growth_allowed    Maximum allowed memory growth (%)
+  @param  max_mem_allowed       Maximum allowed memory (MB)
+   */
+  static boolean ValidateMemoryGrowth(float max_growth_allowed, int max_mem_allowed)
+  {
+    // Allocate list starting capacity to hold up to 24 hours worth of snapshots.
+    List<Double> memory_snapshots = new ArrayList<Double>(20000);
+    while (!done) {
+      try {
+        Thread.sleep(5000);
       }
-
-      if((memory_snapshots.get(index_max) / bytes_in_mb) >= max_mem_allowed){
-        passed = false;
-        System.out.println("Exceeded allowed memory (" + max_mem_allowed + 
-          "MB), got " + (memory_snapshots.get(index_max) / bytes_in_mb) + "MB");
+      catch (InterruptedException e) {
+        System.out.println("Memory growth validation interrupted.");
       }
-      return passed;
+      System.gc();
+      double snapshot = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory();
+      memory_snapshots.add(snapshot);
+      System.out.println("Memory allocated (MB):" + snapshot / 1E6);
+    }
+    if (memory_snapshots.size() < 5) {
+      System.out.println(
+          "Error: Not enough snapshots, found " + memory_snapshots.size() + " snapshots");
+      return false;
     }
 
-    static void
-    RunInference(TRITONSERVER_ServerDeleter server, String model_name, boolean[] is_int, boolean[] is_torch_model, boolean check_accuracy)
-    throws Exception
-    {
-      // Create the allocator that will be used to allocate buffers for
-      // the result tensors.
-      TRITONSERVER_ResponseAllocator allocator = new TRITONSERVER_ResponseAllocator(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_ResponseAllocatorNew(
-              allocator, responseAlloc, responseRelease, null /* start_fn */),
-          "creating response allocator");
-
-      // Inference
-      TRITONSERVER_InferenceRequest irequest = new TRITONSERVER_InferenceRequest(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestNew(
-              irequest, server, model_name, -1 /* model_version */),
-          "creating inference request");
+    // Measure memory growth without outliers by taking difference
+    // between 90th percentile and 10th percentile memory usage.
+    final double bytes_in_mb = 1E6;
+    Collections.sort(memory_snapshots);
+    int index_max = ((int) Math.ceil(max_percentile / 100.0 * memory_snapshots.size())) - 1;
+    int index_min = ((int) Math.ceil(min_percentile / 100.0 * memory_snapshots.size())) - 1;
+    double memory_allocation_delta =
+        memory_snapshots.get(index_max) - memory_snapshots.get(index_min);
+    double memory_allocation_delta_mb = memory_allocation_delta / bytes_in_mb;
+    double memory_allocation_delta_percent =
+        memory_allocation_delta / memory_snapshots.get(index_max);
+
+    System.out.println(
+        "Change in memory allocation (MB): " + memory_allocation_delta_mb + ", "
+        + (memory_allocation_delta_percent * 100) + "%");
+
+    boolean passed = true;
+
+    if (memory_allocation_delta_percent >= max_growth_allowed) {
+      passed = false;
+      System.out.println("Exceeded allowed memory growth (" + (max_growth_allowed * 100) + "%)");
+    }
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestSetId(irequest, "my_request_id"),
-          "setting ID for the request");
+    if ((memory_snapshots.get(index_max) / bytes_in_mb) >= max_mem_allowed) {
+      passed = false;
+      System.out.println(
+          "Exceeded allowed memory (" + max_mem_allowed + "MB), got "
+          + (memory_snapshots.get(index_max) / bytes_in_mb) + "MB");
+    }
+    return passed;
+  }
+
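To make the percentile arithmetic above concrete, here is a hypothetical run with ten sorted snapshots (the values are invented, in bytes):

  // 90th/10th percentile indices for 10 samples: ceil(0.9 * 10) - 1 = 8, ceil(0.1 * 10) - 1 = 0.
  double[] snapshots = {
      20.0e6, 20.1e6, 20.2e6, 20.3e6, 20.4e6, 20.5e6, 20.6e6, 20.8e6, 21.0e6, 30.0e6};
  double delta = snapshots[8] - snapshots[0]; // 1.0e6 bytes = 1 MB
  double growth = delta / snapshots[8];       // ~0.048, i.e. roughly 4.8% growth
  // With the defaults above (10% growth, 30 MB) this run passes: 4.8% < 10% and 21 MB < 30 MB.
  // The 30 MB outlier at the top of the list is excluded by the 90th-percentile cut-off.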
+  static void RunInference(
+      TRITONSERVER_ServerDeleter server, String model_name, boolean[] is_int,
+      boolean[] is_torch_model, boolean check_accuracy) throws Exception
+  {
+    // Create the allocator that will be used to allocate buffers for
+    // the result tensors.
+    TRITONSERVER_ResponseAllocator allocator = new TRITONSERVER_ResponseAllocator(null);
+    FAIL_IF_ERR(
+        TRITONSERVER_ResponseAllocatorNew(
+            allocator, responseAlloc, responseRelease, null /* start_fn */),
+        "creating response allocator");
+
+    // Inference
+    TRITONSERVER_InferenceRequest irequest = new TRITONSERVER_InferenceRequest(null);
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestNew(irequest, server, model_name, -1 /* model_version */),
+        "creating inference request");
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestSetId(irequest, "my_request_id"),
+        "setting ID for the request");
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestSetReleaseCallback(
+            irequest, inferRequestComplete, null /* request_release_userp */),
+        "setting request release callback");
+
+    // Inputs
+    String input0 = is_torch_model[0] ? "INPUT__0" : "INPUT0";
+    String input1 = is_torch_model[0] ? "INPUT__1" : "INPUT1";
+
+    long[] input0_shape = {1, 16};
+    long[] input1_shape = {1, 16};
+
+    int datatype = (is_int[0]) ? TRITONSERVER_TYPE_INT32 : TRITONSERVER_TYPE_FP32;
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAddInput(
+            irequest, input0, datatype, input0_shape, input0_shape.length),
+        "setting input 0 meta-data for the request");
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAddInput(
+            irequest, input1, datatype, input1_shape, input1_shape.length),
+        "setting input 1 meta-data for the request");
+
+    String output0 = is_torch_model[0] ? "OUTPUT__0" : "OUTPUT0";
+    String output1 = is_torch_model[0] ? "OUTPUT__1" : "OUTPUT1";
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output0),
+        "requesting output 0 for the request");
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output1),
+        "requesting output 1 for the request");
+
+    // Create the data for the two input tensors. Initialize the first
+    // to unique values and the second to all ones.
+    BytePointer input0_data;
+    BytePointer input1_data;
+    if (is_int[0]) {
+      IntPointer[] p0 = {null}, p1 = {null};
+      GenerateInputData(p0, p1);
+      input0_data = p0[0].getPointer(BytePointer.class);
+      input1_data = p1[0].getPointer(BytePointer.class);
+    } else {
+      FloatPointer[] p0 = {null}, p1 = {null};
+      GenerateInputData(p0, p1);
+      input0_data = p0[0].getPointer(BytePointer.class);
+      input1_data = p1[0].getPointer(BytePointer.class);
+    }
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestSetReleaseCallback(
-              irequest, inferRequestComplete, null /* request_release_userp */),
-          "setting request release callback");
+    long input0_size = input0_data.limit();
+    long input1_size = input1_data.limit();
 
-      // Inputs
-      String input0 = is_torch_model[0] ? "INPUT__0" : "INPUT0";
-      String input1 = is_torch_model[0] ? "INPUT__1" : "INPUT1";
+    Pointer input0_base = input0_data;
+    Pointer input1_base = input1_data;
 
-      long[] input0_shape = {1, 16};
-      long[] input1_shape = {1, 16};
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAppendInputData(
+            irequest, input0, input0_base, input0_size, requested_memory_type,
+            0 /* memory_type_id */),
+        "assigning INPUT0 data");
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAppendInputData(
+            irequest, input1, input1_base, input1_size, requested_memory_type,
+            0 /* memory_type_id */),
+        "assigning INPUT1 data");
 
-      int datatype =
-          (is_int[0]) ? TRITONSERVER_TYPE_INT32 : TRITONSERVER_TYPE_FP32;
+    // Perform inference...
+    {
+      CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
+      futures.put(irequest, completed);
 
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAddInput(
-              irequest, input0, datatype, input0_shape, input0_shape.length),
-          "setting input 0 meta-data for the request");
+          TRITONSERVER_InferenceRequestSetResponseCallback(
+              irequest, allocator, null /* response_allocator_userp */, inferResponseComplete,
+              irequest),
+          "setting response callback");
+
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAddInput(
-              irequest, input1, datatype, input1_shape, input1_shape.length),
-          "setting input 1 meta-data for the request");
+          TRITONSERVER_ServerInferAsync(server, irequest, null /* trace */), "running inference");
 
-      String output0 = is_torch_model[0] ? "OUTPUT__0" : "OUTPUT0";
-      String output1 = is_torch_model[0] ? "OUTPUT__1" : "OUTPUT1";
+      // Wait for the inference to complete.
+      TRITONSERVER_InferenceResponse completed_response = completed.get();
+      futures.remove(irequest);
 
+      FAIL_IF_ERR(TRITONSERVER_InferenceResponseError(completed_response), "response status");
+      if (check_accuracy) {
+        Check(
+            completed_response, input0_data, input1_data, output0, output1, input0_size, datatype,
+            is_int[0]);
+      }
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output0),
-          "requesting output 0 for the request");
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output1),
-          "requesting output 1 for the request");
+          TRITONSERVER_InferenceResponseDelete(completed_response), "deleting inference response");
+    }
 
-      // Create the data for the two input tensors. Initialize the first
-      // to unique values and the second to all ones.
-      BytePointer input0_data;
-      BytePointer input1_data;
+    // Modify some input data in place and then reuse the request
+    // object. For simplicity we only do this when the input tensors are
+    // in non-pinned system memory.
+    if (!enforce_memory_type || (requested_memory_type == TRITONSERVER_MEMORY_CPU)) {
       if (is_int[0]) {
-        IntPointer[] p0 = {null}, p1 = {null};
-        GenerateInputData(p0, p1);
-        input0_data = p0[0].getPointer(BytePointer.class);
-        input1_data = p1[0].getPointer(BytePointer.class);
+        new IntPointer(input0_data).put(0, 27);
       } else {
-        FloatPointer[] p0 = {null}, p1 = {null};
-        GenerateInputData(p0, p1);
-        input0_data = p0[0].getPointer(BytePointer.class);
-        input1_data = p1[0].getPointer(BytePointer.class);
+        new FloatPointer(input0_data).put(0, 27.0f);
       }
 
-      long input0_size = input0_data.limit();
-      long input1_size = input1_data.limit();
+      CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
+      futures.put(irequest, completed);
 
-      Pointer input0_base = input0_data;
-      Pointer input1_base = input1_data;
+      // Using a new promise so have to re-register the callback to set
+      // the promise as the userp.
+      FAIL_IF_ERR(
+          TRITONSERVER_InferenceRequestSetResponseCallback(
+              irequest, allocator, null /* response_allocator_userp */, inferResponseComplete,
+              irequest),
+          "setting response callback");
 
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAppendInputData(
-              irequest, input0, input0_base, input0_size, requested_memory_type,
-              0 /* memory_type_id */),
-          "assigning INPUT0 data");
+          TRITONSERVER_ServerInferAsync(server, irequest, null /* trace */), "running inference");
+
+      // Wait for the inference to complete.
+      TRITONSERVER_InferenceResponse completed_response = completed.get();
+      futures.remove(irequest);
+      FAIL_IF_ERR(TRITONSERVER_InferenceResponseError(completed_response), "response status");
+      if (check_accuracy) {
+        Check(
+            completed_response, input0_data, input1_data, output0, output1, input0_size, datatype,
+            is_int[0]);
+      }
+
+      FAIL_IF_ERR(
+          TRITONSERVER_InferenceResponseDelete(completed_response), "deleting inference response");
+    }
+
+    // Remove input data and then add back different data.
+    {
+      FAIL_IF_ERR(
+          TRITONSERVER_InferenceRequestRemoveAllInputData(irequest, input0),
+          "removing INPUT0 data");
       FAIL_IF_ERR(
           TRITONSERVER_InferenceRequestAppendInputData(
-              irequest, input1, input1_base, input1_size, requested_memory_type,
+              irequest, input0, input1_base, input1_size, requested_memory_type,
               0 /* memory_type_id */),
-          "assigning INPUT1 data");
-
-      // Perform inference...
-      {
-        CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
-        futures.put(irequest, completed);
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceRequestSetResponseCallback(
-                irequest, allocator, null /* response_allocator_userp */,
-                inferResponseComplete, irequest),
-            "setting response callback");
-
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerInferAsync(
-                server, irequest, null /* trace */),
-            "running inference");
-
-        // Wait for the inference to complete.
-        TRITONSERVER_InferenceResponse completed_response = completed.get();
-        futures.remove(irequest);
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseError(completed_response),
-            "response status");
-        if (check_accuracy) {
-          Check(
-              completed_response, input0_data, input1_data, output0, output1,
-              input0_size, datatype, is_int[0]);
-        }
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseDelete(completed_response),
-            "deleting inference response");
-      }
-
-      // Modify some input data in place and then reuse the request
-      // object. For simplicity we only do this when the input tensors are
-      // in non-pinned system memory.
-      if (!enforce_memory_type ||
-          (requested_memory_type == TRITONSERVER_MEMORY_CPU)) {
-        if (is_int[0]) {
-          new IntPointer(input0_data).put(0, 27);
-        } else {
-          new FloatPointer(input0_data).put(0, 27.0f);
-        }
-
-        CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
-        futures.put(irequest, completed);
-
-        // Using a new promise so have to re-register the callback to set
-        // the promise as the userp.
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceRequestSetResponseCallback(
-                irequest, allocator, null /* response_allocator_userp */,
-                inferResponseComplete, irequest),
-            "setting response callback");
-
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerInferAsync(
-                server, irequest, null /* trace */),
-            "running inference");
-
-        // Wait for the inference to complete.
-        TRITONSERVER_InferenceResponse completed_response = completed.get();
-        futures.remove(irequest);
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseError(completed_response),
-            "response status");
-        if (check_accuracy) {
-          Check(
-              completed_response, input0_data, input1_data, output0, output1,
-              input0_size, datatype, is_int[0]);
-        }
+          "assigning INPUT1 data to INPUT0");
 
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseDelete(completed_response),
-            "deleting inference response");
-      }
-
-      // Remove input data and then add back different data.
-      {
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceRequestRemoveAllInputData(irequest, input0),
-            "removing INPUT0 data");
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceRequestAppendInputData(
-                irequest, input0, input1_base, input1_size, requested_memory_type,
-                0 /* memory_type_id */),
-            "assigning INPUT1 data to INPUT0");
-
-        CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
-        futures.put(irequest, completed);
-
-        // Using a new promise so have to re-register the callback to set
-        // the promise as the userp.
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceRequestSetResponseCallback(
-                irequest, allocator, null /* response_allocator_userp */,
-                inferResponseComplete, irequest),
-            "setting response callback");
-
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerInferAsync(
-                server, irequest, null /* trace */),
-            "running inference");
-
-        // Wait for the inference to complete.
-        TRITONSERVER_InferenceResponse completed_response = completed.get();
-        futures.remove(irequest);
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseError(completed_response),
-            "response status");
-
-        if (check_accuracy) {
-          // Both inputs are using input1_data...
-          Check(
-              completed_response, input1_data, input1_data, output0, output1,
-              input0_size, datatype, is_int[0]);
-        }
+      CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
+      futures.put(irequest, completed);
 
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseDelete(completed_response),
-            "deleting inference response");
-      }
+      // Using a new promise so have to re-register the callback to set
+      // the promise as the userp.
+      FAIL_IF_ERR(
+          TRITONSERVER_InferenceRequestSetResponseCallback(
+              irequest, allocator, null /* response_allocator_userp */, inferResponseComplete,
+              irequest),
+          "setting response callback");
 
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestDelete(irequest),
-          "deleting inference request");
+          TRITONSERVER_ServerInferAsync(server, irequest, null /* trace */), "running inference");
+
+      // Wait for the inference to complete.
+      TRITONSERVER_InferenceResponse completed_response = completed.get();
+      futures.remove(irequest);
+      FAIL_IF_ERR(TRITONSERVER_InferenceResponseError(completed_response), "response status");
+
+      if (check_accuracy) {
+        // Both inputs are using input1_data...
+        Check(
+            completed_response, input1_data, input1_data, output0, output1, input0_size, datatype,
+            is_int[0]);
+      }
 
       FAIL_IF_ERR(
-          TRITONSERVER_ResponseAllocatorDelete(allocator),
-          "deleting response allocator");
+          TRITONSERVER_InferenceResponseDelete(completed_response), "deleting inference response");
     }
 
-    public static void
-    main(String[] args) throws Exception
-    {
-      int num_iterations = 1000000;
-      String model_repository_path = null;
-      int verbose_level = 0;
-      boolean check_accuracy = false;
-
-      // Parse commandline...
-      for (int i = 0; i < args.length; i++) {
-        switch (args[i]) {
-          case "-i":
-            i++;
-            try {
-              num_iterations = Integer.parseInt(args[i]);
-            } catch (NumberFormatException e){
-              Usage(
-                  "-i must be used to specify number of iterations");
-            }
-            break;
-          case "-m":
-            enforce_memory_type = true;
-            i++;
-            if (args[i].equals("system")) {
-              requested_memory_type = TRITONSERVER_MEMORY_CPU;
-            } else if (args[i].equals("pinned")) {
-              requested_memory_type = TRITONSERVER_MEMORY_CPU_PINNED;
-            } else if (args[i].equals("gpu")) {
-              requested_memory_type = TRITONSERVER_MEMORY_GPU;
-            } else {
-              Usage(
-                  "-m must be used to specify one of the following types:" +
-                  " <\"system\"|\"pinned\"|gpu>");
-            }
-            break;
-          case "-r":
-            model_repository_path = args[++i];
-            break;
-          case "-v":
-            verbose_level = 1;
-            break;
-          case "-c":
-            check_accuracy = true;
-            break;
-          case "-?":
-            Usage(null);
-            break;
-          case "--max-growth":
-            i++;
-            try {
-              max_growth_allowed = Integer.parseInt(args[i]) / 100.0f;
-            } catch (NumberFormatException e){
-              Usage(
-                  "--max-growth must be an integer value specifying allowed memory growth (%)");
-            }
-            break;
-          case "--max-memory":
-            i++;
-            try {
-              max_mem_allowed = Integer.parseInt(args[i]);
-            } catch (NumberFormatException e){
-              Usage(
-                  "--max-memory must be an integer value specifying maximum allowed memory (MB)");
-            }
-            break;
-        }
-      }
+    FAIL_IF_ERR(TRITONSERVER_InferenceRequestDelete(irequest), "deleting inference request");
+
+    FAIL_IF_ERR(TRITONSERVER_ResponseAllocatorDelete(allocator), "deleting response allocator");
+  }
+
+  public static void main(String[] args) throws Exception
+  {
+    int num_iterations = 1000000;
+    String model_repository_path = null;
+    int verbose_level = 0;
+    boolean check_accuracy = false;
 
-      if (model_repository_path == null) {
-        Usage("-r must be used to specify model repository path");
+    // Parse commandline...
+    for (int i = 0; i < args.length; i++) {
+      switch (args[i]) {
+        case "-i":
+          i++;
+          try {
+            num_iterations = Integer.parseInt(args[i]);
+          }
+          catch (NumberFormatException e) {
+            Usage("-i must be used to specify number of iterations");
+          }
+          break;
+        case "-m":
+          enforce_memory_type = true;
+          i++;
+          if (args[i].equals("system")) {
+            requested_memory_type = TRITONSERVER_MEMORY_CPU;
+          } else if (args[i].equals("pinned")) {
+            requested_memory_type = TRITONSERVER_MEMORY_CPU_PINNED;
+          } else if (args[i].equals("gpu")) {
+            requested_memory_type = TRITONSERVER_MEMORY_GPU;
+          } else {
+            Usage(
+                "-m must be used to specify one of the following types:"
+                + " <\"system\"|\"pinned\"|gpu>");
+          }
+          break;
+        case "-r":
+          model_repository_path = args[++i];
+          break;
+        case "-v":
+          verbose_level = 1;
+          break;
+        case "-c":
+          check_accuracy = true;
+          break;
+        case "-?":
+          Usage(null);
+          break;
+        case "--max-growth":
+          i++;
+          try {
+            max_growth_allowed = Integer.parseInt(args[i]) / 100.0f;
+          }
+          catch (NumberFormatException e) {
+            Usage("--max-growth must be an integer value specifying allowed memory growth (%)");
+          }
+          break;
+        case "--max-memory":
+          i++;
+          try {
+            max_mem_allowed = Integer.parseInt(args[i]);
+          }
+          catch (NumberFormatException e) {
+            Usage("--max-memory must be an integer value specifying maximum allowed memory (MB)");
+          }
+          break;
       }
-      if (enforce_memory_type && requested_memory_type != TRITONSERVER_MEMORY_CPU) {
-        Usage("-m can only be set to \"system\" without enabling GPU");
+    }
+
+    if (model_repository_path == null) {
+      Usage("-r must be used to specify model repository path");
+    }
+    if (enforce_memory_type && requested_memory_type != TRITONSERVER_MEMORY_CPU) {
+      Usage("-m can only be set to \"system\" without enabling GPU");
+    }
+
+    // Check API version.
+    int[] api_version_major = {0}, api_version_minor = {0};
+    FAIL_IF_ERR(
+        TRITONSERVER_ApiVersion(api_version_major, api_version_minor),
+        "getting Triton API version");
+    if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major[0])
+        || (TRITONSERVER_API_VERSION_MINOR > api_version_minor[0])) {
+      FAIL("triton server API version mismatch");
+    }
+
+    // Create the server...
+    TRITONSERVER_ServerOptions server_options = new TRITONSERVER_ServerOptions(null);
+    FAIL_IF_ERR(TRITONSERVER_ServerOptionsNew(server_options), "creating server options");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetModelRepositoryPath(server_options, model_repository_path),
+        "setting model repository path");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetLogVerbose(server_options, verbose_level),
+        "setting verbose logging level");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetBackendDirectory(server_options, "/opt/tritonserver/backends"),
+        "setting backend directory");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
+            server_options, "/opt/tritonserver/repoagents"),
+        "setting repository agent directory");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetStrictModelConfig(server_options, true),
+        "setting strict model configuration");
+    double min_compute_capability = TRITON_MIN_COMPUTE_CAPABILITY;
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability(
+            server_options, min_compute_capability),
+        "setting minimum supported CUDA compute capability");
+
+    TRITONSERVER_Server server_ptr = new TRITONSERVER_Server(null);
+    FAIL_IF_ERR(TRITONSERVER_ServerNew(server_ptr, server_options), "creating server");
+    FAIL_IF_ERR(TRITONSERVER_ServerOptionsDelete(server_options), "deleting server options");
+
+    TRITONSERVER_ServerDeleter server = new TRITONSERVER_ServerDeleter(server_ptr);
+
+    // Wait until the server is both live and ready.
+    int health_iters = 0;
+    while (true) {
+      boolean[] live = {false}, ready = {false};
+      FAIL_IF_ERR(TRITONSERVER_ServerIsLive(server, live), "unable to get server liveness");
+      FAIL_IF_ERR(TRITONSERVER_ServerIsReady(server, ready), "unable to get server readiness");
+      System.out.println("Server Health: live " + live[0] + ", ready " + ready[0]);
+      if (live[0] && ready[0]) {
+        break;
       }
 
-      // Check API version.
-      int[] api_version_major = {0}, api_version_minor = {0};
-      FAIL_IF_ERR(
-          TRITONSERVER_ApiVersion(api_version_major, api_version_minor),
-          "getting Triton API version");
-      if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major[0]) ||
-          (TRITONSERVER_API_VERSION_MINOR > api_version_minor[0])) {
-        FAIL("triton server API version mismatch");
+      if (++health_iters >= 10) {
+        FAIL("failed to find healthy inference server");
       }
 
-      // Create the server...
-      TRITONSERVER_ServerOptions server_options = new TRITONSERVER_ServerOptions(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsNew(server_options),
-          "creating server options");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetModelRepositoryPath(
-              server_options, model_repository_path),
-          "setting model repository path");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetLogVerbose(server_options, verbose_level),
-          "setting verbose logging level");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetBackendDirectory(
-              server_options, "/opt/tritonserver/backends"),
-          "setting backend directory");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
-              server_options, "/opt/tritonserver/repoagents"),
-          "setting repository agent directory");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetStrictModelConfig(server_options, true),
-          "setting strict model configuration");
-      double min_compute_capability = TRITON_MIN_COMPUTE_CAPABILITY;
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability(
-              server_options, min_compute_capability),
-          "setting minimum supported CUDA compute capability");
+      Thread.sleep(500);
+    }
 
-      TRITONSERVER_Server server_ptr = new TRITONSERVER_Server(null);
+    // Print status of the server.
+    {
+      TRITONSERVER_Message server_metadata_message = new TRITONSERVER_Message(null);
       FAIL_IF_ERR(
-          TRITONSERVER_ServerNew(server_ptr, server_options), "creating server");
+          TRITONSERVER_ServerMetadata(server, server_metadata_message),
+          "unable to get server metadata message");
+      BytePointer buffer = new BytePointer((Pointer) null);
+      SizeTPointer byte_size = new SizeTPointer(1);
       FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsDelete(server_options),
-          "deleting server options");
-
-      TRITONSERVER_ServerDeleter server = new TRITONSERVER_ServerDeleter(server_ptr);
-
-      // Wait until the server is both live and ready.
-      int health_iters = 0;
-      while (true) {
-        boolean[] live = {false}, ready = {false};
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerIsLive(server, live),
-            "unable to get server liveness");
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerIsReady(server, ready),
-            "unable to get server readiness");
-        System.out.println("Server Health: live " + live[0] + ", ready " + ready[0]);
-        if (live[0] && ready[0]) {
-          break;
-        }
+          TRITONSERVER_MessageSerializeToJson(server_metadata_message, buffer, byte_size),
+          "unable to serialize server metadata message");
+
+      System.out.println("Server Status:");
+      System.out.println(buffer.limit(byte_size.get()).getString());
+
+      FAIL_IF_ERR(TRITONSERVER_MessageDelete(server_metadata_message), "deleting status metadata");
+    }
+
+    String model_name = "simple";
 
+    // Wait for the model to become available.
+    boolean[] is_torch_model = {false};
+    boolean[] is_int = {true};
+    boolean[] is_ready = {false};
+    health_iters = 0;
+    while (!is_ready[0]) {
+      FAIL_IF_ERR(
+          TRITONSERVER_ServerModelIsReady(server, model_name, 1, is_ready),
+          "unable to get model readiness");
+      if (!is_ready[0]) {
         if (++health_iters >= 10) {
-          FAIL("failed to find healthy inference server");
+          FAIL("model failed to be ready in 10 iterations");
         }
-
         Thread.sleep(500);
+        continue;
       }
 
-      // Print status of the server.
-      {
-        TRITONSERVER_Message server_metadata_message = new TRITONSERVER_Message(null);
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerMetadata(server, server_metadata_message),
-            "unable to get server metadata message");
-        BytePointer buffer = new BytePointer((Pointer)null);
-        SizeTPointer byte_size = new SizeTPointer(1);
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageSerializeToJson(
-                server_metadata_message, buffer, byte_size),
-            "unable to serialize server metadata message");
-
-        System.out.println("Server Status:");
-        System.out.println(buffer.limit(byte_size.get()).getString());
-
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageDelete(server_metadata_message),
-            "deleting status metadata");
-      }
-
-      String model_name = "simple";
-
-      // Wait for the model to become available.
-      boolean[] is_torch_model = {false};
-      boolean[] is_int = {true};
-      boolean[] is_ready = {false};
-      health_iters = 0;
-      while (!is_ready[0]) {
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerModelIsReady(
-                server, model_name, 1, is_ready),
-            "unable to get model readiness");
-        if (!is_ready[0]) {
-          if (++health_iters >= 10) {
-            FAIL("model failed to be ready in 10 iterations");
-          }
-          Thread.sleep(500);
-          continue;
-        }
+      TRITONSERVER_Message model_metadata_message = new TRITONSERVER_Message(null);
+      FAIL_IF_ERR(
+          TRITONSERVER_ServerModelMetadata(server, model_name, 1, model_metadata_message),
+          "unable to get model metadata message");
+      BytePointer buffer = new BytePointer((Pointer) null);
+      SizeTPointer byte_size = new SizeTPointer(1);
+      FAIL_IF_ERR(
+          TRITONSERVER_MessageSerializeToJson(model_metadata_message, buffer, byte_size),
+          "unable to serialize model status protobuf");
 
-        TRITONSERVER_Message model_metadata_message = new TRITONSERVER_Message(null);
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerModelMetadata(
-                server, model_name, 1, model_metadata_message),
-            "unable to get model metadata message");
-        BytePointer buffer = new BytePointer((Pointer)null);
-        SizeTPointer byte_size = new SizeTPointer(1);
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageSerializeToJson(
-                model_metadata_message, buffer, byte_size),
-            "unable to serialize model status protobuf");
-
-        JsonParser parser = new JsonParser();
-        JsonObject model_metadata = null;
-        try {
-          model_metadata = parser.parse(buffer.limit(byte_size.get()).getString()).getAsJsonObject();
-        } catch (Exception e) {
-          FAIL("error: failed to parse model metadata from JSON: " + e);
-        }
+      JsonParser parser = new JsonParser();
+      JsonObject model_metadata = null;
+      try {
+        model_metadata = parser.parse(buffer.limit(byte_size.get()).getString()).getAsJsonObject();
+      }
+      catch (Exception e) {
+        FAIL("error: failed to parse model metadata from JSON: " + e);
+      }
 
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageDelete(model_metadata_message),
-            "deleting status protobuf");
+      FAIL_IF_ERR(TRITONSERVER_MessageDelete(model_metadata_message), "deleting status protobuf");
 
-        if (!model_metadata.get("name").getAsString().equals(model_name)) {
-          FAIL("unable to find metadata for model");
-        }
+      if (!model_metadata.get("name").getAsString().equals(model_name)) {
+        FAIL("unable to find metadata for model");
+      }
 
-        boolean found_version = false;
-        if (model_metadata.has("versions")) {
-          for (JsonElement version : model_metadata.get("versions").getAsJsonArray()) {
-            if (version.getAsString().equals("1")) {
-              found_version = true;
-              break;
-            }
+      boolean found_version = false;
+      if (model_metadata.has("versions")) {
+        for (JsonElement version : model_metadata.get("versions").getAsJsonArray()) {
+          if (version.getAsString().equals("1")) {
+            found_version = true;
+            break;
           }
         }
-        if (!found_version) {
-          FAIL("unable to find version 1 status for model");
-        }
-
-        FAIL_IF_ERR(
-            ParseModelMetadata(model_metadata, is_int, is_torch_model),
-            "parsing model metadata");
+      }
+      if (!found_version) {
+        FAIL("unable to find version 1 status for model");
       }
 
-      Runnable runnable =
-        () -> {
-          boolean passed = ValidateMemoryGrowth(max_growth_allowed, max_mem_allowed);
-          
-          // Sleep to give the garbage collector time to free the server.
-          // This avoids race conditions between Triton bindings' printing and
-          // Java's native printing below.
-          try {
-            Thread.sleep(5000);
-          } catch (InterruptedException e){
-            System.out.println("Sleep interrupted: " + e.toString());
-          }
+      FAIL_IF_ERR(
+          ParseModelMetadata(model_metadata, is_int, is_torch_model), "parsing model metadata");
+    }
 
-          if(passed){
-            System.out.println("Memory growth test passed");
-          } else {
-            System.out.println("Memory growth test FAILED");
-          }
-        };
-      Thread memory_thread = new Thread(runnable);
-      memory_thread.start();
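+    // Check memory growth on a background thread while the main thread runs the inference loop.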
+    Runnable runnable = () ->
+    {
+      boolean passed = ValidateMemoryGrowth(max_growth_allowed, max_mem_allowed);
 
-      for(int i = 0; i < num_iterations; i++){
-        try (PointerScope scope = new PointerScope()) {
-          RunInference(server, model_name, is_int, is_torch_model, check_accuracy);
-        }
+      // Sleep to give the garbage collector time to free the server.
+      // This avoids race conditions between Triton bindings' printing and
+      // Java's native printing below.
+      try {
+        Thread.sleep(5000);
+      }
+      catch (InterruptedException e) {
+        System.out.println("Sleep interrupted: " + e.toString());
       }
-      done = true;
-      memory_thread.join();
 
-      System.exit(0);
+      if (passed) {
+        System.out.println("Memory growth test passed");
+      } else {
+        System.out.println("Memory growth test FAILED");
+      }
+    };
+    Thread memory_thread = new Thread(runnable);
+    memory_thread.start();
+
+    for (int i = 0; i < num_iterations; i++) {
+      try (PointerScope scope = new PointerScope()) {
+        RunInference(server, model_name, is_int, is_torch_model, check_accuracy);
+      }
     }
+    done = true;
+    memory_thread.join();
+
+    System.exit(0);
+  }
 }
diff --git a/qa/L0_java_memory_growth/test.sh b/qa/L0_java_memory_growth/test.sh
index 610315d34e..1011ec0633 100755
--- a/qa/L0_java_memory_growth/test.sh
+++ b/qa/L0_java_memory_growth/test.sh
@@ -76,7 +76,7 @@ fi
 LOG_IDX=$((LOG_IDX+1))
 CLIENT_LOG="./client_$LOG_IDX.log"
 
-# Longer-running memory growth test 
+# Longer-running memory growth test
 ITERS=1000000
 MAX_MEM_GROWTH_MB=10
 if [ "$TRITON_PERF_LONG" == 1 ]; then
diff --git a/qa/L0_java_resnet/ResnetTest.java b/qa/L0_java_resnet/ResnetTest.java
index 9bf46b22f7..e9f353cf62 100644
--- a/qa/L0_java_resnet/ResnetTest.java
+++ b/qa/L0_java_resnet/ResnetTest.java
@@ -24,593 +24,563 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import static org.bytedeco.tritonserver.global.tritonserver.*;
+
+import com.google.gson.*;
 import java.io.*;
 import java.util.*;
 import java.util.concurrent.*;
-import com.google.gson.*;
 import org.bytedeco.javacpp.*;
 import org.bytedeco.tritonserver.tritonserver.*;
-import static org.bytedeco.tritonserver.global.tritonserver.*;
 
 public class ResnetTest {
-    // Maximum allowed difference from expected model outputs
-    private static final float ALLOWED_DELTA = .001f;
-    private static final String[] MODELS = {
-      "resnet50_fp32_libtorch",
-      "resnet50_fp32_onnx",
+  // Maximum allowed difference from expected model outputs
+  private static final float ALLOWED_DELTA = .001f;
+  private static final String[] MODELS = {
+      "resnet50_fp32_libtorch", "resnet50_fp32_onnx",
       // TODO: fix build to support GPU only resnet50v1.5_fp16_savedmodel
       //"resnet50v1.5_fp16_savedmodel",
-      };
-    private static final double TRITON_MIN_COMPUTE_CAPABILITY = 6.0;
-    private enum Backend {
-      NONE,
-      ONNX,
-      TF,
-      TORCH,
+  };
+  private static final double TRITON_MIN_COMPUTE_CAPABILITY = 6.0;
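+  // Framework backend inferred from the model name.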
+  private enum Backend {
+    NONE,
+    ONNX,
+    TF,
+    TORCH,
+  }
+
+  static void FAIL(String MSG)
+  {
+    System.err.println("failure: " + MSG);
+    System.exit(1);
+  }
+
+  static void FAIL_IF_ERR(TRITONSERVER_Error err__, String MSG)
+  {
+    if (err__ != null) {
+      System.err.println(
+          "error: " + MSG + ":" + TRITONSERVER_ErrorCodeString(err__) + " - "
+          + TRITONSERVER_ErrorMessage(err__));
+      TRITONSERVER_ErrorDelete(err__);
+      System.exit(1);
     }
+  }
+
+  static boolean enforce_memory_type = false;
+  static int requested_memory_type;
 
-    static void FAIL(String MSG) {
-        System.err.println("failure: " + MSG);
-        System.exit(1);
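+  // Wrapper that registers a deallocator so the native server handle is released
+  // automatically when this object is garbage collected.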
+  static class TRITONSERVER_ServerDeleter extends TRITONSERVER_Server {
+    public TRITONSERVER_ServerDeleter(TRITONSERVER_Server p)
+    {
+      super(p);
+      deallocator(new DeleteDeallocator(this));
+    }
+    protected static class DeleteDeallocator extends TRITONSERVER_Server implements Deallocator {
+      DeleteDeallocator(Pointer p) { super(p); }
+      @Override public void deallocate() { TRITONSERVER_ServerDelete(this); }
     }
+  }
 
-    static void FAIL_IF_ERR(TRITONSERVER_Error err__, String MSG) {
-        if (err__ != null) {
-            System.err.println("error: " + MSG + ":"
-                             + TRITONSERVER_ErrorCodeString(err__) + " - "
-                             + TRITONSERVER_ErrorMessage(err__));
-            TRITONSERVER_ErrorDelete(err__);
-            System.exit(1);
-        }
+  static void Usage(String msg)
+  {
+    if (msg != null) {
+      System.err.println(msg);
     }
 
-    static boolean enforce_memory_type = false;
-    static int requested_memory_type;
+    System.err.println("Usage: java " + ResnetTest.class.getSimpleName() + " [options]");
+    System.err.println(
+        "\t-m <\"system\"|\"pinned\"|gpu>"
+        + " Enforce the memory type for input and output tensors."
+        + " If not specified, inputs will be in system memory and outputs"
+        + " will be based on the model's preferred type.");
+    System.err.println("\t-v Enable verbose logging");
+    System.err.println("\t-r [model repository absolute path]");
+
+    System.exit(1);
+  }
+
+  static class ResponseAlloc extends TRITONSERVER_ResponseAllocatorAllocFn_t {
+    @Override
+    public TRITONSERVER_Error call(
+        TRITONSERVER_ResponseAllocator allocator, String tensor_name, long byte_size,
+        int preferred_memory_type, long preferred_memory_type_id, Pointer userp,
+        PointerPointer buffer, PointerPointer buffer_userp, IntPointer actual_memory_type,
+        LongPointer actual_memory_type_id)
+    {
+      // Initially attempt to make the actual memory type and id that we
+      // allocate be the same as preferred memory type
+      actual_memory_type.put(0, preferred_memory_type);
+      actual_memory_type_id.put(0, preferred_memory_type_id);
+
+      // If 'byte_size' is zero just return 'buffer' == nullptr, we don't
+      // need to do any other book-keeping.
+      if (byte_size == 0) {
+        buffer.put(0, null);
+        buffer_userp.put(0, null);
+        System.out.println("allocated " + byte_size + " bytes for result tensor " + tensor_name);
+      } else {
+        Pointer allocated_ptr = new Pointer();
+        if (enforce_memory_type) {
+          actual_memory_type.put(0, requested_memory_type);
+        }
 
-    static class TRITONSERVER_ServerDeleter extends TRITONSERVER_Server {
-        public TRITONSERVER_ServerDeleter(TRITONSERVER_Server p) { super(p); deallocator(new DeleteDeallocator(this)); }
-        protected static class DeleteDeallocator extends TRITONSERVER_Server implements Deallocator {
-            DeleteDeallocator(Pointer p) { super(p); }
-            @Override public void deallocate() { TRITONSERVER_ServerDelete(this); }
+        actual_memory_type.put(0, TRITONSERVER_MEMORY_CPU);
+        allocated_ptr = Pointer.malloc(byte_size);
+
+        // Pass the tensor name with buffer_userp so we can show it when
+        // releasing the buffer.
+        if (!allocated_ptr.isNull()) {
+          buffer.put(0, allocated_ptr);
+          buffer_userp.put(0, Loader.newGlobalRef(tensor_name));
+          System.out.println(
+              "allocated " + byte_size + " bytes in "
+              + TRITONSERVER_MemoryTypeString(actual_memory_type.get()) + " for result tensor "
+              + tensor_name);
         }
+      }
+
+      return null; // Success
     }
+  }
 
-    static void
-    Usage(String msg)
+  static class ResponseRelease extends TRITONSERVER_ResponseAllocatorReleaseFn_t {
+    @Override
+    public TRITONSERVER_Error call(
+        TRITONSERVER_ResponseAllocator allocator, Pointer buffer, Pointer buffer_userp,
+        long byte_size, int memory_type, long memory_type_id)
     {
-      if (msg != null) {
-        System.err.println(msg);
+      String name = null;
+      if (buffer_userp != null) {
+        name = (String) Loader.accessGlobalRef(buffer_userp);
+      } else {
+        name = "";
       }
 
-      System.err.println("Usage: java " + ResnetTest.class.getSimpleName() + " [options]");
-      System.err.println("\t-m <\"system\"|\"pinned\"|gpu>"
-                       + " Enforce the memory type for input and output tensors."
-                       + " If not specified, inputs will be in system memory and outputs"
-                       + " will be based on the model's preferred type.");
-      System.err.println("\t-v Enable verbose logging");
-      System.err.println("\t-r [model repository absolute path]");
+      Pointer.free(buffer);
+      Loader.deleteGlobalRef(buffer_userp);
 
-      System.exit(1);
+      return null; // Success
     }
+  }
 
-    static class ResponseAlloc extends TRITONSERVER_ResponseAllocatorAllocFn_t {
-        @Override public TRITONSERVER_Error call (
-            TRITONSERVER_ResponseAllocator allocator, String tensor_name,
-            long byte_size, int preferred_memory_type,
-            long preferred_memory_type_id, Pointer userp, PointerPointer buffer,
-            PointerPointer buffer_userp, IntPointer actual_memory_type,
-            LongPointer actual_memory_type_id)
-        {
-          // Initially attempt to make the actual memory type and id that we
-          // allocate be the same as preferred memory type
-          actual_memory_type.put(0, preferred_memory_type);
-          actual_memory_type_id.put(0, preferred_memory_type_id);
-
-          // If 'byte_size' is zero just return 'buffer' == nullptr, we don't
-          // need to do any other book-keeping.
-          if (byte_size == 0) {
-            buffer.put(0, null);
-            buffer_userp.put(0, null);
-            System.out.println("allocated " + byte_size + " bytes for result tensor " + tensor_name);
-          } else {
-            Pointer allocated_ptr = new Pointer();
-            if (enforce_memory_type) {
-              actual_memory_type.put(0, requested_memory_type);
-            }
-
-            actual_memory_type.put(0, TRITONSERVER_MEMORY_CPU);
-            allocated_ptr = Pointer.malloc(byte_size);
-
-            // Pass the tensor name with buffer_userp so we can show it when
-            // releasing the buffer.
-            if (!allocated_ptr.isNull()) {
-              buffer.put(0, allocated_ptr);
-              buffer_userp.put(0, Loader.newGlobalRef(tensor_name));
-              System.out.println("allocated " + byte_size + " bytes in "
-                               + TRITONSERVER_MemoryTypeString(actual_memory_type.get())
-                               + " for result tensor " + tensor_name);
-            }
-          }
-
-          return null;  // Success
-        }
+  static class InferRequestComplete extends TRITONSERVER_InferenceRequestReleaseFn_t {
+    @Override public void call(TRITONSERVER_InferenceRequest request, int flags, Pointer userp)
+    {
+      // We reuse the request so we don't delete it here.
     }
+  }
 
-    static class ResponseRelease extends TRITONSERVER_ResponseAllocatorReleaseFn_t {
-        @Override public TRITONSERVER_Error call (
-            TRITONSERVER_ResponseAllocator allocator, Pointer buffer, Pointer buffer_userp,
-            long byte_size, int memory_type, long memory_type_id)
-        {
-          String name = null;
-          if (buffer_userp != null) {
-            name = (String)Loader.accessGlobalRef(buffer_userp);
-          } else {
-            name = "";
-          }
-          
-          Pointer.free(buffer);
-          Loader.deleteGlobalRef(buffer_userp);
-
-          return null;  // Success
-        }
+  static class InferResponseComplete extends TRITONSERVER_InferenceResponseCompleteFn_t {
+    @Override public void call(TRITONSERVER_InferenceResponse response, int flags, Pointer userp)
+    {
+      if (response != null) {
+        // Send 'response' to the future.
+        futures.get(userp).complete(response);
+      }
     }
-
-    static class InferRequestComplete extends TRITONSERVER_InferenceRequestReleaseFn_t {
-        @Override public void call (
-            TRITONSERVER_InferenceRequest request, int flags, Pointer userp)
-        {
-          // We reuse the request so we don't delete it here.
-        }
+  }
+
+  static ConcurrentHashMap<Pointer, CompletableFuture<TRITONSERVER_InferenceResponse>> futures =
+      new ConcurrentHashMap<>();
+  static ResponseAlloc responseAlloc = new ResponseAlloc();
+  static ResponseRelease responseRelease = new ResponseRelease();
+  static InferRequestComplete inferRequestComplete = new InferRequestComplete();
+  static InferResponseComplete inferResponseComplete = new InferResponseComplete();
+
+  static void GenerateInputData(FloatPointer[] input_data)
+  {
+    // Input size is 3 * 224 * 224
+    input_data[0] = new FloatPointer(150528);
+    for (int i = 0; i < 150528; ++i) {
+      input_data[0].put(i, 1);
     }
-
-    static class InferResponseComplete extends TRITONSERVER_InferenceResponseCompleteFn_t {
-        @Override public void call (
-            TRITONSERVER_InferenceResponse response, int flags, Pointer userp)
-        {
-          if (response != null) {
-            // Send 'response' to the future.
-            futures.get(userp).complete(response);
-          }
-        }
-    }
-
-    static ConcurrentHashMap<Pointer, CompletableFuture<TRITONSERVER_InferenceResponse>> futures = new ConcurrentHashMap<>();
-    static ResponseAlloc responseAlloc = new ResponseAlloc();
-    static ResponseRelease responseRelease = new ResponseRelease();
-    static InferRequestComplete inferRequestComplete = new InferRequestComplete();
-    static InferResponseComplete inferResponseComplete = new InferResponseComplete();
-
-    static void
-    GenerateInputData(
-        FloatPointer[] input_data)
-    {
-      // Input size is 3 * 224 * 224
-      input_data[0] = new FloatPointer(150528);
-      for (int i = 0; i < 150528; ++i) {
-        input_data[0].put(i, 1);
+  }
+
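+  // Compare the model output against the expected values; any element that differs
+  // by more than ALLOWED_DELTA is reported as a failure.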
+  static boolean AreValidResults(
+      String model_name, FloatPointer output, FloatPointer expected_output)
+  {
+    int output_length = model_name.contains("tensorflow") ? 1001 : 1000;
+    for (int i = 0; i < output_length; ++i) {
+      float difference = output.get(i) - expected_output.get(i);
+      if (difference > ALLOWED_DELTA) {
+        System.out.println(
+            model_name + "inference failure: unexpected output "
+            + "in " + model_name + ", index " + i);
+
+        System.out.println("Value: " + output.get(i) + ", expected " + expected_output.get(i));
+
+        return false; // Failure
       }
     }
+    return true; // Success
+  }
+
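+  // Validate a single inference response: verify the output name, shape, datatype and
+  // memory type, then compare the values to the expected output file for the backend.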
+  static void Check(
+      String model_name, Backend backend, TRITONSERVER_InferenceResponse response,
+      Pointer input_data, String output, int expected_datatype) throws Exception
+  {
+    HashMap<String, Pointer> output_data = new HashMap<>();
+
+    int[] output_count = {0};
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceResponseOutputCount(response, output_count),
+        "getting number of response outputs");
+    if (output_count[0] != 1) {
+      FAIL("expecting 1 response output, got " + output_count[0]);
+    }
 
-    static boolean
-    AreValidResults(
-        String model_name, FloatPointer output, FloatPointer expected_output)
-    {
-      int output_length = model_name.contains("tensorflow") ? 1001 : 1000;
-      for (int i = 0; i < output_length; ++i) {
-        float difference = output.get(i) - expected_output.get(i);
-        if (difference > ALLOWED_DELTA) {
-          System.out.println(model_name + "inference failure: unexpected output " +
-          "in " + model_name + ", index " + i);
+    for (int idx = 0; idx < output_count[0]; ++idx) {
+      BytePointer cname = new BytePointer((Pointer) null);
+      IntPointer datatype = new IntPointer(1);
+      LongPointer shape = new LongPointer((Pointer) null);
+      LongPointer dim_count = new LongPointer(1);
+      Pointer base = new Pointer();
+      SizeTPointer byte_size = new SizeTPointer(1);
+      IntPointer memory_type = new IntPointer(1);
+      LongPointer memory_type_id = new LongPointer(1);
+      Pointer userp = new Pointer();
 
-          System.out.println("Value: " + output.get(i) + ", expected " +
-          expected_output.get(i));
+      FAIL_IF_ERR(
+          TRITONSERVER_InferenceResponseOutput(
+              response, idx, cname, datatype, shape, dim_count, base, byte_size, memory_type,
+              memory_type_id, userp),
+          "getting output info");
 
-          return false; // Failure
-        }
+      if (cname.isNull()) {
+        FAIL("unable to get output name");
       }
-      return true; // Success
-    }
-
-    static void
-    Check(
-        String model_name, Backend backend,
-        TRITONSERVER_InferenceResponse response,
-        Pointer input_data, String output,
-        int expected_datatype) throws Exception
-    {
-      HashMap<String, Pointer> output_data = new HashMap<>();
 
-      int[] output_count = {0};
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceResponseOutputCount(response, output_count),
-          "getting number of response outputs");
-      if (output_count[0] != 1) {
-        FAIL("expecting 1 response output, got " + output_count[0]);
+      String name = cname.getString();
+      if (!name.equals(output)) {
+        FAIL("unexpected output '" + name + "'");
       }
 
-      for (int idx = 0; idx < output_count[0]; ++idx) {
-        BytePointer cname = new BytePointer((Pointer)null);
-        IntPointer datatype = new IntPointer(1);
-        LongPointer shape = new LongPointer((Pointer)null);
-        LongPointer dim_count = new LongPointer(1);
-        Pointer base = new Pointer();
-        SizeTPointer byte_size = new SizeTPointer(1);
-        IntPointer memory_type = new IntPointer(1);
-        LongPointer memory_type_id = new LongPointer(1);
-        Pointer userp = new Pointer();
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseOutput(
-                response, idx, cname, datatype, shape, dim_count, base,
-                byte_size, memory_type, memory_type_id, userp),
-            "getting output info");
-
-        if (cname.isNull()) {
-          FAIL("unable to get output name");
-        }
+      int output_length = backend == backend.TF ? 1001 : 1000;
 
-        String name = cname.getString();
-        if (!name.equals(output)) {
-          FAIL("unexpected output '" + name + "'");
-        }
+      if ((dim_count.get() != 2) || (shape.get(0) != 1) || shape.get(1) != output_length) {
+        FAIL("unexpected shape for '" + name + "'");
+      }
 
-        int output_length = backend == backend.TF ? 1001: 1000;
+      if (datatype.get() != expected_datatype) {
+        FAIL(
+            "unexpected datatype '" + TRITONSERVER_DataTypeString(datatype.get()) + "' for '" + name
+            + "'");
+      }
 
-        if ((dim_count.get() != 2) || (shape.get(0) != 1)
-        || shape.get(1) != output_length) {
-          FAIL("unexpected shape for '" + name + "'");
-        }
+      if (enforce_memory_type && (memory_type.get() != requested_memory_type)) {
+        FAIL(
+            "unexpected memory type, expected to be allocated in "
+            + TRITONSERVER_MemoryTypeString(requested_memory_type) + ", got "
+            + TRITONSERVER_MemoryTypeString(memory_type.get()) + ", id " + memory_type_id.get()
+            + " for " + name);
+      }
 
-        if (datatype.get() != expected_datatype) {
-          FAIL(
-              "unexpected datatype '" +
-              TRITONSERVER_DataTypeString(datatype.get()) + "' for '" +
-              name + "'");
-        }
+      // We make a copy of the data here... which we could avoid for
+      // performance reasons but ok for this simple example.
+      BytePointer odata = new BytePointer(byte_size.get());
+      output_data.put(name, odata);
+      odata.put(base.limit(byte_size.get()));
+    }
 
-        if (enforce_memory_type && (memory_type.get() != requested_memory_type)) {
-          FAIL(
-              "unexpected memory type, expected to be allocated in " +
-              TRITONSERVER_MemoryTypeString(requested_memory_type) +
-              ", got " + TRITONSERVER_MemoryTypeString(memory_type.get()) +
-              ", id " + memory_type_id.get() + " for " + name);
-        }
+    // Expected output for model
+    String file_name = "expected_output_data/expected_output_";
+    switch (backend) {
+      case ONNX:
+        file_name += "onnx";
+        break;
+      case TF:
+        file_name += "tensorflow";
+        break;
+      case TORCH:
+        file_name += "pytorch";
+        break;
+      default:
+        FAIL("Unsupported model type");
+        break;
+    }
+    file_name += ".txt";
 
-        // We make a copy of the data here... which we could avoid for
-        // performance reasons but ok for this simple example.
-        BytePointer odata = new BytePointer(byte_size.get());
-        output_data.put(name, odata);
-        odata.put(base.limit(byte_size.get()));
-      }
+    int output_length = backend == backend.TF ? 1001 : 1000;
+    FloatPointer expected_output = new FloatPointer(output_length);
 
-      // Expected output for model
-      String file_name = "expected_output_data/expected_output_";
-      switch (backend) {
-        case ONNX:
-          file_name += "onnx";
-          break;
-        case TF:
-          file_name += "tensorflow";
-          break;
-        case TORCH:
-          file_name += "pytorch";
-          break;
-        default:
-          FAIL("Unsupported model type");
-          break;
-      }
-      file_name += ".txt";
-      
-      int output_length = backend == backend.TF ? 1001: 1000;
-      FloatPointer expected_output = new FloatPointer(output_length);
-
-      try (Scanner scanner = new Scanner(new File(file_name))) {
-        for (int i = 0; i < output_length; ++i) {
-          expected_output.put(i, scanner.nextFloat());
-        } 
+    try (Scanner scanner = new Scanner(new File(file_name))) {
+      for (int i = 0; i < output_length; ++i) {
+        expected_output.put(i, scanner.nextFloat());
       }
+    }
 
-      boolean correct_results = AreValidResults(
-          model_name, new FloatPointer(output_data.get(output)),
-          expected_output);
+    boolean correct_results =
+        AreValidResults(model_name, new FloatPointer(output_data.get(output)), expected_output);
 
-      if(correct_results){
-        System.out.println(backend.name() + " test PASSED");
-      } else {
-        System.out.println(backend.name() + " test FAILED");
-      }
+    if (correct_results) {
+      System.out.println(backend.name() + " test PASSED");
+    } else {
+      System.out.println(backend.name() + " test FAILED");
+    }
+  }
+
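+  // Run one inference end to end for 'model_name': wait for model readiness, build the
+  // request, execute it asynchronously, and validate the response.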
+  static void PerformInference(TRITONSERVER_ServerDeleter server, String model_name)
+      throws Exception
+  {
+    // Get type of model
+    Backend backend = Backend.NONE;
+    if (model_name.contains("onnx")) {
+      backend = Backend.ONNX;
+    } else if (model_name.contains("savedmodel")) {
+      backend = Backend.TF;
+    } else if (model_name.contains("torch")) {
+      backend = Backend.TORCH;
+    } else {
+      FAIL(
+          "Supported model types (Onnx, TensorFlow, Torch) "
+          + "cannot be inferred from model name " + model_name);
     }
 
-    static void
-    PerformInference(
-      TRITONSERVER_ServerDeleter server, String model_name) throws Exception
-    {
-      // Get type of model
-      Backend backend = Backend.NONE;
-      if(model_name.contains("onnx")) {
-        backend = Backend.ONNX;
-      } else if (model_name.contains("savedmodel")) {
-        backend = Backend.TF;
-      } else if (model_name.contains("torch")) {
-        backend = Backend.TORCH;
-      } else {
-        FAIL("Supported model types (Onnx, TensorFlow, Torch) " +
-        "cannot be inferred from model name " + model_name);
-      }
-
-      // Wait for the model to become available.
-      boolean[] is_ready = {false};
-      int health_iters = 0;
-      while (!is_ready[0]) {
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerModelIsReady(
-                server, model_name, 1, is_ready),
-            "unable to get model readiness");
-        if (!is_ready[0]) {
-          if (++health_iters >= 10) {
-            FAIL(model_name + " model failed to be ready in 10 iterations");
-          }
-          Thread.sleep(500);
-          continue;
+    // Wait for the model to become available.
+    boolean[] is_ready = {false};
+    int health_iters = 0;
+    while (!is_ready[0]) {
+      FAIL_IF_ERR(
+          TRITONSERVER_ServerModelIsReady(server, model_name, 1, is_ready),
+          "unable to get model readiness");
+      if (!is_ready[0]) {
+        if (++health_iters >= 10) {
+          FAIL(model_name + " model failed to be ready in 10 iterations");
         }
+        Thread.sleep(500);
+        continue;
       }
+    }
 
-      // Create the allocator that will be used to allocate buffers for
-      // the result tensors.
-      TRITONSERVER_ResponseAllocator allocator = new TRITONSERVER_ResponseAllocator(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_ResponseAllocatorNew(
-              allocator, responseAlloc, responseRelease, null /* start_fn */),
-          "creating response allocator");
+    // Create the allocator that will be used to allocate buffers for
+    // the result tensors.
+    TRITONSERVER_ResponseAllocator allocator = new TRITONSERVER_ResponseAllocator(null);
+    FAIL_IF_ERR(
+        TRITONSERVER_ResponseAllocatorNew(
+            allocator, responseAlloc, responseRelease, null /* start_fn */),
+        "creating response allocator");
+
+    // Inference
+    TRITONSERVER_InferenceRequest irequest = new TRITONSERVER_InferenceRequest(null);
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestNew(irequest, server, model_name, -1 /* model_version */),
+        "creating inference request");
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestSetId(irequest, "my_request_id"),
+        "setting ID for the request");
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestSetReleaseCallback(
+            irequest, inferRequestComplete, null /* request_release_userp */),
+        "setting request release callback");
+
+
+    // Model inputs
+    String input = "";
+    String output = "";
+    long[] input_shape = {1, 224, 224, 3};
+
+    switch (backend) {
+      case ONNX:
+        input = "import/input:0";
+        output = "import/resnet_v1_50/predictions/Softmax:0";
+        break;
+      case TF:
+        input = "input";
+        output = "probabilities";
+        break;
+      case TORCH:
+        input = "INPUT__0";
+        input_shape[1] = 3;
+        input_shape[3] = 224;
+        output = "OUTPUT__0";
+        break;
+      default:
+        FAIL("Unsupported model type");
+        break;
+    }
 
-      // Inference
-      TRITONSERVER_InferenceRequest irequest = new TRITONSERVER_InferenceRequest(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestNew(
-              irequest, server, model_name, -1 /* model_version */),
-          "creating inference request");
+    int datatype = TRITONSERVER_TYPE_FP32;
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestSetId(irequest, "my_request_id"),
-          "setting ID for the request");
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAddInput(
+            irequest, input, datatype, input_shape, input_shape.length),
+        "setting input 0 meta-data for the request");
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestSetReleaseCallback(
-              irequest, inferRequestComplete, null /* request_release_userp */),
-          "setting request release callback");
-
-      
-      // Model inputs
-      String input = "";
-      String output = "";
-      long[] input_shape = {1, 224, 224, 3};
-
-      switch (backend) {
-        case ONNX:
-          input = "import/input:0";
-          output = "import/resnet_v1_50/predictions/Softmax:0";
-          break;
-        case TF:
-          input = "input";
-          output = "probabilities";
-          break;
-        case TORCH:
-          input = "INPUT__0";
-          input_shape[1] = 3;
-          input_shape[3] = 224;
-          output = "OUTPUT__0";
-          break;
-        default:
-          FAIL("Unsupported model type");
-          break;
-      }
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output),
+        "requesting output 0 for the request");
 
-      int datatype = TRITONSERVER_TYPE_FP32;
+    // Create the data for the input tensor; GenerateInputData fills it with ones.
+    BytePointer input_data;
+    FloatPointer[] p0 = {null};
+    GenerateInputData(p0);
+    input_data = p0[0].getPointer(BytePointer.class);
+    long input_size = input_data.limit();
+    Pointer input_base = input_data;
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAddInput(
-              irequest, input, datatype, input_shape, input_shape.length),
-          "setting input 0 meta-data for the request");
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAppendInputData(
+            irequest, input, input_base, input_size, requested_memory_type, 0 /* memory_type_id */),
+        "assigning INPUT data");
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output),
-          "requesting output 0 for the request");
-
-      // Create the data for the two input tensors. Initialize the first
-      // to unique values and the second to all ones.
-      BytePointer input_data;
-      FloatPointer[] p0 = {null};
-      GenerateInputData(p0);
-      input_data = p0[0].getPointer(BytePointer.class);
-      long input_size = input_data.limit();
-      Pointer input_base = input_data;
+    // Perform inference...
+    {
+      CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
+      futures.put(irequest, completed);
 
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAppendInputData(
-              irequest, input, input_base, input_size, requested_memory_type,
-              0 /* memory_type_id */),
-          "assigning INPUT data");
-
-      // Perform inference...
-      {
-        CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
-        futures.put(irequest, completed);
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceRequestSetResponseCallback(
-                irequest, allocator, null /* response_allocator_userp */,
-                inferResponseComplete, irequest),
-            "setting response callback");
-
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerInferAsync(
-                server, irequest, null /* trace */),
-            "running inference");
-
-        // Wait for the inference to complete.
-        TRITONSERVER_InferenceResponse completed_response = completed.get();
-        futures.remove(irequest);
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseError(completed_response),
-            "response status");
-
-        Check(
-            model_name, backend, completed_response, input_data, output, datatype);
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseDelete(completed_response),
-            "deleting inference response");
-      }
+          TRITONSERVER_InferenceRequestSetResponseCallback(
+              irequest, allocator, null /* response_allocator_userp */, inferResponseComplete,
+              irequest),
+          "setting response callback");
 
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestDelete(irequest),
-          "deleting inference request");
+          TRITONSERVER_ServerInferAsync(server, irequest, null /* trace */), "running inference");
+
+      // Wait for the inference to complete.
+      TRITONSERVER_InferenceResponse completed_response = completed.get();
+      futures.remove(irequest);
+
+      FAIL_IF_ERR(TRITONSERVER_InferenceResponseError(completed_response), "response status");
+
+      Check(model_name, backend, completed_response, input_data, output, datatype);
 
       FAIL_IF_ERR(
-          TRITONSERVER_ResponseAllocatorDelete(allocator),
-          "deleting response allocator");
+          TRITONSERVER_InferenceResponseDelete(completed_response), "deleting inference response");
     }
-    
-    public static void
-    main(String[] args) throws Exception
-    {
-      String model_repository_path = null;
-      int verbose_level = 0;
-
-      // Parse commandline...
-      for (int i = 0; i < args.length; i++) {
-        switch (args[i]) {
-          case "-m": {
-            enforce_memory_type = true;
-            i++;
-            if (args[i].equals("system")) {
-              requested_memory_type = TRITONSERVER_MEMORY_CPU;
-            } else if (args[i].equals("pinned")) {
-              requested_memory_type = TRITONSERVER_MEMORY_CPU_PINNED;
-            } else if (args[i].equals("gpu")) {
-              requested_memory_type = TRITONSERVER_MEMORY_GPU;
-            } else {
-              Usage(
-                  "-m must be used to specify one of the following types:" +
-                  " <\"system\"|\"pinned\"|gpu>");
-            }
-            break;
+
+    FAIL_IF_ERR(TRITONSERVER_InferenceRequestDelete(irequest), "deleting inference request");
+
+    FAIL_IF_ERR(TRITONSERVER_ResponseAllocatorDelete(allocator), "deleting response allocator");
+  }
+
+  public static void main(String[] args) throws Exception
+  {
+    String model_repository_path = null;
+    int verbose_level = 0;
+
+    // Parse commandline...
+    for (int i = 0; i < args.length; i++) {
+      switch (args[i]) {
+        case "-m": {
+          enforce_memory_type = true;
+          i++;
+          if (args[i].equals("system")) {
+            requested_memory_type = TRITONSERVER_MEMORY_CPU;
+          } else if (args[i].equals("pinned")) {
+            requested_memory_type = TRITONSERVER_MEMORY_CPU_PINNED;
+          } else if (args[i].equals("gpu")) {
+            requested_memory_type = TRITONSERVER_MEMORY_GPU;
+          } else {
+            Usage(
+                "-m must be used to specify one of the following types:"
+                + " <\"system\"|\"pinned\"|gpu>");
           }
-          case "-r":
-            model_repository_path = args[++i];
-            break;
-          case "-v":
-            verbose_level = 1;
-            break;
-          case "-?":
-            Usage(null);
-            break;
+          break;
         }
+        case "-r":
+          model_repository_path = args[++i];
+          break;
+        case "-v":
+          verbose_level = 1;
+          break;
+        case "-?":
+          Usage(null);
+          break;
       }
+    }
 
-      if (model_repository_path == null) {
-        Usage("-r must be used to specify model repository path");
-      }
-      if (enforce_memory_type && requested_memory_type != TRITONSERVER_MEMORY_CPU) {
-        Usage("-m can only be set to \"system\" without enabling GPU");
+    if (model_repository_path == null) {
+      Usage("-r must be used to specify model repository path");
+    }
+    if (enforce_memory_type && requested_memory_type != TRITONSERVER_MEMORY_CPU) {
+      Usage("-m can only be set to \"system\" without enabling GPU");
+    }
+
+    // Check API version.
+    int[] api_version_major = {0}, api_version_minor = {0};
+    FAIL_IF_ERR(
+        TRITONSERVER_ApiVersion(api_version_major, api_version_minor),
+        "getting Triton API version");
+    if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major[0])
+        || (TRITONSERVER_API_VERSION_MINOR > api_version_minor[0])) {
+      FAIL("triton server API version mismatch");
+    }
+
+    // Create the server...
+    TRITONSERVER_ServerOptions server_options = new TRITONSERVER_ServerOptions(null);
+    FAIL_IF_ERR(TRITONSERVER_ServerOptionsNew(server_options), "creating server options");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetModelRepositoryPath(server_options, model_repository_path),
+        "setting model repository path");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetLogVerbose(server_options, verbose_level),
+        "setting verbose logging level");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetBackendDirectory(server_options, "/opt/tritonserver/backends"),
+        "setting backend directory");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
+            server_options, "/opt/tritonserver/repoagents"),
+        "setting repository agent directory");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetStrictModelConfig(server_options, true),
+        "setting strict model configuration");
+    double min_compute_capability = TRITON_MIN_COMPUTE_CAPABILITY;
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability(
+            server_options, min_compute_capability),
+        "setting minimum supported CUDA compute capability");
+
+    TRITONSERVER_Server server_ptr = new TRITONSERVER_Server(null);
+    FAIL_IF_ERR(TRITONSERVER_ServerNew(server_ptr, server_options), "creating server");
+    FAIL_IF_ERR(TRITONSERVER_ServerOptionsDelete(server_options), "deleting server options");
+
+    TRITONSERVER_ServerDeleter server = new TRITONSERVER_ServerDeleter(server_ptr);
+
+    // Wait until the server is both live and ready.
+    int health_iters = 0;
+    while (true) {
+      boolean[] live = {false}, ready = {false};
+      FAIL_IF_ERR(TRITONSERVER_ServerIsLive(server, live), "unable to get server liveness");
+      FAIL_IF_ERR(TRITONSERVER_ServerIsReady(server, ready), "unable to get server readiness");
+      System.out.println("Server Health: live " + live[0] + ", ready " + ready[0]);
+      if (live[0] && ready[0]) {
+        break;
       }
 
-      // Check API version.
-      int[] api_version_major = {0}, api_version_minor = {0};
-      FAIL_IF_ERR(
-          TRITONSERVER_ApiVersion(api_version_major, api_version_minor),
-          "getting Triton API version");
-      if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major[0]) ||
-          (TRITONSERVER_API_VERSION_MINOR > api_version_minor[0])) {
-        FAIL("triton server API version mismatch");
+      if (++health_iters >= 10) {
+        FAIL("failed to find healthy inference server");
       }
 
-      // Create the server...
-      TRITONSERVER_ServerOptions server_options = new TRITONSERVER_ServerOptions(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsNew(server_options),
-          "creating server options");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetModelRepositoryPath(
-              server_options, model_repository_path),
-          "setting model repository path");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetLogVerbose(server_options, verbose_level),
-          "setting verbose logging level");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetBackendDirectory(
-              server_options, "/opt/tritonserver/backends"),
-          "setting backend directory");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
-              server_options, "/opt/tritonserver/repoagents"),
-          "setting repository agent directory");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetStrictModelConfig(server_options, true),
-          "setting strict model configuration");
-      double min_compute_capability = TRITON_MIN_COMPUTE_CAPABILITY;
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability(
-              server_options, min_compute_capability),
-          "setting minimum supported CUDA compute capability");
+      Thread.sleep(500);
+    }
 
-      TRITONSERVER_Server server_ptr = new TRITONSERVER_Server(null);
+    // Print status of the server.
+    {
+      TRITONSERVER_Message server_metadata_message = new TRITONSERVER_Message(null);
       FAIL_IF_ERR(
-          TRITONSERVER_ServerNew(server_ptr, server_options), "creating server");
+          TRITONSERVER_ServerMetadata(server, server_metadata_message),
+          "unable to get server metadata message");
+      BytePointer buffer = new BytePointer((Pointer) null);
+      SizeTPointer byte_size = new SizeTPointer(1);
       FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsDelete(server_options),
-          "deleting server options");
-
-      TRITONSERVER_ServerDeleter server = new TRITONSERVER_ServerDeleter(server_ptr);
-
-      // Wait until the server is both live and ready.
-      int health_iters = 0;
-      while (true) {
-        boolean[] live = {false}, ready = {false};
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerIsLive(server, live),
-            "unable to get server liveness");
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerIsReady(server, ready),
-            "unable to get server readiness");
-        System.out.println("Server Health: live " + live[0] + ", ready " + ready[0]);
-        if (live[0] && ready[0]) {
-          break;
-        }
-
-        if (++health_iters >= 10) {
-          FAIL("failed to find healthy inference server");
-        }
-
-        Thread.sleep(500);
-      }
+          TRITONSERVER_MessageSerializeToJson(server_metadata_message, buffer, byte_size),
+          "unable to serialize server metadata message");
 
-      // Print status of the server.
-      {
-        TRITONSERVER_Message server_metadata_message = new TRITONSERVER_Message(null);
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerMetadata(server, server_metadata_message),
-            "unable to get server metadata message");
-        BytePointer buffer = new BytePointer((Pointer)null);
-        SizeTPointer byte_size = new SizeTPointer(1);
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageSerializeToJson(
-                server_metadata_message, buffer, byte_size),
-            "unable to serialize server metadata message");
-
-        System.out.println("Server Status:");
-        System.out.println(buffer.limit(byte_size.get()).getString());
-
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageDelete(server_metadata_message),
-            "deleting status metadata");
-      }
+      System.out.println("Server Status:");
+      System.out.println(buffer.limit(byte_size.get()).getString());
 
-      for(String model : MODELS) {
-        PerformInference(server, model);
-      }
+      FAIL_IF_ERR(TRITONSERVER_MessageDelete(server_metadata_message), "deleting status metadata");
+    }
 
-      System.exit(0);
+    for (String model : MODELS) {
+      PerformInference(server, model);
     }
+
+    System.exit(0);
+  }
 }
diff --git a/qa/L0_java_sequence_batcher/SequenceTest.java b/qa/L0_java_sequence_batcher/SequenceTest.java
index 3fdc5d63c1..e74214f695 100644
--- a/qa/L0_java_sequence_batcher/SequenceTest.java
+++ b/qa/L0_java_sequence_batcher/SequenceTest.java
@@ -24,615 +24,576 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import static org.bytedeco.tritonserver.global.tritonserver.*;
+
+import com.google.gson.*;
 import java.io.*;
 import java.util.*;
 import java.util.concurrent.*;
-import com.google.gson.*;
 import org.bytedeco.javacpp.*;
 import org.bytedeco.tritonserver.tritonserver.*;
-import static org.bytedeco.tritonserver.global.tritonserver.*;
 
 public class SequenceTest {
-
-    // Boilerplate code for setting up Triton
-    static void FAIL(String MSG) {
-        System.err.println("Failure: " + MSG);
-        System.exit(1);
-    }
-
-    static void FAIL_IF_ERR(TRITONSERVER_Error err__, String MSG) {
-        if (err__ != null) {
-            System.err.println("error: " + MSG + ":"
-                             + TRITONSERVER_ErrorCodeString(err__) + " - "
-                             + TRITONSERVER_ErrorMessage(err__));
-            TRITONSERVER_ErrorDelete(err__);
-            System.exit(1);
-        }
+  // Boilerplate code for setting up Triton
+  static void FAIL(String MSG)
+  {
+    System.err.println("Failure: " + MSG);
+    System.exit(1);
+  }
+
+  static void FAIL_IF_ERR(TRITONSERVER_Error err__, String MSG)
+  {
+    if (err__ != null) {
+      System.err.println(
+          "error: " + MSG + ":" + TRITONSERVER_ErrorCodeString(err__) + " - "
+          + TRITONSERVER_ErrorMessage(err__));
+      TRITONSERVER_ErrorDelete(err__);
+      System.exit(1);
     }
+  }
 
-    static int requested_memory_type = TRITONSERVER_MEMORY_CPU;
-
-    static class TRITONSERVER_ServerDeleter extends TRITONSERVER_Server {
-        public TRITONSERVER_ServerDeleter(TRITONSERVER_Server p) { super(p); deallocator(new DeleteDeallocator(this)); }
-        protected static class DeleteDeallocator extends TRITONSERVER_Server implements Deallocator {
-            DeleteDeallocator(Pointer p) { super(p); }
-            @Override public void deallocate() { TRITONSERVER_ServerDelete(this); }
-        }
-    }
+  static int requested_memory_type = TRITONSERVER_MEMORY_CPU;
 
-    static void
-    Usage(String msg)
+  static class TRITONSERVER_ServerDeleter extends TRITONSERVER_Server {
+    public TRITONSERVER_ServerDeleter(TRITONSERVER_Server p)
     {
-      if (msg != null) {
-        System.err.println(msg);
-      }
-
-      System.err.println("Usage: java " + SequenceTest.class.getSimpleName() + " [options]");
-      System.err.println("\t-m [model name]");
-      System.err.println("\t-v Enable verbose logging");
-      System.err.println("\t-r [model repository absolute path]");
-
-      System.exit(1);
+      super(p);
+      deallocator(new DeleteDeallocator(this));
     }
-
-    static class ResponseAlloc extends TRITONSERVER_ResponseAllocatorAllocFn_t {
-        @Override public TRITONSERVER_Error call (
-            TRITONSERVER_ResponseAllocator allocator, String tensor_name,
-            long byte_size, int preferred_memory_type,
-            long preferred_memory_type_id, Pointer userp, PointerPointer buffer,
-            PointerPointer buffer_userp, IntPointer actual_memory_type,
-            LongPointer actual_memory_type_id)
-        {
-          // Initially attempt to make the actual memory type and id that we
-          // allocate be the same as preferred memory type
-          actual_memory_type.put(0, preferred_memory_type);
-          actual_memory_type_id.put(0, preferred_memory_type_id);
-
-          // If 'byte_size' is zero just return 'buffer' == nullptr, we don't
-          // need to do any other book-keeping.
-          if (byte_size == 0) {
-            buffer.put(0, null);
-            buffer_userp.put(0, null);
-            System.out.println("allocated " + byte_size + " bytes for result tensor " + tensor_name);
-          } else {
-            Pointer allocated_ptr = new Pointer();
-            actual_memory_type.put(0, requested_memory_type);
-
-            actual_memory_type.put(0, TRITONSERVER_MEMORY_CPU);
-            allocated_ptr = Pointer.malloc(byte_size);
-
-            // Pass the tensor name with buffer_userp so we can show it when
-            // releasing the buffer.
-            if (!allocated_ptr.isNull()) {
-              buffer.put(0, allocated_ptr);
-              buffer_userp.put(0, new BytePointer(tensor_name));
-              System.out.println("allocated " + byte_size + " bytes in "
-                               + TRITONSERVER_MemoryTypeString(actual_memory_type.get())
-                               + " for result tensor " + tensor_name);
-            }
-          }
-
-          return null;  // Success
-        }
+    protected static class DeleteDeallocator extends TRITONSERVER_Server implements Deallocator {
+      DeleteDeallocator(Pointer p) { super(p); }
+      @Override public void deallocate() { TRITONSERVER_ServerDelete(this); }
     }
+  }
 
-    static class ResponseRelease extends TRITONSERVER_ResponseAllocatorReleaseFn_t {
-        @Override public TRITONSERVER_Error call (
-            TRITONSERVER_ResponseAllocator allocator, Pointer buffer, Pointer buffer_userp,
-            long byte_size, int memory_type, long memory_type_id)
-        {
-          BytePointer name = null;
-          if (buffer_userp != null) {
-            name = new BytePointer(buffer_userp);
-          } else {
-            name = new BytePointer("");
-          }
-
-          System.out.println("Releasing buffer " + buffer + " of size " + byte_size
-                           + " in " + TRITONSERVER_MemoryTypeString(memory_type)
-                           + " for result '" + name.getString() + "'");
-          Pointer.free(buffer);
-          name.deallocate();
-
-          return null;  // Success
-        }
+  static void Usage(String msg)
+  {
+    if (msg != null) {
+      System.err.println(msg);
     }
 
-    static class InferRequestComplete extends TRITONSERVER_InferenceRequestReleaseFn_t {
-        @Override public void call (
-            TRITONSERVER_InferenceRequest request, int flags, Pointer userp)
-        {
-          // We reuse the request so we don't delete it here.
+    System.err.println("Usage: java " + SequenceTest.class.getSimpleName() + " [options]");
+    System.err.println("\t-m [model name]");
+    System.err.println("\t-v Enable verbose logging");
+    System.err.println("\t-r [model repository absolute path]");
+
+    System.exit(1);
+  }
+
+  static class ResponseAlloc extends TRITONSERVER_ResponseAllocatorAllocFn_t {
+    @Override
+    public TRITONSERVER_Error call(
+        TRITONSERVER_ResponseAllocator allocator, String tensor_name, long byte_size,
+        int preferred_memory_type, long preferred_memory_type_id, Pointer userp,
+        PointerPointer buffer, PointerPointer buffer_userp, IntPointer actual_memory_type,
+        LongPointer actual_memory_type_id)
+    {
+      // Initially attempt to make the actual memory type and id that we
+      // allocate be the same as preferred memory type
+      actual_memory_type.put(0, preferred_memory_type);
+      actual_memory_type_id.put(0, preferred_memory_type_id);
+
+      // If 'byte_size' is zero just return 'buffer' == nullptr, we don't
+      // need to do any other book-keeping.
+      if (byte_size == 0) {
+        buffer.put(0, null);
+        buffer_userp.put(0, null);
+        System.out.println("allocated " + byte_size + " bytes for result tensor " + tensor_name);
+      } else {
+        Pointer allocated_ptr = new Pointer();
+        actual_memory_type.put(0, requested_memory_type);
+
+        actual_memory_type.put(0, TRITONSERVER_MEMORY_CPU);
+        allocated_ptr = Pointer.malloc(byte_size);
+
+        // Pass the tensor name with buffer_userp so we can show it when
+        // releasing the buffer.
+        if (!allocated_ptr.isNull()) {
+          buffer.put(0, allocated_ptr);
+          buffer_userp.put(0, new BytePointer(tensor_name));
+          System.out.println(
+              "allocated " + byte_size + " bytes in "
+              + TRITONSERVER_MemoryTypeString(actual_memory_type.get()) + " for result tensor "
+              + tensor_name);
         }
-    }
+      }
 
-    static class InferResponseComplete extends TRITONSERVER_InferenceResponseCompleteFn_t {
-        @Override public void call (
-            TRITONSERVER_InferenceResponse response, int flags, Pointer userp)
-        {
-          if (response != null) {
-            // Send 'response' to the future.
-            futures.get(userp).complete(response);
-          }
-        }
+      return null; // Success
     }
+  }
 
-    static ConcurrentHashMap<Pointer, CompletableFuture<TRITONSERVER_InferenceResponse>> futures = new ConcurrentHashMap<>();
-    static ResponseAlloc responseAlloc = new ResponseAlloc();
-    static ResponseRelease responseRelease = new ResponseRelease();
-    static InferRequestComplete inferRequestComplete = new InferRequestComplete();
-    static InferResponseComplete inferResponseComplete = new InferResponseComplete();
-
-    static TRITONSERVER_Error
-    ParseModelMetadata(
-        JsonObject model_metadata,
-        boolean[] is_torch_model)
+  static class ResponseRelease extends TRITONSERVER_ResponseAllocatorReleaseFn_t {
+    @Override
+    public TRITONSERVER_Error call(
+        TRITONSERVER_ResponseAllocator allocator, Pointer buffer, Pointer buffer_userp,
+        long byte_size, int memory_type, long memory_type_id)
     {
-      String seen_data_type = null;
-      for (JsonElement input_element : model_metadata.get("inputs").getAsJsonArray()) {
-        JsonObject input = input_element.getAsJsonObject();
-        if (!input.get("datatype").getAsString().equals("INT32")) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_UNSUPPORTED,
-              "sequence qa example only supports model with data type INT32");
-        }
-        if (seen_data_type == null) {
-          seen_data_type = input.get("datatype").getAsString();
-        } else if (!seen_data_type.equals(input.get("datatype").getAsString())) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_INVALID_ARG,
-              "the inputs and outputs of sequence model must have the data type");
-        }
-      }
-      for (JsonElement output_element : model_metadata.get("outputs").getAsJsonArray()) {
-        JsonObject output = output_element.getAsJsonObject();
-        if (!output.get("datatype").getAsString().equals("INT32")) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_UNSUPPORTED,
-              "sequence qa example only supports model with data type INT32");
-        } else if (!seen_data_type.equals(output.get("datatype").getAsString())) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_INVALID_ARG,
-              "the inputs and outputs of sequence' model must have the data type");
-        }
+      BytePointer name = null;
+      if (buffer_userp != null) {
+        name = new BytePointer(buffer_userp);
+      } else {
+        name = new BytePointer("");
       }
 
-      is_torch_model[0] =
-          model_metadata.get("platform").getAsString().equals("pytorch_libtorch");
-      return null;
+      System.out.println(
+          "Releasing buffer " + buffer + " of size " + byte_size + " in "
+          + TRITONSERVER_MemoryTypeString(memory_type) + " for result '" + name.getString() + "'");
+      Pointer.free(buffer);
+      name.deallocate();
+
+      return null; // Success
     }
+  }
 
-    // Custom function to set metadata required for sequence batcher
-    static void
-    SetSequenceMetadata(TRITONSERVER_InferenceRequest irequest, long correlation_id, boolean sequence_start, boolean sequence_end)
+  static class InferRequestComplete extends TRITONSERVER_InferenceRequestReleaseFn_t {
+    @Override public void call(TRITONSERVER_InferenceRequest request, int flags, Pointer userp)
     {
+      // We reuse the request so we don't delete it here.
+    }
+  }
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestSetCorrelationId(
-              irequest, correlation_id), "Unable to set correlation ID");
-      int flags = 0;
-      if(sequence_start) {
-        flags += TRITONSERVER_REQUEST_FLAG_SEQUENCE_START;
+  static class InferResponseComplete extends TRITONSERVER_InferenceResponseCompleteFn_t {
+    @Override public void call(TRITONSERVER_InferenceResponse response, int flags, Pointer userp)
+    {
+      if (response != null) {
+        // Send 'response' to the future.
+        futures.get(userp).complete(response);
       }
-      if(sequence_end) {
-        flags += TRITONSERVER_REQUEST_FLAG_SEQUENCE_END;
+    }
+  }
+
+  static ConcurrentHashMap<Pointer, CompletableFuture<TRITONSERVER_InferenceResponse>> futures =
+      new ConcurrentHashMap<>();
+  static ResponseAlloc responseAlloc = new ResponseAlloc();
+  static ResponseRelease responseRelease = new ResponseRelease();
+  static InferRequestComplete inferRequestComplete = new InferRequestComplete();
+  static InferResponseComplete inferResponseComplete = new InferResponseComplete();
+
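+  // Sanity-check the model metadata returned by the server: every input and
+  // output must use the INT32 datatype, and is_torch_model records whether the
+  // backend is libtorch, which uses the INPUT__0/OUTPUT__0 naming convention.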
+  static TRITONSERVER_Error ParseModelMetadata(JsonObject model_metadata, boolean[] is_torch_model)
+  {
+    String seen_data_type = null;
+    for (JsonElement input_element : model_metadata.get("inputs").getAsJsonArray()) {
+      JsonObject input = input_element.getAsJsonObject();
+      if (!input.get("datatype").getAsString().equals("INT32")) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_UNSUPPORTED,
+            "sequence qa example only supports model with data type INT32");
+      }
+      if (seen_data_type == null) {
+        seen_data_type = input.get("datatype").getAsString();
+      } else if (!seen_data_type.equals(input.get("datatype").getAsString())) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INVALID_ARG,
+            "the inputs and outputs of sequence model must have the data type");
+      }
+    }
+    for (JsonElement output_element : model_metadata.get("outputs").getAsJsonArray()) {
+      JsonObject output = output_element.getAsJsonObject();
+      if (!output.get("datatype").getAsString().equals("INT32")) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_UNSUPPORTED,
+            "sequence qa example only supports model with data type INT32");
+      } else if (!seen_data_type.equals(output.get("datatype").getAsString())) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INVALID_ARG,
+            "the inputs and outputs of sequence' model must have the data type");
       }
-      FAIL_IF_ERR(
-        TRITONSERVER_InferenceRequestSetFlags(
-            irequest, flags), "Unable to set flags");
-
     }
 
-    // Custom function for adjusting sequence batcher
-    // expected results for backends that do not implement
-    // full accumulator
-    static int
-    GetExpectedResult(String model_name, int expected_result, int value, String flag){
-      if((!model_name.contains("nobatch") && !model_name.contains("custom")) ||
-          model_name.contains("graphdef") || model_name.contains("plan") ||
-          model_name.contains("onnx") || model_name.contains("libtorch")){
-            expected_result = value;
-            if(flag != null && flag.contains("start")){
-              expected_result++;
-            }
-        }
-        return expected_result;
+    is_torch_model[0] = model_metadata.get("platform").getAsString().equals("pytorch_libtorch");
+    return null;
+  }
+
+  // Custom function to set metadata required for sequence batcher
+  static void SetSequenceMetadata(
+      TRITONSERVER_InferenceRequest irequest, long correlation_id, boolean sequence_start,
+      boolean sequence_end)
+  {
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestSetCorrelationId(irequest, correlation_id),
+        "Unable to set correlation ID");
+    int flags = 0;
+    if (sequence_start) {
+      flags += TRITONSERVER_REQUEST_FLAG_SEQUENCE_START;
+    }
+    if (sequence_end) {
+      flags += TRITONSERVER_REQUEST_FLAG_SEQUENCE_END;
+    }
+    FAIL_IF_ERR(TRITONSERVER_InferenceRequestSetFlags(irequest, flags), "Unable to set flags");
+  }
+
+  // Custom function for adjusting sequence batcher
+  // expected results for backends that do not implement
+  // full accumulator
+  static int GetExpectedResult(String model_name, int expected_result, int value, String flag)
+  {
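+    // For backends that do not implement a full accumulator, the expected value
+    // is simply the final input value, plus one if the request carries the
+    // "start" control flag; otherwise the accumulated expected_result is kept.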
+    if ((!model_name.contains("nobatch") && !model_name.contains("custom"))
+        || model_name.contains("graphdef") || model_name.contains("plan")
+        || model_name.contains("onnx") || model_name.contains("libtorch")) {
+      expected_result = value;
+      if (flag != null && flag.contains("start")) {
+        expected_result++;
+      }
+    }
+    return expected_result;
+  }
+
+  // Standard function for checking response parameters,
+  // plus customized check that final sequence result
+  // "out" matches expected result
+  static void Check(
+      String model_name, TRITONSERVER_InferenceResponse response, int input_value, String output0,
+      long expected_byte_size, int expected_datatype, boolean sequence_end, int expected_result)
+  {
+    HashMap<String, Pointer> output_data = new HashMap<>();
+
+    int[] output_count = {0};
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceResponseOutputCount(response, output_count),
+        "getting number of response outputs");
+    if (output_count[0] != 1) {
+      FAIL("expecting 1 response outputs, got " + output_count[0]);
     }
 
-    // Standard function for checking response parameters,
-    // plus customized check that final sequence result
-    // "out" matches expected result
-    static void
-    Check(
-        String model_name,
-        TRITONSERVER_InferenceResponse response,
-        int input_value, String output0,
-        long expected_byte_size, int expected_datatype,
-        boolean sequence_end, int expected_result)
-    {
-      HashMap<String, Pointer> output_data = new HashMap<>();
+    for (int idx = 0; idx < output_count[0]; ++idx) {
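+      // Out-parameters filled in by TRITONSERVER_InferenceResponseOutput below.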
+      BytePointer cname = new BytePointer((Pointer) null);
+      IntPointer datatype = new IntPointer(1);
+      LongPointer shape = new LongPointer((Pointer) null);
+      LongPointer dim_count = new LongPointer(1);
+      Pointer base = new Pointer();
+      SizeTPointer byte_size = new SizeTPointer(1);
+      IntPointer memory_type = new IntPointer(1);
+      LongPointer memory_type_id = new LongPointer(1);
+      Pointer userp = new Pointer();
 
-      int[] output_count = {0};
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceResponseOutputCount(response, output_count),
-          "getting number of response outputs");
-      if (output_count[0] != 1) {
-        FAIL("expecting 1 response outputs, got " + output_count[0]);
+          TRITONSERVER_InferenceResponseOutput(
+              response, idx, cname, datatype, shape, dim_count, base, byte_size, memory_type,
+              memory_type_id, userp),
+          "getting output info");
+
+      if (cname.isNull()) {
+        FAIL("unable to get output name");
       }
 
-      for (int idx = 0; idx < output_count[0]; ++idx) {
-        BytePointer cname = new BytePointer((Pointer)null);
-        IntPointer datatype = new IntPointer(1);
-        LongPointer shape = new LongPointer((Pointer)null);
-        LongPointer dim_count = new LongPointer(1);
-        Pointer base = new Pointer();
-        SizeTPointer byte_size = new SizeTPointer(1);
-        IntPointer memory_type = new IntPointer(1);
-        LongPointer memory_type_id = new LongPointer(1);
-        Pointer userp = new Pointer();
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseOutput(
-                response, idx, cname, datatype, shape, dim_count, base,
-                byte_size, memory_type, memory_type_id, userp),
-            "getting output info");
-
-        if (cname.isNull()) {
-          FAIL("unable to get output name");
-        }
+      String name = cname.getString();
+      if (!name.equals(output0)) {
+        FAIL("unexpected output '" + name + "'");
+      }
 
-        String name = cname.getString();
-        if (!name.equals(output0)) {
-          FAIL("unexpected output '" + name + "'");
-        }
+      if ((dim_count.get() != 1) || (shape.get(0) != 1)) {
+        FAIL("unexpected shape for '" + name + "'");
+      }
 
-        if ((dim_count.get() != 1) || (shape.get(0) != 1)) {
-          FAIL("unexpected shape for '" + name + "'");
-        }
+      if (datatype.get() != expected_datatype) {
+        FAIL(
+            "unexpected datatype '" + TRITONSERVER_DataTypeString(datatype.get()) + "' for '" + name
+            + "'");
+      }
 
-        if (datatype.get() != expected_datatype) {
-          FAIL(
-              "unexpected datatype '" +
-              TRITONSERVER_DataTypeString(datatype.get()) + "' for '" +
-              name + "'");
-        }
+      if (byte_size.get() != expected_byte_size) {
+        FAIL(
+            "unexpected byte-size, expected " + expected_byte_size + ", got " + byte_size.get()
+            + " for " + name);
+      }
 
-        if (byte_size.get() != expected_byte_size) {
-          FAIL(
-              "unexpected byte-size, expected " +
-              expected_byte_size + ", got " +
-              byte_size.get() + " for " + name);
-        }
+      if (memory_type.get() != requested_memory_type) {
+        FAIL(
+            "unexpected memory type, expected to be allocated in "
+            + TRITONSERVER_MemoryTypeString(requested_memory_type) + ", got "
+            + TRITONSERVER_MemoryTypeString(memory_type.get()) + ", id " + memory_type_id.get()
+            + " for " + name);
+      }
 
-        if (memory_type.get() != requested_memory_type) {
-          FAIL(
-              "unexpected memory type, expected to be allocated in " +
-              TRITONSERVER_MemoryTypeString(requested_memory_type) +
-              ", got " + TRITONSERVER_MemoryTypeString(memory_type.get()) +
-              ", id " + memory_type_id.get() + " for " + name);
-        }
+      // We make a copy of the data here... which we could avoid for
+      // performance reasons but ok for this sequence example.
+      BytePointer odata = new BytePointer(byte_size.get());
+      output_data.put(name, odata);
+      System.out.println(name + " is stored in system memory");
+      odata.put(base.limit(byte_size.get()));
+    }
 
-        // We make a copy of the data here... which we could avoid for
-        // performance reasons but ok for this sequence example.
-        BytePointer odata = new BytePointer(byte_size.get());
-        output_data.put(name, odata);
-        System.out.println(name + " is stored in system memory");
-        odata.put(base.limit(byte_size.get()));
+    int out = new IntPointer(output_data.get(output0)).get(0);
+    System.out.println("Value: " + out);
+    if (sequence_end) {
+      expected_result = GetExpectedResult(model_name, expected_result, input_value, "end");
+      if (out != expected_result) {
+        FAIL("Expected result: " + expected_result + ", got " + out);
+      } else {
+        System.out.println(model_name + " test PASSED");
       }
-
-      int out = new IntPointer(output_data.get(output0)).get(0);
-      System.out.println("Value: " + out);
-      if(sequence_end){
-        expected_result = GetExpectedResult(model_name, expected_result,
-            input_value, "end");
-        if(out != expected_result){
-          FAIL("Expected result: " + expected_result + ", got " + out);
-        } else {
-          System.out.println(model_name + " test PASSED");
-        }
+    }
+  }
+
+  // Boilerplate main function to run inference
+  // for provided model, custom setting of
+  // sequence metadata
+  public static void main(String[] args) throws Exception
+  {
+    String model_repository_path = null;
+    String model_name = null;
+    int verbose_level = 0;
+
+    // Parse commandline...
+    for (int i = 0; i < args.length; i++) {
+      switch (args[i]) {
+        case "-m":
+          model_name = args[++i];
+          break;
+        case "-r":
+          model_repository_path = args[++i];
+          break;
+        case "-v":
+          verbose_level = 1;
+          break;
+        case "-?":
+          Usage(null);
+          break;
       }
     }
 
-    // Boilerplate main function to run inference
-    // for provided model, custom setting of
-    // sequence metadata
-    public static void
-    main(String[] args) throws Exception
-    {
-      String model_repository_path = null;
-      String model_name = null;
-      int verbose_level = 0;
-
-      // Parse commandline...
-      for (int i = 0; i < args.length; i++) {
-        switch (args[i]) {
-          case "-m":
-            model_name = args[++i];
-            break;
-          case "-r":
-            model_repository_path = args[++i];
-            break;
-          case "-v":
-            verbose_level = 1;
-            break;
-          case "-?":
-            Usage(null);
-            break;
-        }
-      }
+    if (model_name == null) {
+      Usage("-m must be used to specify model name");
+    }
+    if (model_repository_path == null) {
+      Usage("-r must be used to specify model repository path");
+    }
 
-      if(model_name == null) {
-        Usage("-m must be used to specify model name");
-      }
-      if (model_repository_path == null) {
-        Usage("-r must be used to specify model repository path");
+    // Check API version.
+    int[] api_version_major = {0}, api_version_minor = {0};
+    FAIL_IF_ERR(
+        TRITONSERVER_ApiVersion(api_version_major, api_version_minor),
+        "getting Triton API version");
+    if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major[0])
+        || (TRITONSERVER_API_VERSION_MINOR > api_version_minor[0])) {
+      FAIL("triton server API version mismatch");
+    }
+
+    // Create the server...
+    TRITONSERVER_ServerOptions server_options = new TRITONSERVER_ServerOptions(null);
+    FAIL_IF_ERR(TRITONSERVER_ServerOptionsNew(server_options), "creating server options");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetModelRepositoryPath(server_options, model_repository_path),
+        "setting model repository path");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetLogVerbose(server_options, verbose_level),
+        "setting verbose logging level");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetBackendDirectory(server_options, "/opt/tritonserver/backends"),
+        "setting backend directory");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
+            server_options, "/opt/tritonserver/repoagents"),
+        "setting repository agent directory");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetStrictModelConfig(server_options, true),
+        "setting strict model configuration");
+
+    TRITONSERVER_Server server_ptr = new TRITONSERVER_Server(null);
+    FAIL_IF_ERR(TRITONSERVER_ServerNew(server_ptr, server_options), "creating server");
+    FAIL_IF_ERR(TRITONSERVER_ServerOptionsDelete(server_options), "deleting server options");
+
+    TRITONSERVER_ServerDeleter server = new TRITONSERVER_ServerDeleter(server_ptr);
+
+    // Wait until the server is both live and ready.
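+    // Poll up to 10 times, sleeping 500 ms between checks, before giving up.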
+    int health_iters = 0;
+    while (true) {
+      boolean[] live = {false}, ready = {false};
+      FAIL_IF_ERR(TRITONSERVER_ServerIsLive(server, live), "unable to get server liveness");
+      FAIL_IF_ERR(TRITONSERVER_ServerIsReady(server, ready), "unable to get server readiness");
+      System.out.println("Server Health: live " + live[0] + ", ready " + ready[0]);
+      if (live[0] && ready[0]) {
+        break;
       }
 
-      // Check API version.
-      int[] api_version_major = {0}, api_version_minor = {0};
-      FAIL_IF_ERR(
-          TRITONSERVER_ApiVersion(api_version_major, api_version_minor),
-          "getting Triton API version");
-      if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major[0]) ||
-          (TRITONSERVER_API_VERSION_MINOR > api_version_minor[0])) {
-        FAIL("triton server API version mismatch");
+      if (++health_iters >= 10) {
+        FAIL("failed to find healthy inference server");
       }
 
-      // Create the server...
-      TRITONSERVER_ServerOptions server_options = new TRITONSERVER_ServerOptions(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsNew(server_options),
-          "creating server options");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetModelRepositoryPath(
-              server_options, model_repository_path),
-          "setting model repository path");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetLogVerbose(server_options, verbose_level),
-          "setting verbose logging level");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetBackendDirectory(
-              server_options, "/opt/tritonserver/backends"),
-          "setting backend directory");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
-              server_options, "/opt/tritonserver/repoagents"),
-          "setting repository agent directory");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetStrictModelConfig(server_options, true),
-          "setting strict model configuration");
+      Thread.sleep(500);
+    }
 
-      TRITONSERVER_Server server_ptr = new TRITONSERVER_Server(null);
+    // Print status of the server.
+    {
+      TRITONSERVER_Message server_metadata_message = new TRITONSERVER_Message(null);
       FAIL_IF_ERR(
-          TRITONSERVER_ServerNew(server_ptr, server_options), "creating server");
+          TRITONSERVER_ServerMetadata(server, server_metadata_message),
+          "unable to get server metadata message");
+      BytePointer buffer = new BytePointer((Pointer) null);
+      SizeTPointer byte_size = new SizeTPointer(1);
       FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsDelete(server_options),
-          "deleting server options");
-
-      TRITONSERVER_ServerDeleter server = new TRITONSERVER_ServerDeleter(server_ptr);
-
-      // Wait until the server is both live and ready.
-      int health_iters = 0;
-      while (true) {
-        boolean[] live = {false}, ready = {false};
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerIsLive(server, live),
-            "unable to get server liveness");
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerIsReady(server, ready),
-            "unable to get server readiness");
-        System.out.println("Server Health: live " + live[0] + ", ready " + ready[0]);
-        if (live[0] && ready[0]) {
-          break;
-        }
+          TRITONSERVER_MessageSerializeToJson(server_metadata_message, buffer, byte_size),
+          "unable to serialize server metadata message");
+
+      System.out.println("Server Status:");
+      System.out.println(buffer.limit(byte_size.get()).getString());
 
+      FAIL_IF_ERR(TRITONSERVER_MessageDelete(server_metadata_message), "deleting status metadata");
+    }
+
+    // Wait for the model to become available.
+    boolean[] is_torch_model = {false};
+    boolean[] is_ready = {false};
+    health_iters = 0;
+    while (!is_ready[0]) {
+      FAIL_IF_ERR(
+          TRITONSERVER_ServerModelIsReady(server, model_name, 1, is_ready),
+          "unable to get model readiness");
+      if (!is_ready[0]) {
         if (++health_iters >= 10) {
-          FAIL("failed to find healthy inference server");
+          FAIL("model failed to be ready in 10 iterations");
         }
-
         Thread.sleep(500);
+        continue;
       }
 
-      // Print status of the server.
-      {
-        TRITONSERVER_Message server_metadata_message = new TRITONSERVER_Message(null);
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerMetadata(server, server_metadata_message),
-            "unable to get server metadata message");
-        BytePointer buffer = new BytePointer((Pointer)null);
-        SizeTPointer byte_size = new SizeTPointer(1);
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageSerializeToJson(
-                server_metadata_message, buffer, byte_size),
-            "unable to serialize server metadata message");
-
-        System.out.println("Server Status:");
-        System.out.println(buffer.limit(byte_size.get()).getString());
-
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageDelete(server_metadata_message),
-            "deleting status metadata");
-      }
-
-      // Wait for the model to become available.
-      boolean[] is_torch_model = {false};
-      boolean[] is_ready = {false};
-      health_iters = 0;
-      while (!is_ready[0]) {
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerModelIsReady(
-                server, model_name, 1, is_ready),
-            "unable to get model readiness");
-        if (!is_ready[0]) {
-          if (++health_iters >= 10) {
-            FAIL("model failed to be ready in 10 iterations");
-          }
-          Thread.sleep(500);
-          continue;
-        }
+      TRITONSERVER_Message model_metadata_message = new TRITONSERVER_Message(null);
+      FAIL_IF_ERR(
+          TRITONSERVER_ServerModelMetadata(server, model_name, 1, model_metadata_message),
+          "unable to get model metadata message");
+      BytePointer buffer = new BytePointer((Pointer) null);
+      SizeTPointer byte_size = new SizeTPointer(1);
+      FAIL_IF_ERR(
+          TRITONSERVER_MessageSerializeToJson(model_metadata_message, buffer, byte_size),
+          "unable to serialize model status protobuf");
 
-        TRITONSERVER_Message model_metadata_message = new TRITONSERVER_Message(null);
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerModelMetadata(
-                server, model_name, 1, model_metadata_message),
-            "unable to get model metadata message");
-        BytePointer buffer = new BytePointer((Pointer)null);
-        SizeTPointer byte_size = new SizeTPointer(1);
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageSerializeToJson(
-                model_metadata_message, buffer, byte_size),
-            "unable to serialize model status protobuf");
-
-        JsonParser parser = new JsonParser();
-        JsonObject model_metadata = null;
-        try {
-          model_metadata = parser.parse(buffer.limit(byte_size.get()).getString()).getAsJsonObject();
-        } catch (Exception e) {
-          FAIL("error: failed to parse model metadata from JSON: " + e);
-        }
+      JsonParser parser = new JsonParser();
+      JsonObject model_metadata = null;
+      try {
+        model_metadata = parser.parse(buffer.limit(byte_size.get()).getString()).getAsJsonObject();
+      }
+      catch (Exception e) {
+        FAIL("error: failed to parse model metadata from JSON: " + e);
+      }
 
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageDelete(model_metadata_message),
-            "deleting status protobuf");
+      FAIL_IF_ERR(TRITONSERVER_MessageDelete(model_metadata_message), "deleting status protobuf");
 
-        if (!model_metadata.get("name").getAsString().equals(model_name)) {
-          FAIL("unable to find metadata for model");
-        }
+      if (!model_metadata.get("name").getAsString().equals(model_name)) {
+        FAIL("unable to find metadata for model");
+      }
 
-        boolean found_version = false;
-        if (model_metadata.has("versions")) {
-          for (JsonElement version : model_metadata.get("versions").getAsJsonArray()) {
-            if (version.getAsString().equals("1")) {
-              found_version = true;
-              break;
-            }
+      boolean found_version = false;
+      if (model_metadata.has("versions")) {
+        for (JsonElement version : model_metadata.get("versions").getAsJsonArray()) {
+          if (version.getAsString().equals("1")) {
+            found_version = true;
+            break;
           }
         }
-        if (!found_version) {
-          FAIL("unable to find version 1 status for model");
-        }
-
-        FAIL_IF_ERR(
-            ParseModelMetadata(model_metadata, is_torch_model),
-            "parsing model metadata");
+      }
+      if (!found_version) {
+        FAIL("unable to find version 1 status for model");
       }
 
-      // Create the allocator that will be used to allocate buffers for
-      // the result tensors.
-      TRITONSERVER_ResponseAllocator allocator = new TRITONSERVER_ResponseAllocator(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_ResponseAllocatorNew(
-              allocator, responseAlloc, responseRelease, null /* start_fn */),
-          "creating response allocator");
+      FAIL_IF_ERR(ParseModelMetadata(model_metadata, is_torch_model), "parsing model metadata");
+    }
 
-      // Inference
-      TRITONSERVER_InferenceRequest irequest = new TRITONSERVER_InferenceRequest(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestNew(
-              irequest, server, model_name, -1 /* model_version */),
-          "creating inference request");
+    // Create the allocator that will be used to allocate buffers for
+    // the result tensors.
+    TRITONSERVER_ResponseAllocator allocator = new TRITONSERVER_ResponseAllocator(null);
+    FAIL_IF_ERR(
+        TRITONSERVER_ResponseAllocatorNew(
+            allocator, responseAlloc, responseRelease, null /* start_fn */),
+        "creating response allocator");
+
+    // Inference
+    TRITONSERVER_InferenceRequest irequest = new TRITONSERVER_InferenceRequest(null);
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestNew(irequest, server, model_name, -1 /* model_version */),
+        "creating inference request");
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestSetId(irequest, "my_request_id"),
+        "setting ID for the request");
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestSetReleaseCallback(
+            irequest, inferRequestComplete, null /* request_release_userp */),
+        "setting request release callback");
+
+    // Inputs
+    String input0 = is_torch_model[0] ? "INPUT__0" : "INPUT";
+
+    long[] input0_shape = {1};
+
+    int datatype = TRITONSERVER_TYPE_INT32;
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAddInput(
+            irequest, input0, datatype, input0_shape, input0_shape.length),
+        "setting input 0 meta-data for the request");
+
+    String output0 = is_torch_model[0] ? "OUTPUT__0" : "OUTPUT";
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output0),
+        "requesting output 0 for the request");
+
+    // Non-zero ID for the sequence requests
+    long correlation_id = 5;
+    // Number of requests in the sequence
+    int num_requests = 9;
+    // expected_result is 1 + 2 + 3 + ... + num_requests
+    int expected_result = num_requests * (1 + num_requests) / 2;
+    boolean sequence_start = true;
+    boolean sequence_end = false;
+
+    // Create the initial data for the input tensor.
+    IntPointer[] p0 = {new IntPointer(1)};
+    BytePointer input0_data = p0[0].getPointer(BytePointer.class);
+    long input0_size = input0_data.limit();
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAppendInputData(
+            irequest, input0, input0_data, input0_size, requested_memory_type,
+            0 /* memory_type_id */),
+        "assigning INPUT0 data");
+
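+    // The same request object and input buffer are reused for every step of the
+    // sequence: each iteration overwrites the single INT32 value in place,
+    // updates the start/end flags, and waits on a future for that step's response.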
+    for (int i = 0; i < num_requests; i++) {
+      // Update input value
+      int input = i + 1;
+      p0[0].put(0, input);
+
+      // Set sequence metadata
+      if (i == 1) {
+        sequence_start = false;
+      }
+      if (i == num_requests - 1) {
+        sequence_end = true;
+      }
+      SetSequenceMetadata(irequest, correlation_id, sequence_start, sequence_end);
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestSetId(irequest, "my_request_id"),
-          "setting ID for the request");
+      // Perform inference...
+      CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
+      futures.put(irequest, completed);
 
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestSetReleaseCallback(
-              irequest, inferRequestComplete, null /* request_release_userp */),
-          "setting request release callback");
-
-      // Inputs
-      String input0 = is_torch_model[0] ? "INPUT__0" : "INPUT";
-
-      long[] input0_shape = {1};
-
-      int datatype = TRITONSERVER_TYPE_INT32;
+          TRITONSERVER_InferenceRequestSetResponseCallback(
+              irequest, allocator, null /* response_allocator_userp */, inferResponseComplete,
+              irequest),
+          "setting response callback");
 
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAddInput(
-              irequest, input0, datatype, input0_shape, input0_shape.length),
-          "setting input 0 meta-data for the request");
+          TRITONSERVER_ServerInferAsync(server, irequest, null /* trace */), "running inference");
 
-      String output0 = is_torch_model[0] ? "OUTPUT__0" : "OUTPUT";
+      // Wait for the inference to complete.
+      TRITONSERVER_InferenceResponse completed_response = completed.get();
+      futures.remove(irequest);
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output0),
-          "requesting output 0 for the request");
-
-      // Non-zero ID for the sequence requests
-      long correlation_id = 5;
-      // Number of requests in the sequence
-      int num_requests = 9;
-      // Expected_result is  1+2+3+...+num_requests
-      int expected_result = num_requests * (1 + num_requests) / 2;
-      boolean sequence_start = true;
-      boolean sequence_end = false;
-
-      // Create the initial data for the input tensor.
-      IntPointer[] p0 = {new IntPointer(1)};
-      BytePointer input0_data = p0[0].getPointer(BytePointer.class);
-      long input0_size = input0_data.limit();
+      FAIL_IF_ERR(TRITONSERVER_InferenceResponseError(completed_response), "response status");
 
-      FAIL_IF_ERR(
-            TRITONSERVER_InferenceRequestAppendInputData(
-                irequest, input0, input0_data, input0_size, requested_memory_type,
-                0 /* memory_type_id */),
-            "assigning INPUT0 data");
-
-      for(int i = 0; i < num_requests; i++) {
-        // Update input value
-        int input = i + 1;
-        p0[0].put(0, input);
-
-        // Set sequence metadata
-        if(i == 1) {
-          sequence_start = false;
-        }
-        if(i == num_requests - 1) {
-          sequence_end = true;
-        }
-        SetSequenceMetadata(irequest, correlation_id, sequence_start, sequence_end);
-        
-        // Perform inference...
-        CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
-        futures.put(irequest, completed);
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceRequestSetResponseCallback(
-                irequest, allocator, null /* response_allocator_userp */,
-                inferResponseComplete, irequest),
-            "setting response callback");
-
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerInferAsync(
-                server, irequest, null /* trace */),
-            "running inference");
-
-        // Wait for the inference to complete.
-        TRITONSERVER_InferenceResponse completed_response = completed.get();
-        futures.remove(irequest);
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseError(completed_response),
-            "response status");
-
-        Check(
-            model_name, completed_response, input, output0, input0_size,
-            datatype, sequence_end, expected_result);
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseDelete(completed_response),
-            "deleting inference response");
-      }
+      Check(
+          model_name, completed_response, input, output0, input0_size, datatype, sequence_end,
+          expected_result);
 
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestDelete(irequest),
-          "deleting inference request");
+          TRITONSERVER_InferenceResponseDelete(completed_response), "deleting inference response");
+    }
 
-      FAIL_IF_ERR(
-          TRITONSERVER_ResponseAllocatorDelete(allocator),
-          "deleting response allocator");
+    FAIL_IF_ERR(TRITONSERVER_InferenceRequestDelete(irequest), "deleting inference request");
 
-      System.exit(0);
-    }
+    FAIL_IF_ERR(TRITONSERVER_ResponseAllocatorDelete(allocator), "deleting response allocator");
+
+    System.exit(0);
+  }
 }
diff --git a/qa/L0_json/test.sh b/qa/L0_json/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_large_payload/large_payload_test.py b/qa/L0_large_payload/large_payload_test.py
old mode 100644
new mode 100755
index 051fa4790b..fff57290ef
--- a/qa/L0_large_payload/large_payload_test.py
+++ b/qa/L0_large_payload/large_payload_test.py
@@ -1,4 +1,6 @@
-# Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -30,15 +32,15 @@
 
 import math
 import unittest
+
 import numpy as np
 import test_util as tu
 import tritongrpcclient as grpcclient
 import tritonhttpclient as httpclient
-from tritonclientutils import np_to_triton_dtype, InferenceServerException
+from tritonclientutils import InferenceServerException, np_to_triton_dtype
 
 
 class LargePayLoadTest(tu.TestResultCollector):
-
     def setUp(self):
         self._data_type = np.float32
 
@@ -46,36 +48,40 @@ def setUp(self):
         # hard limit on 2GBs for the size of input tensors. All backends except
         # plan backend should be able to handle payloads larger than 2GBs using
         # HTTP.
-        very_large_tensor_shape = (math.trunc(
-            3 * (1024 * 1024 * 1024) / np.dtype(self._data_type).itemsize),)
+        very_large_tensor_shape = (
+            math.trunc(3 * (1024 * 1024 * 1024) / np.dtype(self._data_type).itemsize),
+        )
         self._very_large_in0 = np.random.random(very_large_tensor_shape).astype(
-            self._data_type)
+            self._data_type
+        )
 
         # 1.9 GBs allows us to test gRPC with moderate sizes too.
-        large_tensor_shape = (math.trunc(1.9 * (1024 * 1024 * 1024) //
-                                         np.dtype(self._data_type).itemsize),)
-        self._large_in0 = np.random.random(large_tensor_shape).astype(
-            self._data_type)
+        large_tensor_shape = (
+            math.trunc(
+                1.9 * (1024 * 1024 * 1024) // np.dtype(self._data_type).itemsize
+            ),
+        )
+        self._large_in0 = np.random.random(large_tensor_shape).astype(self._data_type)
 
         small_tensor_shape = (1,)
-        self._small_in0 = np.random.random(small_tensor_shape).astype(
-            self._data_type)
-
-        self._clients = ((httpclient,
-                          httpclient.InferenceServerClient('localhost:8000')),
-                         (grpcclient,
-                          grpcclient.InferenceServerClient('localhost:8001')))
-
-    def _test_helper(self,
-                     client,
-                     model_name,
-                     input_name='INPUT0',
-                     output_name='OUTPUT0'):
-        # plan does not supoort large batch sizes.
-        if not model_name.startswith('plan'):
+        self._small_in0 = np.random.random(small_tensor_shape).astype(self._data_type)
+
+        self._clients = (
+            (httpclient, httpclient.InferenceServerClient("localhost:8000")),
+            (grpcclient, grpcclient.InferenceServerClient("localhost:8001")),
+        )
+
+    def _test_helper(
+        self, client, model_name, input_name="INPUT0", output_name="OUTPUT0"
+    ):
+        # plan does not support large batch sizes.
+        if not model_name.startswith("plan"):
             inputs = [
-                client[0].InferInput(input_name, self._large_in0.shape,
-                                     np_to_triton_dtype(self._data_type))
+                client[0].InferInput(
+                    input_name,
+                    self._large_in0.shape,
+                    np_to_triton_dtype(self._data_type),
+                )
             ]
             inputs[0].set_data_from_numpy(self._large_in0)
             results = client[1].infer(model_name, inputs)
@@ -84,13 +90,17 @@ def _test_helper(self,
             # the framework and protocol do support large payload
             self.assertTrue(
                 np.array_equal(self._large_in0, results.as_numpy(output_name)),
-                "output is different from input")
+                "output is different from input",
+            )
 
         if client[0] == httpclient:
             # FIXME HTTPServer cannot support large payloads. See DLIS-1776.
             inputs = [
-                client[0].InferInput(input_name, self._very_large_in0.shape,
-                                     np_to_triton_dtype(self._data_type))
+                client[0].InferInput(
+                    input_name,
+                    self._very_large_in0.shape,
+                    np_to_triton_dtype(self._data_type),
+                )
             ]
             inputs[0].set_data_from_numpy(self._very_large_in0)
             with self.assertRaises(InferenceServerException):
@@ -113,56 +123,54 @@ def _test_helper(self,
 
         # Send a small payload to verify if the server is still functional
         inputs = [
-            client[0].InferInput(input_name, self._small_in0.shape,
-                                 np_to_triton_dtype(self._data_type))
+            client[0].InferInput(
+                input_name, self._small_in0.shape, np_to_triton_dtype(self._data_type)
+            )
         ]
         inputs[0].set_data_from_numpy(self._small_in0)
         results = client[1].infer(model_name, inputs)
         self.assertTrue(
             np.array_equal(self._small_in0, results.as_numpy(output_name)),
-            "output is different from input")
+            "output is different from input",
+        )
 
     def test_graphdef(self):
         # graphdef_nobatch_zero_1_float32 is identity model with input shape [-1]
         for client in self._clients:
-            model_name = tu.get_zero_model_name("graphdef_nobatch", 1,
-                                                self._data_type)
+            model_name = tu.get_zero_model_name("graphdef_nobatch", 1, self._data_type)
             self._test_helper(client, model_name)
 
     def test_savedmodel(self):
         # savedmodel_nobatch_zero_1_float32 is identity model with input shape [-1]
         for client in self._clients:
-            model_name = tu.get_zero_model_name("savedmodel_nobatch", 1,
-                                                self._data_type)
+            model_name = tu.get_zero_model_name(
+                "savedmodel_nobatch", 1, self._data_type
+            )
             self._test_helper(client, model_name)
 
     def test_onnx(self):
         # onnx_nobatch_zero_1_float32 is identity model with input shape [-1]
         for client in self._clients:
-            model_name = tu.get_zero_model_name("onnx_nobatch", 1,
-                                                self._data_type)
+            model_name = tu.get_zero_model_name("onnx_nobatch", 1, self._data_type)
             self._test_helper(client, model_name)
 
     def test_python(self):
         # python_nobatch_zero_1_float32 is identity model with input shape [-1]
         for client in self._clients:
-            model_name = tu.get_zero_model_name("python_nobatch", 1,
-                                                self._data_type)
+            model_name = tu.get_zero_model_name("python_nobatch", 1, self._data_type)
             self._test_helper(client, model_name)
 
     def test_plan(self):
         # plan_nobatch_zero_1_float32 is identity model with input shape [-1]
         for client in self._clients:
-            model_name = tu.get_zero_model_name("plan_nobatch", 1,
-                                                self._data_type)
+            model_name = tu.get_zero_model_name("plan_nobatch", 1, self._data_type)
             self._test_helper(client, model_name)
 
     def test_libtorch(self):
         # libtorch_nobatch_zero_1_float32 is identity model with input shape [-1]
         for client in self._clients:
-            model_name = tu.get_zero_model_name("libtorch_nobatch", 1,
-                                                self._data_type)
-            self._test_helper(client, model_name, 'INPUT__0', 'OUTPUT__0')
+            model_name = tu.get_zero_model_name("libtorch_nobatch", 1, self._data_type)
+            self._test_helper(client, model_name, "INPUT__0", "OUTPUT__0")
 
     def test_custom(self):
         # custom_zero_1_float32 is identity model with input shape [-1]
@@ -171,5 +179,5 @@ def test_custom(self):
             self._test_helper(client, model_name)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_large_payload/test.sh b/qa/L0_large_payload/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_libtorch_inference_mode/test.sh b/qa/L0_libtorch_inference_mode/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_libtorch_instance_group_kind_model/client.py b/qa/L0_libtorch_instance_group_kind_model/client.py
old mode 100644
new mode 100755
index c3c8289f8a..92bead3464
--- a/qa/L0_libtorch_instance_group_kind_model/client.py
+++ b/qa/L0_libtorch_instance_group_kind_model/client.py
@@ -31,32 +31,32 @@
 sys.path.append("../common")
 
 import unittest
+
 import numpy as np
 import test_util as tu
-
 import tritonclient.http as httpclient
 
 # By default, find tritonserver on "localhost", but can be overridden
 # with TRITONSERVER_IPADDR envvar
-_tritonserver_ipaddr = os.environ.get('TRITONSERVER_IPADDR', 'localhost')
+_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")
 
 
 class InferTest(tu.TestResultCollector):
-
     def test_infer(self):
         try:
             triton_client = httpclient.InferenceServerClient(
-                url=f"{_tritonserver_ipaddr}:8000")
+                url=f"{_tritonserver_ipaddr}:8000"
+            )
         except Exception as e:
             print("channel creation failed: " + str(e))
             sys.exit(1)
 
-        model_name = os.environ['MODEL_NAME']
+        model_name = os.environ["MODEL_NAME"]
 
         inputs = []
         outputs = []
-        inputs.append(httpclient.InferInput('INPUT0', [1, 16], "FP32"))
-        inputs.append(httpclient.InferInput('INPUT1', [1, 16], "FP32"))
+        inputs.append(httpclient.InferInput("INPUT0", [1, 16], "FP32"))
+        inputs.append(httpclient.InferInput("INPUT1", [1, 16], "FP32"))
 
         # Create the data for the two input tensors.
         input0_data = np.arange(start=0, stop=16, dtype=np.float32)
@@ -68,15 +68,13 @@ def test_infer(self):
         inputs[0].set_data_from_numpy(input0_data, binary_data=True)
         inputs[1].set_data_from_numpy(input1_data, binary_data=True)
 
-        outputs.append(
-            httpclient.InferRequestedOutput('OUTPUT__0', binary_data=True))
-        outputs.append(
-            httpclient.InferRequestedOutput('OUTPUT__1', binary_data=True))
+        outputs.append(httpclient.InferRequestedOutput("OUTPUT__0", binary_data=True))
+        outputs.append(httpclient.InferRequestedOutput("OUTPUT__1", binary_data=True))
 
         results = triton_client.infer(model_name, inputs, outputs=outputs)
 
-        output0_data = results.as_numpy('OUTPUT__0')
-        output1_data = results.as_numpy('OUTPUT__1')
+        output0_data = results.as_numpy("OUTPUT__0")
+        output1_data = results.as_numpy("OUTPUT__1")
 
         expected_output_0 = input0_data + input1_data
         expected_output_1 = input0_data - input1_data
@@ -88,5 +86,5 @@ def test_infer(self):
         self.assertTrue(np.all(expected_output_1 == output1_data))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_libtorch_instance_group_kind_model/gen_models.py b/qa/L0_libtorch_instance_group_kind_model/gen_models.py
index af8023e352..e61980f491 100755
--- a/qa/L0_libtorch_instance_group_kind_model/gen_models.py
+++ b/qa/L0_libtorch_instance_group_kind_model/gen_models.py
@@ -30,7 +30,6 @@
 
 
 class SumModule(nn.Module):
-
     def __init__(self, device):
         super(SumModule, self).__init__()
         self.device = device
@@ -38,13 +37,15 @@ def __init__(self, device):
     def forward(self, INPUT0, INPUT1):
         INPUT0 = INPUT0.to(self.device)
         INPUT1 = INPUT1.to(self.device)
-        print('SumModule - INPUT0 device: {}, INPUT1 device: {}\n'.format(
-            INPUT0.device, INPUT1.device))
+        print(
+            "SumModule - INPUT0 device: {}, INPUT1 device: {}\n".format(
+                INPUT0.device, INPUT1.device
+            )
+        )
         return INPUT0 + INPUT1
 
 
 class DiffModule(nn.Module):
-
     def __init__(self, device):
         super(DiffModule, self).__init__()
         self.device = device
@@ -52,13 +53,15 @@ def __init__(self, device):
     def forward(self, INPUT0, INPUT1):
         INPUT0 = INPUT0.to(self.device)
         INPUT1 = INPUT1.to(self.device)
-        print('DiffModule - INPUT0 device: {}, INPUT1 device: {}\n'.format(
-            INPUT0.device, INPUT1.device))
+        print(
+            "DiffModule - INPUT0 device: {}, INPUT1 device: {}\n".format(
+                INPUT0.device, INPUT1.device
+            )
+        )
         return INPUT0 - INPUT1
 
 
 class TestModel(nn.Module):
-
     def __init__(self, device0, device1):
         super(TestModel, self).__init__()
         self.device0 = device0
@@ -72,6 +75,7 @@ def forward(self, INPUT0, INPUT1):
         op1 = self.layer2(INPUT0, INPUT1)
         return op0, op1
 
+
 if torch.cuda.device_count() < 4:
     print("Need at least 4 GPUs to run this test")
     exit(1)
diff --git a/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_device/config.pbtxt b/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_device/config.pbtxt
old mode 100755
new mode 100644
diff --git a/qa/L0_libtorch_instance_group_kind_model/test.sh b/qa/L0_libtorch_instance_group_kind_model/test.sh
index 7dcb96d5d1..04d76bd036 100755
--- a/qa/L0_libtorch_instance_group_kind_model/test.sh
+++ b/qa/L0_libtorch_instance_group_kind_model/test.sh
@@ -63,9 +63,9 @@ cp models/libtorch_multi_device/config.pbtxt models/libtorch_multi_gpu/.
 (cd models/libtorch_multi_gpu && \
     sed -i "s/name: \"libtorch_multi_device\"/name: \"libtorch_multi_gpu\"/" config.pbtxt)
 
-# Generate the models which are partioned across multiple devices
+# Generate the models which are partitioned across multiple devices
 set +e
-python3 gen_models.py >> $CLIENT_LOG 2>&1 
+python3 gen_models.py >> $CLIENT_LOG 2>&1
 if [ $? -ne 0 ]; then
     echo -e "\n***\n*** Error when generating models. \n***"
     cat $CLIENT_LOG
@@ -83,7 +83,7 @@ fi
 set +e
 
 export MODEL_NAME='libtorch_multi_device'
-python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 
+python3 $CLIENT_PY >> $CLIENT_LOG 2>&1
 if [ $? -ne 0 ]; then
     echo -e "\n***\n*** Model $MODEL_NAME FAILED. \n***"
     cat $CLIENT_LOG
@@ -109,7 +109,7 @@ for MESSAGE in "${MESSAGES[@]}"; do
 done
 
 export MODEL_NAME='libtorch_multi_gpu'
-python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 
+python3 $CLIENT_PY >> $CLIENT_LOG 2>&1
 if [ $? -ne 0 ]; then
     echo -e "\n***\n*** Model $MODEL_NAME FAILED. \n***"
     cat $CLIENT_LOG
diff --git a/qa/L0_libtorch_io_names/io_names_client.py b/qa/L0_libtorch_io_names/io_names_client.py
old mode 100644
new mode 100755
index 15971356d9..b74e520de2
--- a/qa/L0_libtorch_io_names/io_names_client.py
+++ b/qa/L0_libtorch_io_names/io_names_client.py
@@ -29,19 +29,19 @@
 
 sys.path.append("../common")
 
-from builtins import range
 import unittest
-import test_util as tu
-import numpy as np
+from builtins import range
 
+import numpy as np
+import test_util as tu
 import tritonclient.http as httpclient
 
 
 class IONamingConvention(tu.TestResultCollector):
-
     def _infer_helper(self, model_name, io_names, reversed_order=False):
-        triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                         verbose=False)
+        triton_client = httpclient.InferenceServerClient(
+            "localhost:8000", verbose=False
+        )
 
         # Create the data for the two inputs. Initialize the first to unique
         # integers and the second to all ones.
@@ -53,30 +53,34 @@ def _infer_helper(self, model_name, io_names, reversed_order=False):
         output_req = []
         inputs.append(
             httpclient.InferInput(
-                io_names[0] if not reversed_order else io_names[1], [1, 16],
-                "FP32"))
+                io_names[0] if not reversed_order else io_names[1], [1, 16], "FP32"
+            )
+        )
         inputs[-1].set_data_from_numpy(input0_data)
         inputs.append(
             httpclient.InferInput(
-                io_names[1] if not reversed_order else io_names[0], [1, 16],
-                "FP32"))
+                io_names[1] if not reversed_order else io_names[0], [1, 16], "FP32"
+            )
+        )
         inputs[-1].set_data_from_numpy(input1_data)
         output_req.append(
-            httpclient.InferRequestedOutput(io_names[2], binary_data=True))
+            httpclient.InferRequestedOutput(io_names[2], binary_data=True)
+        )
         output_req.append(
-            httpclient.InferRequestedOutput(io_names[3], binary_data=True))
+            httpclient.InferRequestedOutput(io_names[3], binary_data=True)
+        )
 
         results = triton_client.infer(model_name, inputs, outputs=output_req)
 
         output0_data = results.as_numpy(
-            io_names[2] if not reversed_order else io_names[3])
+            io_names[2] if not reversed_order else io_names[3]
+        )
         output1_data = results.as_numpy(
-            io_names[3] if not reversed_order else io_names[2])
+            io_names[3] if not reversed_order else io_names[2]
+        )
         for i in range(16):
-            self.assertEqual(input0_data[0][i] - input1_data[0][i],
-                             output0_data[0][i])
-            self.assertEqual(input0_data[0][i] + input1_data[0][i],
-                             output1_data[0][i])
+            self.assertEqual(input0_data[0][i] - input1_data[0][i], output0_data[0][i])
+            self.assertEqual(input0_data[0][i] + input1_data[0][i], output1_data[0][i])
 
     def test_io_index(self):
         io_names = ["INPUT__0", "INPUT__1", "OUTPUT__0", "OUTPUT__1"]
@@ -108,10 +112,8 @@ def test_mix_arguments_index(self):
 
     def test_unordered_index(self):
         io_names = ["INPUT1", "INPUT0", "OUT__1", "OUT__0"]
-        self._infer_helper("libtorch_unordered_index",
-                           io_names,
-                           reversed_order=True)
+        self._infer_helper("libtorch_unordered_index", io_names, reversed_order=True)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_libtorch_io_names/test.sh b/qa/L0_libtorch_io_names/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_libtorch_nvfuser/test.sh b/qa/L0_libtorch_nvfuser/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_libtorch_optimized_execution/test.sh b/qa/L0_libtorch_optimized_execution/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_libtorch_shared_weights/libtorch_shared_weights_test.py b/qa/L0_libtorch_shared_weights/libtorch_shared_weights_test.py
old mode 100644
new mode 100755
index eeb5651afa..7c2fdb5a71
--- a/qa/L0_libtorch_shared_weights/libtorch_shared_weights_test.py
+++ b/qa/L0_libtorch_shared_weights/libtorch_shared_weights_test.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -28,31 +30,29 @@
 
 sys.path.append("../common")
 
-import numpy as np
 import unittest
 from builtins import range
-import tritonhttpclient as httpclient
+
+import numpy as np
 import test_util as tu
+import tritonhttpclient as httpclient
 
 FLAGS = None
 
 
 class SharedWeightsTest(tu.TestResultCollector):
-
     def _full_exact(self, model_name, request_concurrency, shape):
-
         # Run async requests to make sure backend handles concurrent requests
         # correctly.
         client = httpclient.InferenceServerClient(
-            "localhost:8000", concurrency=request_concurrency)
+            "localhost:8000", concurrency=request_concurrency
+        )
         input_datas = []
         requests = []
         for i in range(request_concurrency):
             input_data = (16384 * np.random.randn(*shape)).astype(np.float32)
             input_datas.append(input_data)
-            inputs = [
-                httpclient.InferInput("INPUT__0", input_data.shape, "FP32")
-            ]
+            inputs = [httpclient.InferInput("INPUT__0", input_data.shape, "FP32")]
             inputs[0].set_data_from_numpy(input_data)
             requests.append(client.async_infer(model_name, inputs))
 
@@ -62,8 +62,7 @@ def _full_exact(self, model_name, request_concurrency, shape):
             results = requests[i].get_result()
 
             output_data = results.as_numpy("OUTPUT__0")
-            self.assertIsNotNone(output_data,
-                                 "error: expected 'OUTPUT__0' to be found")
+            self.assertIsNotNone(output_data, "error: expected 'OUTPUT__0' to be found")
             np.testing.assert_allclose(output_data, input_datas[i])
 
     def test_pytorch_identity_model(self):
@@ -71,5 +70,5 @@ def test_pytorch_identity_model(self):
         self._full_exact(model_name, 128, [8])
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_libtorch_shared_weights/test.sh b/qa/L0_libtorch_shared_weights/test.sh
old mode 100644
new mode 100755
index e6f23b7a45..6ca251ce32
--- a/qa/L0_libtorch_shared_weights/test.sh
+++ b/qa/L0_libtorch_shared_weights/test.sh
@@ -1,4 +1,5 @@
-# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#!/bin/bash
+# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
diff --git a/qa/L0_lifecycle/lifecycle_test.py b/qa/L0_lifecycle/lifecycle_test.py
old mode 100644
new mode 100755
index 5ce079a77a..1caffb8f56
--- a/qa/L0_lifecycle/lifecycle_test.py
+++ b/qa/L0_lifecycle/lifecycle_test.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -28,93 +30,99 @@
 
 sys.path.append("../common")
 
-from builtins import range
-from functools import partial
+import concurrent.futures
 import os
 import shutil
 import signal
+import threading
 import time
 import unittest
-import numpy as np
+from builtins import range
+from functools import partial
+
 import infer_util as iu
+import numpy as np
 import test_util as tu
-import threading
-import concurrent.futures
-
 import tritonclient.grpc as grpcclient
 import tritonclient.http as httpclient
 from tritonclient.utils import InferenceServerException
 
 
 class LifeCycleTest(tu.TestResultCollector):
-
-    def _infer_success_models(self,
-                              model_base_names,
-                              versions,
-                              tensor_shape,
-                              swap=False):
+    def _infer_success_models(
+        self, model_base_names, versions, tensor_shape, swap=False
+    ):
         for base_name in model_base_names:
             try:
-                model_name = tu.get_model_name(base_name, np.float32,
-                                               np.float32, np.float32)
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                model_name = tu.get_model_name(
+                    base_name, np.float32, np.float32, np.float32
+                )
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     # FIXME is_server_ready should be true here DLIS-1296
                     # self.assertTrue(triton_client.is_server_ready())
                     for v in versions:
                         self.assertTrue(
-                            triton_client.is_model_ready(model_name, str(v)))
+                            triton_client.is_model_ready(model_name, str(v))
+                        )
 
                 for v in versions:
-                    iu.infer_exact(self,
-                                   base_name,
-                                   tensor_shape,
-                                   1,
-                                   np.float32,
-                                   np.float32,
-                                   np.float32,
-                                   model_version=v,
-                                   swap=(swap or (v != 1)))
+                    iu.infer_exact(
+                        self,
+                        base_name,
+                        tensor_shape,
+                        1,
+                        np.float32,
+                        np.float32,
+                        np.float32,
+                        model_version=v,
+                        swap=(swap or (v != 1)),
+                    )
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
-    def _infer_success_identity(self, model_base, versions, tensor_dtype,
-                                tensor_shape):
+    def _infer_success_identity(self, model_base, versions, tensor_dtype, tensor_shape):
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             self.assertTrue(triton_client.is_server_live())
             self.assertTrue(triton_client.is_server_ready())
             for v in versions:
                 self.assertTrue(
                     triton_client.is_model_ready(
-                        tu.get_zero_model_name(model_base, 1, tensor_dtype),
-                        str(v)))
+                        tu.get_zero_model_name(model_base, 1, tensor_dtype), str(v)
+                    )
+                )
 
             for v in versions:
-                iu.infer_zero(self,
-                              model_base,
-                              1,
-                              tensor_dtype,
-                              tensor_shape,
-                              tensor_shape,
-                              use_http=False,
-                              use_grpc=True,
-                              use_http_json_tensors=False,
-                              use_streaming=False)
+                iu.infer_zero(
+                    self,
+                    model_base,
+                    1,
+                    tensor_dtype,
+                    tensor_shape,
+                    tensor_shape,
+                    use_http=False,
+                    use_grpc=True,
+                    use_http_json_tensors=False,
+                    use_streaming=False,
+                )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
     def _get_client(self, use_grpc=False):
         if use_grpc:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
         else:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
         return triton_client
 
     def _async_load(self, model_name, use_grpc):
@@ -130,8 +138,9 @@ def test_parse_error_noexit(self):
         # SERVER_FAILED_TO_INITIALIZE status.
         # Server is not live and not ready regardless of --strict-readiness
         try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             self.assertFalse(triton_client.is_server_live())
             self.assertFalse(triton_client.is_server_ready())
             md = triton_client.get_server_metadata()
@@ -141,13 +150,14 @@ def test_parse_error_noexit(self):
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             self.assertFalse(triton_client.is_server_live())
             self.assertFalse(triton_client.is_server_ready())
             md = triton_client.get_server_metadata()
-            self.assertEqual(os.environ["TRITON_SERVER_VERSION"], md['version'])
-            self.assertEqual("triton", md['name'])
+            self.assertEqual(os.environ["TRITON_SERVER_VERSION"], md["version"])
+            self.assertEqual("triton", md["name"])
         except InferenceServerException as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -157,17 +167,20 @@ def test_parse_error_modelfail(self):
 
         # Server was started but with a model that fails to load
         try:
-            model_name = tu.get_model_name('graphdef', np.float32, np.float32,
-                                           np.float32)
+            model_name = tu.get_model_name(
+                "graphdef", np.float32, np.float32, np.float32
+            )
 
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             self.assertTrue(triton_client.is_server_live())
             self.assertFalse(triton_client.is_server_ready())
             self.assertFalse(triton_client.is_model_ready(model_name, "1"))
 
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             self.assertTrue(triton_client.is_server_live())
             self.assertFalse(triton_client.is_server_ready())
             self.assertFalse(triton_client.is_model_ready(model_name, "1"))
@@ -176,35 +189,38 @@ def test_parse_error_modelfail(self):
 
         # Inferencing with the missing model should fail.
         try:
-            iu.infer_exact(self, 'graphdef', tensor_shape, 1, np.float32,
-                           np.float32, np.float32)
-            self.assertTrue(
-                False, "expected error for unavailable model " + model_name)
+            iu.infer_exact(
+                self, "graphdef", tensor_shape, 1, np.float32, np.float32, np.float32
+            )
+            self.assertTrue(False, "expected error for unavailable model " + model_name)
         except Exception as ex:
             self.assertIn(
                 "Request for unknown model: 'graphdef_float32_float32_float32' has no available versions",
-                ex.message())
+                ex.message(),
+            )
 
         # And other models should be loaded successfully
         try:
-            for base_name in ['savedmodel', 'onnx']:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
-                    model_name = tu.get_model_name(base_name, np.float32,
-                                                   np.float32, np.float32)
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "1"))
-
-                iu.infer_exact(self,
-                               base_name,
-                               tensor_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               model_version=1)
+            for base_name in ["savedmodel", "onnx"]:
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
+                    model_name = tu.get_model_name(
+                        base_name, np.float32, np.float32, np.float32
+                    )
+                    self.assertTrue(triton_client.is_model_ready(model_name, "1"))
+
+                iu.infer_exact(
+                    self,
+                    base_name,
+                    tensor_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    model_version=1,
+                )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -214,17 +230,20 @@ def test_parse_error_modelfail_nostrict(self):
 
         # Server was started but with a model that fails to load
         try:
-            model_name = tu.get_model_name('graphdef', np.float32, np.float32,
-                                           np.float32)
+            model_name = tu.get_model_name(
+                "graphdef", np.float32, np.float32, np.float32
+            )
 
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             self.assertTrue(triton_client.is_server_live())
             self.assertTrue(triton_client.is_server_ready())
             self.assertFalse(triton_client.is_model_ready(model_name, "1"))
 
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             self.assertTrue(triton_client.is_server_live())
             self.assertTrue(triton_client.is_server_ready())
             self.assertFalse(triton_client.is_model_ready(model_name, "1"))
@@ -233,35 +252,38 @@ def test_parse_error_modelfail_nostrict(self):
 
         # Inferencing with the missing model should fail.
         try:
-            iu.infer_exact(self, 'graphdef', tensor_shape, 1, np.float32,
-                           np.float32, np.float32)
-            self.assertTrue(
-                False, "expected error for unavailable model " + model_name)
+            iu.infer_exact(
+                self, "graphdef", tensor_shape, 1, np.float32, np.float32, np.float32
+            )
+            self.assertTrue(False, "expected error for unavailable model " + model_name)
         except Exception as ex:
             self.assertIn(
                 "Request for unknown model: 'graphdef_float32_float32_float32' has no available versions",
-                ex.message())
+                ex.message(),
+            )
 
         # And other models should be loaded successfully
         try:
-            for base_name in ['savedmodel', 'onnx']:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
-                    model_name = tu.get_model_name(base_name, np.float32,
-                                                   np.float32, np.float32)
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "1"))
-
-                iu.infer_exact(self,
-                               base_name,
-                               tensor_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               model_version=1)
+            for base_name in ["savedmodel", "onnx"]:
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
+                    model_name = tu.get_model_name(
+                        base_name, np.float32, np.float32, np.float32
+                    )
+                    self.assertTrue(triton_client.is_model_ready(model_name, "1"))
+
+                iu.infer_exact(
+                    self,
+                    base_name,
+                    tensor_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    model_version=1,
+                )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -269,13 +291,14 @@ def test_parse_error_no_model_config(self):
         tensor_shape = (1, 16)
 
         # Server was started but with a model that fails to be polled
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
-                model_name = tu.get_model_name('graphdef', np.float32,
-                                               np.float32, np.float32)
+                model_name = tu.get_model_name(
+                    "graphdef", np.float32, np.float32, np.float32
+                )
 
                 # expecting ready because not strict readiness
                 self.assertTrue(triton_client.is_server_live())
@@ -283,29 +306,36 @@ def test_parse_error_no_model_config(self):
 
                 md = triton_client.get_model_metadata(model_name, "1")
                 self.assertTrue(
-                    False, "expected model '" + model_name +
-                    "' to be ignored due to polling failure")
+                    False,
+                    "expected model '"
+                    + model_name
+                    + "' to be ignored due to polling failure",
+                )
 
             except Exception as ex:
                 self.assertIn(
                     "Request for unknown model: 'graphdef_float32_float32_float32' is not found",
-                    ex.message())
+                    ex.message(),
+                )
 
         # And other models should be loaded successfully
         try:
-            for base_name in ['savedmodel', 'onnx']:
-                model_name = tu.get_model_name(base_name, np.float32,
-                                               np.float32, np.float32)
+            for base_name in ["savedmodel", "onnx"]:
+                model_name = tu.get_model_name(
+                    base_name, np.float32, np.float32, np.float32
+                )
                 self.assertTrue(triton_client.is_model_ready(model_name, "1"))
 
-                iu.infer_exact(self,
-                               base_name,
-                               tensor_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               model_version=1)
+                iu.infer_exact(
+                    self,
+                    base_name,
+                    tensor_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    model_version=1,
+                )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -313,10 +343,10 @@ def test_init_error_modelfail(self):
         # --strict-readiness=true so server is live but not ready
 
         # Server was started but with models that fail to load
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 self.assertTrue(triton_client.is_server_live())
                 self.assertFalse(triton_client.is_server_ready())
@@ -331,24 +361,27 @@ def test_init_error_modelfail(self):
 
             # And other models should be loaded successfully
             try:
-                for base_name in ['graphdef', 'savedmodel', 'onnx']:
-                    model_name = tu.get_model_name(base_name, np.float32,
-                                                   np.float32, np.float32)
+                for base_name in ["graphdef", "savedmodel", "onnx"]:
+                    model_name = tu.get_model_name(
+                        base_name, np.float32, np.float32, np.float32
+                    )
                     self.assertTrue(triton_client.is_model_ready(model_name))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         try:
             tensor_shape = (1, 16)
-            for base_name in ['graphdef', 'savedmodel', 'onnx']:
-                iu.infer_exact(self,
-                               base_name,
-                               tensor_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               model_version=1)
+            for base_name in ["graphdef", "savedmodel", "onnx"]:
+                iu.infer_exact(
+                    self,
+                    base_name,
+                    tensor_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    model_version=1,
+                )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -357,95 +390,105 @@ def test_parse_error_model_no_version(self):
         tensor_shape = (1, 16)
 
         # Server was started but with a model that fails to load
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 self.assertTrue(triton_client.is_server_live())
                 self.assertFalse(triton_client.is_server_ready())
 
-                model_name = tu.get_model_name('graphdef', np.float32,
-                                               np.float32, np.float32)
+                model_name = tu.get_model_name(
+                    "graphdef", np.float32, np.float32, np.float32
+                )
                 self.assertFalse(triton_client.is_model_ready(model_name))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
             # Sanity check that other models are loaded properly
             try:
-                for base_name in ['savedmodel', 'onnx']:
-                    model_name = tu.get_model_name(base_name, np.float32,
-                                                   np.float32, np.float32)
+                for base_name in ["savedmodel", "onnx"]:
+                    model_name = tu.get_model_name(
+                        base_name, np.float32, np.float32, np.float32
+                    )
                     self.assertTrue(triton_client.is_model_ready(model_name))
                 for version in ["1", "3"]:
-                    model_name = tu.get_model_name("plan", np.float32,
-                                                   np.float32, np.float32)
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, version))
+                    model_name = tu.get_model_name(
+                        "plan", np.float32, np.float32, np.float32
+                    )
+                    self.assertTrue(triton_client.is_model_ready(model_name, version))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         try:
-            for base_name in ['savedmodel', 'onnx']:
-                iu.infer_exact(self,
-                               base_name,
-                               tensor_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               swap=True)
+            for base_name in ["savedmodel", "onnx"]:
+                iu.infer_exact(
+                    self,
+                    base_name,
+                    tensor_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    swap=True,
+                )
             for version in [1, 3]:
-                iu.infer_exact(self,
-                               'plan',
-                               tensor_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               swap=(version == 3),
-                               model_version=version)
+                iu.infer_exact(
+                    self,
+                    "plan",
+                    tensor_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    swap=(version == 3),
+                    model_version=version,
+                )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         try:
-            iu.infer_exact(self, 'graphdef', tensor_shape, 1, np.float32,
-                           np.float32, np.float32)
-            self.assertTrue(
-                False, "expected error for unavailable model " + model_name)
+            iu.infer_exact(
+                self, "graphdef", tensor_shape, 1, np.float32, np.float32, np.float32
+            )
+            self.assertTrue(False, "expected error for unavailable model " + model_name)
         except Exception as ex:
             self.assertIn(
                 "Request for unknown model: 'graphdef_float32_float32_float32' has no available versions",
-                ex.message())
+                ex.message(),
+            )
 
     def test_parse_ignore_zero_prefixed_version(self):
         tensor_shape = (1, 16)
 
         # Server was started but only version 1 is loaded
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
 
-                model_name = tu.get_model_name('savedmodel', np.float32,
-                                               np.float32, np.float32)
+                model_name = tu.get_model_name(
+                    "savedmodel", np.float32, np.float32, np.float32
+                )
                 self.assertTrue(triton_client.is_model_ready(model_name, "1"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         try:
             # swap=False for version 1
-            iu.infer_exact(self,
-                           'savedmodel',
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           swap=False)
+            iu.infer_exact(
+                self,
+                "savedmodel",
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                swap=False,
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -453,53 +496,54 @@ def test_parse_ignore_non_intergral_version(self):
         tensor_shape = (1, 16)
 
         # Server was started but only version 1 is loaded
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
 
-                model_name = tu.get_model_name('savedmodel', np.float32,
-                                               np.float32, np.float32)
+                model_name = tu.get_model_name(
+                    "savedmodel", np.float32, np.float32, np.float32
+                )
                 self.assertTrue(triton_client.is_model_ready(model_name, "1"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         try:
             # swap=False for version 1
-            iu.infer_exact(self,
-                           'savedmodel',
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           swap=False)
+            iu.infer_exact(
+                self,
+                "savedmodel",
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                swap=False,
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
     def test_dynamic_model_load_unload(self):
         tensor_shape = (1, 16)
-        savedmodel_name = tu.get_model_name('savedmodel', np.float32,
-                                            np.float32, np.float32)
-        onnx_name = tu.get_model_name('onnx', np.float32, np.float32,
-                                      np.float32)
+        savedmodel_name = tu.get_model_name(
+            "savedmodel", np.float32, np.float32, np.float32
+        )
+        onnx_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32)
 
         # Make sure savedmodel model is not in the status (because
         # initially it is not in the model repository)
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "3"))
             except Exception as ex:
@@ -510,16 +554,14 @@ def test_dynamic_model_load_unload(self):
         try:
             shutil.copytree(savedmodel_name, "models/" + savedmodel_name)
             time.sleep(5)  # wait for model to load
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertTrue(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertTrue(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertTrue(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(savedmodel_name, "3"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "3"))
         except Exception as ex:
@@ -527,47 +569,58 @@ def test_dynamic_model_load_unload(self):
 
         # Run inference on the just loaded model
         try:
-            iu.infer_exact(self,
-                           'savedmodel',
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           swap=True)
+            iu.infer_exact(
+                self,
+                "savedmodel",
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                swap=True,
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Make sure savedmodel has execution stats
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             stats = triton_client.get_inference_statistics(savedmodel_name)
             self.assertEqual(len(stats["model_stats"]), 2)
             for idx in range(len(stats["model_stats"])):
-                self.assertEqual(stats["model_stats"][idx]["name"],
-                                 savedmodel_name)
+                self.assertEqual(stats["model_stats"][idx]["name"], savedmodel_name)
                 if stats["model_stats"][idx]["version"] == "1":
                     self.assertEqual(
-                        stats["model_stats"][idx]["inference_stats"]["success"]
-                        ["count"], 0)
+                        stats["model_stats"][idx]["inference_stats"]["success"][
+                            "count"
+                        ],
+                        0,
+                    )
                 else:
                     self.assertNotEqual(
-                        stats["model_stats"][idx]["inference_stats"]["success"]
-                        ["count"], 0)
-
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+                        stats["model_stats"][idx]["inference_stats"]["success"][
+                            "count"
+                        ],
+                        0,
+                    )
+
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             stats = triton_client.get_inference_statistics(savedmodel_name)
             self.assertEqual(len(stats.model_stats), 2)
             for idx in range(len(stats.model_stats)):
                 self.assertEqual(stats.model_stats[idx].name, savedmodel_name)
                 if stats.model_stats[idx].version == "1":
                     self.assertEqual(
-                        stats.model_stats[idx].inference_stats.success.count, 0)
+                        stats.model_stats[idx].inference_stats.success.count, 0
+                    )
                 else:
                     self.assertNotEqual(
-                        stats.model_stats[idx].inference_stats.success.count, 0)
+                        stats.model_stats[idx].inference_stats.success.count, 0
+                    )
 
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
@@ -577,16 +630,14 @@ def test_dynamic_model_load_unload(self):
         try:
             shutil.rmtree("models/" + savedmodel_name)
             time.sleep(5)  # wait for model to unload
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "3"))
         except Exception as ex:
@@ -594,62 +645,65 @@ def test_dynamic_model_load_unload(self):
 
         # Model is removed so inference should fail
         try:
-            iu.infer_exact(self,
-                           'savedmodel',
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           swap=True)
+            iu.infer_exact(
+                self,
+                "savedmodel",
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                swap=True,
+            )
             self.assertTrue(
-                False,
-                "expected error for unavailable model " + savedmodel_name)
+                False, "expected error for unavailable model " + savedmodel_name
+            )
         except Exception as ex:
             self.assertIn(
-                "Request for unknown model: '{}' has no available versions".
-                format(savedmodel_name), ex.message())
+                "Request for unknown model: '{}' has no available versions".format(
+                    savedmodel_name
+                ),
+                ex.message(),
+            )
 
         # Add back the same model. The status/stats should be reset.
         try:
             shutil.copytree(savedmodel_name, "models/" + savedmodel_name)
             time.sleep(5)  # wait for model to load
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertTrue(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertTrue(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertTrue(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(savedmodel_name, "3"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "3"))
 
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             stats = triton_client.get_inference_statistics(savedmodel_name)
             self.assertEqual(len(stats["model_stats"]), 2)
             self.assertEqual(stats["model_stats"][0]["name"], savedmodel_name)
             self.assertEqual(stats["model_stats"][1]["name"], savedmodel_name)
             self.assertEqual(
-                stats["model_stats"][0]["inference_stats"]["success"]["count"],
-                0)
+                stats["model_stats"][0]["inference_stats"]["success"]["count"], 0
+            )
             self.assertEqual(
-                stats["model_stats"][1]["inference_stats"]["success"]["count"],
-                0)
+                stats["model_stats"][1]["inference_stats"]["success"]["count"], 0
+            )
 
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             stats = triton_client.get_inference_statistics(savedmodel_name)
             self.assertEqual(len(stats.model_stats), 2)
             self.assertEqual(stats.model_stats[0].name, savedmodel_name)
             self.assertEqual(stats.model_stats[1].name, savedmodel_name)
-            self.assertEqual(stats.model_stats[0].inference_stats.success.count,
-                             0)
-            self.assertEqual(stats.model_stats[1].inference_stats.success.count,
-                             0)
+            self.assertEqual(stats.model_stats[0].inference_stats.success.count, 0)
+            self.assertEqual(stats.model_stats[1].inference_stats.success.count, 0)
 
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
@@ -659,16 +713,14 @@ def test_dynamic_model_load_unload(self):
         try:
             shutil.rmtree("models/" + onnx_name)
             time.sleep(5)  # wait for model to unload
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertTrue(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertTrue(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertTrue(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(savedmodel_name, "3"))
                 self.assertFalse(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertFalse(triton_client.is_model_ready(onnx_name, "3"))
         except Exception as ex:
@@ -676,41 +728,41 @@ def test_dynamic_model_load_unload(self):
 
         # Model is removed so inference should fail
         try:
-            iu.infer_exact(self,
-                           'onnx',
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           swap=True)
-            self.assertTrue(False,
-                            "expected error for unavailable model " + onnx_name)
+            iu.infer_exact(
+                self,
+                "onnx",
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                swap=True,
+            )
+            self.assertTrue(False, "expected error for unavailable model " + onnx_name)
         except Exception as ex:
             self.assertIn(
                 "Request for unknown model: 'onnx_float32_float32_float32' has no available versions",
-                ex.message())
+                ex.message(),
+            )
 
     def test_dynamic_model_load_unload_disabled(self):
         tensor_shape = (1, 16)
-        savedmodel_name = tu.get_model_name('savedmodel', np.float32,
-                                            np.float32, np.float32)
-        onnx_name = tu.get_model_name('onnx', np.float32, np.float32,
-                                      np.float32)
+        savedmodel_name = tu.get_model_name(
+            "savedmodel", np.float32, np.float32, np.float32
+        )
+        onnx_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32)
 
         # Make sure savedmodel model is not in the status (because
         # initially it is not in the model repository)
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "3"))
             except Exception as ex:
@@ -721,16 +773,14 @@ def test_dynamic_model_load_unload_disabled(self):
         try:
             shutil.copytree(savedmodel_name, "models/" + savedmodel_name)
             time.sleep(5)  # wait for model to load
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "3"))
         except Exception as ex:
@@ -738,37 +788,38 @@ def test_dynamic_model_load_unload_disabled(self):
 
         # Run inference which should fail because the model isn't there
         try:
-            iu.infer_exact(self,
-                           'savedmodel',
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           swap=True)
+            iu.infer_exact(
+                self,
+                "savedmodel",
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                swap=True,
+            )
             self.assertTrue(
-                False,
-                "expected error for unavailable model " + savedmodel_name)
+                False, "expected error for unavailable model " + savedmodel_name
+            )
         except Exception as ex:
             self.assertIn(
                 "Request for unknown model: 'savedmodel_float32_float32_float32' is not found",
-                ex.message())
+                ex.message(),
+            )
 
         # Remove one of the original models from the model repository.
         # Unloading is disabled so it should remain available in the status.
         try:
             shutil.rmtree("models/" + onnx_name)
             time.sleep(5)  # wait for model to unload (but it shouldn't)
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "3"))
         except Exception as ex:
@@ -777,84 +828,93 @@ def test_dynamic_model_load_unload_disabled(self):
         # Run inference to make sure model still being served even
         # though deleted from model repository
         try:
-            iu.infer_exact(self,
-                           'onnx',
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           swap=True)
+            iu.infer_exact(
+                self,
+                "onnx",
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                swap=True,
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
     def test_dynamic_version_load_unload(self):
         tensor_shape = (1, 16)
-        graphdef_name = tu.get_model_name('graphdef', np.int32, np.int32,
-                                          np.int32)
+        graphdef_name = tu.get_model_name("graphdef", np.int32, np.int32, np.int32)
 
         # There are 3 versions. Make sure that all have status and are
         # ready.
         try:
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "1"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "2"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "3"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "2"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "3"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Run inference on version 1 to make sure it is available
         try:
-            iu.infer_exact(self,
-                           'graphdef',
-                           tensor_shape,
-                           1,
-                           np.int32,
-                           np.int32,
-                           np.int32,
-                           swap=False,
-                           model_version=1)
+            iu.infer_exact(
+                self,
+                "graphdef",
+                tensor_shape,
+                1,
+                np.int32,
+                np.int32,
+                np.int32,
+                swap=False,
+                model_version=1,
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Make sure only version 1 has execution stats in the status.
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             stats = triton_client.get_inference_statistics(graphdef_name)
             self.assertEqual(len(stats["model_stats"]), 3)
             for idx in range(len(stats["model_stats"])):
-                self.assertEqual(stats["model_stats"][idx]["name"],
-                                 graphdef_name)
+                self.assertEqual(stats["model_stats"][idx]["name"], graphdef_name)
                 if stats["model_stats"][idx]["version"] == "1":
                     self.assertNotEqual(
-                        stats["model_stats"][idx]["inference_stats"]["success"]
-                        ["count"], 0)
+                        stats["model_stats"][idx]["inference_stats"]["success"][
+                            "count"
+                        ],
+                        0,
+                    )
                 else:
                     self.assertEqual(
-                        stats["model_stats"][idx]["inference_stats"]["success"]
-                        ["count"], 0)
-
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+                        stats["model_stats"][idx]["inference_stats"]["success"][
+                            "count"
+                        ],
+                        0,
+                    )
+
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             stats = triton_client.get_inference_statistics(graphdef_name)
             self.assertEqual(len(stats.model_stats), 3)
             for idx in range(len(stats.model_stats)):
                 self.assertEqual(stats.model_stats[idx].name, graphdef_name)
                 if stats.model_stats[idx].version == "1":
                     self.assertNotEqual(
-                        stats.model_stats[idx].inference_stats.success.count, 0)
+                        stats.model_stats[idx].inference_stats.success.count, 0
+                    )
                 else:
                     self.assertEqual(
-                        stats.model_stats[idx].inference_stats.success.count, 0)
+                        stats.model_stats[idx].inference_stats.success.count, 0
+                    )
 
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
@@ -864,87 +924,81 @@ def test_dynamic_version_load_unload(self):
         try:
             shutil.rmtree("models/" + graphdef_name + "/1")
             time.sleep(5)  # wait for version to unload
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
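+                # Readiness is checked over both the HTTP and GRPC endpoints;
+                # only version 1 should have dropped out after its directory
+                # was removed above.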
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(graphdef_name, "1"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "2"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(graphdef_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "2"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "3"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Version is removed so inference should fail
         try:
-            iu.infer_exact(self,
-                           'graphdef',
-                           tensor_shape,
-                           1,
-                           np.int32,
-                           np.int32,
-                           np.int32,
-                           swap=False,
-                           model_version=1)
+            iu.infer_exact(
+                self,
+                "graphdef",
+                tensor_shape,
+                1,
+                np.int32,
+                np.int32,
+                np.int32,
+                swap=False,
+                model_version=1,
+            )
             self.assertTrue(
-                False, "expected error for unavailable model " + graphdef_name)
+                False, "expected error for unavailable model " + graphdef_name
+            )
         except Exception as ex:
             self.assertIn(
                 "Request for unknown model: 'graphdef_int32_int32_int32' version 1 is not at ready state",
-                ex.message())
+                ex.message(),
+            )
 
         # Add another version to the model repository.
         try:
-            shutil.copytree("models/" + graphdef_name + "/2",
-                            "models/" + graphdef_name + "/7")
+            shutil.copytree(
+                "models/" + graphdef_name + "/2", "models/" + graphdef_name + "/7"
+            )
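+            # Version 7 is a copy of version 2, so once the repository is
+            # polled it should become ready alongside versions 2 and 3.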
             time.sleep(5)  # wait for version to load
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(graphdef_name, "1"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "2"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "3"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "7"))
+                self.assertFalse(triton_client.is_model_ready(graphdef_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "2"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "3"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "7"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
     def test_dynamic_version_load_unload_disabled(self):
         tensor_shape = (1, 16)
-        graphdef_name = tu.get_model_name('graphdef', np.int32, np.int32,
-                                          np.int32)
+        graphdef_name = tu.get_model_name("graphdef", np.int32, np.int32, np.int32)
 
         # Add a new version to the model repository and give it time to
         # load. But it shouldn't load because dynamic loading is
         # disabled.
         try:
-            shutil.copytree("models/" + graphdef_name + "/2",
-                            "models/" + graphdef_name + "/7")
+            shutil.copytree(
+                "models/" + graphdef_name + "/2", "models/" + graphdef_name + "/7"
+            )
             time.sleep(5)  # wait for model to load
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "1"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "2"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "3"))
-                self.assertFalse(
-                    triton_client.is_model_ready(graphdef_name, "7"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "2"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(graphdef_name, "7"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -954,59 +1008,54 @@ def test_dynamic_version_load_unload_disabled(self):
         try:
             shutil.rmtree("models/" + graphdef_name + "/1")
             time.sleep(5)  # wait for version to unload (but it shouldn't)
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "1"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "2"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "3"))
-                self.assertFalse(
-                    triton_client.is_model_ready(graphdef_name, "7"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "2"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(graphdef_name, "7"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Run inference to make sure the model is still being served even
         # though the version was deleted from the model repository
         try:
-            iu.infer_exact(self,
-                           'graphdef',
-                           tensor_shape,
-                           1,
-                           np.int32,
-                           np.int32,
-                           np.int32,
-                           swap=False,
-                           model_version=1)
+            iu.infer_exact(
+                self,
+                "graphdef",
+                tensor_shape,
+                1,
+                np.int32,
+                np.int32,
+                np.int32,
+                swap=False,
+                model_version=1,
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
     def test_dynamic_model_modify(self):
-        models_base = ('savedmodel', 'plan')
+        models_base = ("savedmodel", "plan")
         models_shape = ((1, 16), (1, 16))
         models = list()
         for m in models_base:
-            models.append(
-                tu.get_model_name(m, np.float32, np.float32, np.float32))
+            models.append(tu.get_model_name(m, np.float32, np.float32, np.float32))
 
         # Make sure savedmodel and plan are in the status
         for model_name in models:
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertTrue(triton_client.is_model_ready(model_name, "1"))
+                    self.assertTrue(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -1014,63 +1063,67 @@ def test_dynamic_model_modify(self):
         for version in (1, 3):
             for model_name, model_shape in zip(models_base, models_shape):
                 try:
-                    iu.infer_exact(self,
-                                   model_name,
-                                   model_shape,
-                                   1,
-                                   np.float32,
-                                   np.float32,
-                                   np.float32,
-                                   swap=(version == 3),
-                                   model_version=version)
+                    iu.infer_exact(
+                        self,
+                        model_name,
+                        model_shape,
+                        1,
+                        np.float32,
+                        np.float32,
+                        np.float32,
+                        swap=(version == 3),
+                        model_version=version,
+                    )
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Change the model configuration to use wrong label file
         for base_name, model_name in zip(models_base, models):
-            shutil.copyfile("config.pbtxt.wrong." + base_name,
-                            "models/" + model_name + "/config.pbtxt")
+            shutil.copyfile(
+                "config.pbtxt.wrong." + base_name,
+                "models/" + model_name + "/config.pbtxt",
+            )
 
         time.sleep(5)  # wait for models to reload
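+        # With the wrong label file in place, requesting classification output
+        # (output0_raw=False) should surface a label mismatch for each model.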
         for model_name in models:
             for model_name, model_shape in zip(models_base, models_shape):
                 try:
-                    iu.infer_exact(self,
-                                   model_name,
-                                   model_shape,
-                                   1,
-                                   np.float32,
-                                   np.float32,
-                                   np.float32,
-                                   swap=(version == 3),
-                                   model_version=version,
-                                   output0_raw=False)
+                    iu.infer_exact(
+                        self,
+                        model_name,
+                        model_shape,
+                        1,
+                        np.float32,
+                        np.float32,
+                        np.float32,
+                        swap=(version == 3),
+                        model_version=version,
+                        output0_raw=False,
+                    )
                     self.assertTrue(
-                        False,
-                        "expected error for wrong label for " + model_name)
+                        False, "expected error for wrong label for " + model_name
+                    )
                 except AssertionError as ex:
-                    self.assertTrue("'label9" in str(ex) and "!=" in str(ex),
-                                    str(ex))
+                    self.assertTrue("'label9" in str(ex) and "!=" in str(ex), str(ex))
 
         # Change the model configuration to use the correct label file and to
         # have the default version policy (so that only version 3 is available).
         for base_name, model_name in zip(models_base, models):
-            shutil.copyfile("config.pbtxt." + base_name,
-                            "models/" + model_name + "/config.pbtxt")
+            shutil.copyfile(
+                "config.pbtxt." + base_name, "models/" + model_name + "/config.pbtxt"
+            )
 
         time.sleep(5)  # wait for models to reload
         for model_name in models:
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertTrue(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -1078,56 +1131,58 @@ def test_dynamic_model_modify(self):
         # change in model policy makes that no longer available.
         for model_name, model_shape in zip(models_base, models_shape):
             try:
-                iu.infer_exact(self,
-                               model_name,
-                               model_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               swap=False,
-                               model_version=1)
+                iu.infer_exact(
+                    self,
+                    model_name,
+                    model_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    swap=False,
+                    model_version=1,
+                )
                 self.assertTrue(
-                    False, "expected error for unavailable model " + model_name)
+                    False, "expected error for unavailable model " + model_name
+                )
             except Exception as ex:
                 self.assertIn("Request for unknown model", ex.message())
 
         # Version 3 should continue to work...
         for model_name, model_shape in zip(models_base, models_shape):
             try:
-                iu.infer_exact(self,
-                               model_name,
-                               model_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               swap=True,
-                               model_version=3)
+                iu.infer_exact(
+                    self,
+                    model_name,
+                    model_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    swap=True,
+                    model_version=3,
+                )
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
     def test_dynamic_file_delete(self):
-        models_base = ('savedmodel', 'plan')
+        models_base = ("savedmodel", "plan")
         models_shape = ((1, 16), (1, 16))
         models = list()
         for m in models_base:
-            models.append(
-                tu.get_model_name(m, np.float32, np.float32, np.float32))
+            models.append(tu.get_model_name(m, np.float32, np.float32, np.float32))
 
         # Make sure savedmodel and plan are in the status
         for model_name in models:
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertTrue(triton_client.is_model_ready(model_name, "1"))
+                    self.assertTrue(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -1135,15 +1190,17 @@ def test_dynamic_file_delete(self):
         for version in (1, 3):
             for model_name, model_shape in zip(models_base, models_shape):
                 try:
-                    iu.infer_exact(self,
-                                   model_name,
-                                   model_shape,
-                                   1,
-                                   np.float32,
-                                   np.float32,
-                                   np.float32,
-                                   swap=(version == 3),
-                                   model_version=version)
+                    iu.infer_exact(
+                        self,
+                        model_name,
+                        model_shape,
+                        1,
+                        np.float32,
+                        np.float32,
+                        np.float32,
+                        swap=(version == 3),
+                        model_version=version,
+                    )
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -1157,81 +1214,86 @@ def test_dynamic_file_delete(self):
         time.sleep(5)  # wait for models to reload
         for model_name in models:
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertTrue(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Only version 3 (latest) should work...
         for model_name, model_shape in zip(models_base, models_shape):
             try:
-                iu.infer_exact(self,
-                               model_name,
-                               model_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               swap=True,
-                               model_version=3)
+                iu.infer_exact(
+                    self,
+                    model_name,
+                    model_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    swap=True,
+                    model_version=3,
+                )
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
             try:
-                iu.infer_exact(self,
-                               model_name,
-                               model_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               swap=False,
-                               model_version=1)
+                iu.infer_exact(
+                    self,
+                    model_name,
+                    model_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    swap=False,
+                    model_version=1,
+                )
                 self.assertTrue(
-                    False,
-                    "expected error for unavailable model " + graphdef_name)
+                    False, "expected error for unavailable model " + model_name
+                )
             except Exception as ex:
                 self.assertIn("Request for unknown model", ex.message())
 
     def test_multiple_model_repository_polling(self):
         model_shape = (1, 16)
-        savedmodel_name = tu.get_model_name('savedmodel', np.float32,
-                                            np.float32, np.float32)
+        savedmodel_name = tu.get_model_name(
+            "savedmodel", np.float32, np.float32, np.float32
+        )
 
         # Models should be loaded successfully and infer
         # successfully. Initially savedmodel only has version 1.
-        self._infer_success_models([
-            'savedmodel',
-        ], (1,), model_shape)
-        self._infer_success_models(['graphdef', 'onnx'], (1, 3), model_shape)
+        self._infer_success_models(
+            [
+                "savedmodel",
+            ],
+            (1,),
+            model_shape,
+        )
+        self._infer_success_models(["graphdef", "onnx"], (1, 3), model_shape)
 
         # Add the savedmodel to the second model repository, which should
         # cause it to be unloaded due to duplication
         shutil.copytree(savedmodel_name, "models_0/" + savedmodel_name)
         time.sleep(5)  # wait for models to reload
         try:
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
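+                # In polling mode the duplicate causes the savedmodel to be
+                # unloaded entirely, so neither version should be ready.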
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models(['graphdef', 'onnx'], (1, 3), model_shape)
+        self._infer_success_models(["graphdef", "onnx"], (1, 3), model_shape)
 
         # Remove the savedmodel from the first model repository, the
         # model from the second model repository should be loaded
@@ -1239,91 +1301,96 @@ def test_multiple_model_repository_polling(self):
         # have versions 1 and 3.
         shutil.rmtree("models/" + savedmodel_name)
         time.sleep(5)  # wait for model to unload
-        self._infer_success_models(['savedmodel', 'graphdef', 'onnx'], (1, 3),
-                                   model_shape)
+        self._infer_success_models(
+            ["savedmodel", "graphdef", "onnx"], (1, 3), model_shape
+        )
 
     def test_multiple_model_repository_control(self):
         # similar to test_multiple_model_repository_polling, but the
         # model load/unload is controlled by the API
         model_shape = (1, 16)
-        savedmodel_name = tu.get_model_name('savedmodel', np.float32,
-                                            np.float32, np.float32)
-        model_bases = ['savedmodel', 'graphdef', 'onnx']
+        savedmodel_name = tu.get_model_name(
+            "savedmodel", np.float32, np.float32, np.float32
+        )
+        model_bases = ["savedmodel", "graphdef", "onnx"]
 
         # Initially models are not loaded
         for base in model_bases:
             try:
-                model_name = tu.get_model_name(base, np.float32, np.float32,
-                                               np.float32)
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                model_name = tu.get_model_name(base, np.float32, np.float32, np.float32)
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Load all models, here we use GRPC
         for base in model_bases:
             try:
-                model_name = tu.get_model_name(base, np.float32, np.float32,
-                                               np.float32)
+                model_name = tu.get_model_name(base, np.float32, np.float32, np.float32)
                 triton_client = grpcclient.InferenceServerClient(
-                    "localhost:8001", verbose=True)
+                    "localhost:8001", verbose=True
+                )
                 triton_client.load_model(model_name)
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Models should be loaded successfully and infer
         # successfully. Initially savedmodel only has version 1.
-        self._infer_success_models([
-            'savedmodel',
-        ], (1,), model_shape)
-        self._infer_success_models(['graphdef', 'onnx'], (1, 3), model_shape)
+        self._infer_success_models(
+            [
+                "savedmodel",
+            ],
+            (1,),
+            model_shape,
+        )
+        self._infer_success_models(["graphdef", "onnx"], (1, 3), model_shape)
 
         # Add the savedmodel to the second model repository. Because polling
         # is not enabled, this doesn't change any model state; all models
         # are still loaded and available.
         shutil.copytree(savedmodel_name, "models_0/" + savedmodel_name)
-        self._infer_success_models([
-            'savedmodel',
-        ], (1,), model_shape)
-        self._infer_success_models(['graphdef', 'onnx'], (1, 3), model_shape)
+        self._infer_success_models(
+            [
+                "savedmodel",
+            ],
+            (1,),
+            model_shape,
+        )
+        self._infer_success_models(["graphdef", "onnx"], (1, 3), model_shape)
 
         # Load savedmodel again which should fail because it is now duplicated
         # in 2 model repositories. Use HTTP here.
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.load_model(savedmodel_name)
         except Exception as ex:
-            self.assertIn("failed to load '{}'".format(savedmodel_name),
-                          ex.message())
+            self.assertIn("failed to load '{}'".format(savedmodel_name), ex.message())
 
         try:
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
                 # Unlike polling mode, the failed load on the duplicate model
                 # should NOT unload the existing versions in model control mode.
-                self.assertTrue(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(savedmodel_name, "1"))
                 # Version 3 did not exist in the first model repository, so
                 # it should still not be loaded.
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models(['graphdef', 'onnx'], (1, 3), model_shape)
+        self._infer_success_models(["graphdef", "onnx"], (1, 3), model_shape)
 
         # Remove the savedmodel from the first model repository and
         # explicitly load savedmodel. The savedmodel from the second
@@ -1331,23 +1398,23 @@ def test_multiple_model_repository_control(self):
         # model repository savedmodel should have versions 1 and 3.
         shutil.rmtree("models/" + savedmodel_name)
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             # Unload existing in-memory model from first model repository
             triton_client.unload_model(savedmodel_name)
             # Load model from second model repository since original was deleted
             triton_client.load_model(savedmodel_name)
         except Exception as ex:
-            self.assertIn("failed to load '{}'".format(savedmodel_name),
-                          ex.message())
+            self.assertIn("failed to load '{}'".format(savedmodel_name), ex.message())
 
-        self._infer_success_models(['savedmodel', 'graphdef', 'onnx'], (1, 3),
-                                   model_shape)
+        self._infer_success_models(
+            ["savedmodel", "graphdef", "onnx"], (1, 3), model_shape
+        )
 
     def test_model_control(self):
         model_shape = (1, 16)
-        onnx_name = tu.get_model_name('onnx', np.float32, np.float32,
-                                      np.float32)
+        onnx_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32)
 
         ensemble_prefix = "simple_"
         ensemble_name = ensemble_prefix + onnx_name
@@ -1355,48 +1422,55 @@ def test_model_control(self):
         # Make sure no models are loaded
         for model_name in (onnx_name, ensemble_name):
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Load non-existent model
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 triton_client.load_model("unknown_model")
                 self.assertTrue(False, "expected unknown model failure")
             except Exception as ex:
                 self.assertIn(
                     "failed to load 'unknown_model', failed to poll from model repository",
-                    ex.message())
+                    ex.message(),
+                )
 
         # Load ensemble model, the dependent model should be polled and loaded
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.load_model(ensemble_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
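+        # Loading the ensemble implicitly pulls in its composing onnx model,
+        # so both the ensemble and the plain onnx model should now serve.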
 
-        self._infer_success_models([
-            "onnx",
-        ], (1, 3), model_shape)
-        self._infer_success_models([
-            "simple_onnx",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (1, 3),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_onnx",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
 
         # Delete model configuration for onnx, which will cause
         # the autofiller to use the latest version policy so that only
@@ -1404,51 +1478,65 @@ def test_model_control(self):
         for model_name in (onnx_name,):
             os.remove("models/" + model_name + "/config.pbtxt")
 
-        self._infer_success_models([
-            "onnx",
-        ], (1, 3), model_shape)
-        self._infer_success_models([
-            "simple_onnx",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (1, 3),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_onnx",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
 
         # Reload models, only version 3 should be available for onnx
         for model_name in (onnx_name, ensemble_name):
             try:
                 triton_client = grpcclient.InferenceServerClient(
-                    "localhost:8001", verbose=True)
+                    "localhost:8001", verbose=True
+                )
                 triton_client.load_model(model_name)
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models([
-            "onnx",
-        ], (3,), model_shape)
-        self._infer_success_models([
-            "simple_onnx",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (3,),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_onnx",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
 
         for model_name in (onnx_name,):
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Unload a non-existent model; nothing should happen
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 triton_client.unload_model("unknown_model")
             except Exception as ex:
@@ -1457,24 +1545,23 @@ def test_model_control(self):
         # Unload the model that the ensemble depends on; as a side effect,
         # the ensemble model will be forced to be unloaded
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.unload_model(onnx_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         for model_name in (onnx_name, ensemble_name):
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -1482,41 +1569,43 @@ def test_model_control(self):
         # model. The ensemble model should not be reloaded because it
         # was explicitly unloaded.
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.unload_model(ensemble_name)
             triton_client.load_model(onnx_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models([
-            "onnx",
-        ], (3,), model_shape)
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (3,),
+            model_shape,
+        )
 
         try:
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(ensemble_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(ensemble_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(ensemble_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(ensemble_name, "3"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
     def test_model_control_fail(self):
-        model_name = tu.get_model_name('onnx', np.float32, np.float32,
-                                       np.float32)
+        model_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32)
 
         # Make sure no models are loaded
         try:
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
                 self.assertFalse(triton_client.is_model_ready(model_name, "1"))
@@ -1526,28 +1615,27 @@ def test_model_control_fail(self):
 
         # Request to load the model and expect the load to fail
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.load_model(model_name)
             self.assertTrue(False, "expecting load failure")
         except InferenceServerException as ex:
-            self.assertIn("load failed for model '{}'".format(model_name),
-                          ex.message())
+            self.assertIn("load failed for model '{}'".format(model_name), ex.message())
 
         # Another attempt should fail as well
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.load_model(model_name)
             self.assertTrue(False, "expecting load failure")
         except InferenceServerException as ex:
-            self.assertIn("load failed for model '{}'".format(model_name),
-                          ex.message())
+            self.assertIn("load failed for model '{}'".format(model_name), ex.message())
 
     def test_model_control_ensemble(self):
         model_shape = (1, 16)
-        onnx_name = tu.get_model_name('onnx', np.float32, np.float32,
-                                      np.float32)
+        onnx_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32)
 
         ensemble_prefix = "simple_"
         ensemble_name = ensemble_prefix + onnx_name
@@ -1555,83 +1643,91 @@ def test_model_control_ensemble(self):
         # Make sure no models are loaded
         for model_name in (onnx_name, ensemble_name):
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Load ensemble model, the dependent model should be polled and loaded
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.load_model(ensemble_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models([
-            "onnx",
-        ], (1, 3), model_shape)
-        self._infer_success_models([
-            "simple_onnx",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (1, 3),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_onnx",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
 
         # Unload the ensemble with the unload_dependents flag. All models should be unloaded
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.unload_model(ensemble_name, unload_dependents=True)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
         for model_name in (onnx_name, ensemble_name):
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Load ensemble model, and unload it without unload_dependents flag (default).
         # The dependent model should still be available
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.load_model(ensemble_name)
             triton_client.unload_model(ensemble_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models([
-            "onnx",
-        ], (1, 3), model_shape)
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (1, 3),
+            model_shape,
+        )
 
         try:
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(ensemble_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(ensemble_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(ensemble_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(ensemble_name, "3"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "3"))
         except Exception as ex:
@@ -1639,8 +1735,7 @@ def test_model_control_ensemble(self):
 
     def test_load_same_model_different_platform(self):
         model_shape = (1, 16)
-        model_name = tu.get_model_name('simple', np.float32, np.float32,
-                                       np.float32)
+        model_name = tu.get_model_name("simple", np.float32, np.float32, np.float32)
 
         # Check whether or not to use grpc protocol
         use_grpc = "TRITONSERVER_USE_GRPC" in os.environ
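+        # The protocol toggle is read from the environment; self._get_client
+        # presumably maps it to a client, roughly (sketch, not the actual helper):
+        #   client = (grpcclient.InferenceServerClient("localhost:8001", verbose=True)
+        #             if use_grpc else
+        #             httpclient.InferenceServerClient("localhost:8000", verbose=True))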
@@ -1654,19 +1749,22 @@ def test_load_same_model_different_platform(self):
             self.assertTrue(triton_client.is_model_ready(model_name, "1"))
             self.assertTrue(triton_client.is_model_ready(model_name, "3"))
             if use_grpc:
-                metadata = triton_client.get_model_metadata(model_name,
-                                                            as_json=True)
+                metadata = triton_client.get_model_metadata(model_name, as_json=True)
             else:
                 metadata = triton_client.get_model_metadata(model_name)
             self.assertEqual(metadata["platform"], "tensorrt_plan")
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
-        self._infer_success_models([
-            "simple",
-        ], (
-            1,
-            3,
-        ), model_shape)
+        self._infer_success_models(
+            [
+                "simple",
+            ],
+            (
+                1,
+                3,
+            ),
+            model_shape,
+        )
 
         # Copy the same model built for a different platform into the model repository
         shutil.rmtree("models/" + model_name)
@@ -1688,19 +1786,22 @@ def test_load_same_model_different_platform(self):
             self.assertTrue(triton_client.is_model_ready(model_name, "1"))
             self.assertTrue(triton_client.is_model_ready(model_name, "3"))
             if use_grpc:
-                metadata = triton_client.get_model_metadata(model_name,
-                                                            as_json=True)
+                metadata = triton_client.get_model_metadata(model_name, as_json=True)
             else:
                 metadata = triton_client.get_model_metadata(model_name)
             self.assertEqual(metadata["platform"], "pytorch_libtorch")
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
-        self._infer_success_models([
-            "simple",
-        ], (
-            1,
-            3,
-        ), model_shape)
+        self._infer_success_models(
+            [
+                "simple",
+            ],
+            (
+                1,
+                3,
+            ),
+            model_shape,
+        )
 
     def test_model_availability_on_reload(self):
         model_name = "identity_zero_1_int32"
@@ -1725,9 +1826,8 @@ def test_model_availability_on_reload(self):
 
         # Reload models, v1 should still be available until v2 is loaded
         # The load is requested in another thread as it is a blocking API,
-        # and the v1 availibility should be tested during the reload
-        thread = threading.Thread(target=self._async_load,
-                                  args=(model_name, use_grpc))
+        # and the v1 availability should be tested during the reload
+        thread = threading.Thread(target=self._async_load, args=(model_name, use_grpc))
         thread.start()
         # wait for time < model creation delay to ensure load request is sent
         time.sleep(3)
@@ -1738,9 +1838,12 @@ def test_model_availability_on_reload(self):
             triton_client = self._get_client(use_grpc)
             self.assertTrue(triton_client.is_server_live())
             load_end = time.time()
-            self.assertTrue((load_end - load_start) < 5,
-                            "server was waiting unexpectly, waited {}".format(
-                                (load_end - load_start)))
+            self.assertTrue(
+                (load_end - load_start) < 5,
+                "server was waiting unexpectedly, waited {}".format(
+                    (load_end - load_start)
+                ),
+            )
             self.assertTrue(triton_client.is_server_ready())
             self.assertTrue(triton_client.is_model_ready(model_name, "1"))
         except Exception as ex:
@@ -1778,14 +1881,12 @@ def test_model_availability_on_reload_2(self):
         self._infer_success_identity(model_base, (1,), np.int32, model_shape)
 
         # Overwrite config.pbtxt to load v2 only
-        shutil.copyfile("config.pbtxt.v2",
-                        "models/" + model_name + "/config.pbtxt")
+        shutil.copyfile("config.pbtxt.v2", "models/" + model_name + "/config.pbtxt")
 
         # Reload models, v1 should still be available until v2 is loaded
         # The load is requested in another thread as it is a blocking API,
-        # and the v1 availibility should be tested during the reload
-        thread = threading.Thread(target=self._async_load,
-                                  args=(model_name, use_grpc))
+        # and the v1 availability should be tested during the reload
+        thread = threading.Thread(target=self._async_load, args=(model_name, use_grpc))
         thread.start()
         # wait for time < model creation delay to ensure load request is sent
         time.sleep(3)
@@ -1796,9 +1897,12 @@ def test_model_availability_on_reload_2(self):
             triton_client = self._get_client(use_grpc)
             self.assertTrue(triton_client.is_server_live())
             load_end = time.time()
-            self.assertTrue((load_end - load_start) < 5,
-                            "server was waiting unexpectly, waited {}".format(
-                                (load_end - load_start)))
+            self.assertTrue(
+                (load_end - load_start) < 5,
+                "server was waiting unexpectedly, waited {}".format(
+                    (load_end - load_start)
+                ),
+            )
             self.assertTrue(triton_client.is_server_ready())
             self.assertTrue(triton_client.is_model_ready(model_name, "1"))
         except Exception as ex:
@@ -1836,13 +1940,11 @@ def test_model_availability_on_reload_3(self):
         self._infer_success_identity(model_base, (1,), np.int32, model_shape)
 
         # Overwrite config.pbtxt to load v2 only
-        shutil.copyfile("config.pbtxt.new",
-                        "models/" + model_name + "/config.pbtxt")
+        shutil.copyfile("config.pbtxt.new", "models/" + model_name + "/config.pbtxt")
 
         # Reload models, v1 will be reloaded but it should be available
         # during the whole reload
-        thread = threading.Thread(target=self._async_load,
-                                  args=(model_name, use_grpc))
+        thread = threading.Thread(target=self._async_load, args=(model_name, use_grpc))
         thread.start()
         # wait for time < model creation delay to ensure load request is sent
         time.sleep(3)
@@ -1853,9 +1955,12 @@ def test_model_availability_on_reload_3(self):
             triton_client = self._get_client(use_grpc)
             self.assertTrue(triton_client.is_server_live())
             load_end = time.time()
-            self.assertTrue((load_end - load_start) < 5,
-                            "server was waiting unexpectly, waited {}".format(
-                                (load_end - load_start)))
+            self.assertTrue(
+                (load_end - load_start) < 5,
+                "server was waiting unexpectedly, waited {}".format(
+                    (load_end - load_start)
+                ),
+            )
             self.assertTrue(triton_client.is_server_ready())
             self.assertTrue(triton_client.is_model_ready(model_name, "1"))
         except Exception as ex:
@@ -1880,8 +1985,9 @@ def test_model_reload_fail(self):
 
         # Make sure version 1 of the model is loaded
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             self.assertTrue(triton_client.is_server_live())
             self.assertTrue(triton_client.is_server_ready())
             self.assertTrue(triton_client.is_model_ready(model_name, "1"))
@@ -1890,24 +1996,26 @@ def test_model_reload_fail(self):
         self._infer_success_identity(model_base, (1,), np.int32, model_shape)
 
         # Overwrite config.pbtxt to load v2 only on GPU, which will fail
-        shutil.copyfile("config.pbtxt.v2.gpu",
-                        "models/" + model_name + "/config.pbtxt")
+        shutil.copyfile("config.pbtxt.v2.gpu", "models/" + model_name + "/config.pbtxt")
 
         # Reload models, v1 should still be available even if v2 fails to load
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.load_model(model_name)
             self.assertTrue(False, "expecting load failure")
         except Exception as ex:
             self.assertIn(
                 "version 2 is at UNAVAILABLE state: Internal: GPU instances not supported",
-                ex.message())
+                ex.message(),
+            )
 
         # Make sure version 1 of the model is available, and version 2 is not
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             self.assertTrue(triton_client.is_server_live())
             self.assertTrue(triton_client.is_server_ready())
             self.assertTrue(triton_client.is_model_ready(model_name, "1"))
@@ -1918,113 +2026,143 @@ def test_model_reload_fail(self):
 
     def test_multiple_model_repository_control_startup_models(self):
         model_shape = (1, 16)
-        onnx_name = tu.get_model_name('onnx', np.float32, np.float32,
-                                      np.float32)
-        plan_name = tu.get_model_name('plan', np.float32, np.float32,
-                                      np.float32)
+        onnx_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32)
+        plan_name = tu.get_model_name("plan", np.float32, np.float32, np.float32)
 
         ensemble_prefix = "simple_"
         onnx_ensemble_name = ensemble_prefix + onnx_name
         plan_ensemble_name = ensemble_prefix + plan_name
 
         # Make sure unloaded models are not in the status
-        for base in ('savedmodel',):
-            model_name = tu.get_model_name(base, np.float32, np.float32,
-                                           np.float32)
+        for base in ("savedmodel",):
+            model_name = tu.get_model_name(base, np.float32, np.float32, np.float32)
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         # And loaded models work properly
-        self._infer_success_models([
-            "onnx",
-        ], (1, 3), model_shape)
-        self._infer_success_models([
-            "simple_onnx",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
-        self._infer_success_models([
-            "plan",
-        ], (1, 3), model_shape)
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (1, 3),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_onnx",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
+        self._infer_success_models(
+            [
+                "plan",
+            ],
+            (1, 3),
+            model_shape,
+        )
 
         # Load non-existing model
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 triton_client.load_model("unknown_model")
                 self.assertTrue(False, "expected unknown model failure")
             except Exception as ex:
                 self.assertIn(
                     "failed to load 'unknown_model', failed to poll from model repository",
-                    ex.message())
+                    ex.message(),
+                )
 
         # Load plan ensemble model, the dependent model is already
         # loaded via command-line
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.load_model(plan_ensemble_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models([
-            "plan",
-        ], (1, 3), model_shape)
-        self._infer_success_models([
-            "simple_plan",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
+        self._infer_success_models(
+            [
+                "plan",
+            ],
+            (1, 3),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_plan",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
 
         # Delete model configuration, which will cause the autofiller
         # to use the latest version policy so that only version 3 will
         # be available if the models are re-loaded
         os.remove("models/" + onnx_name + "/config.pbtxt")
 
-        self._infer_success_models([
-            "plan",
-        ], (1, 3), model_shape)
-        self._infer_success_models([
-            "simple_plan",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
+        self._infer_success_models(
+            [
+                "plan",
+            ],
+            (1, 3),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_plan",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
 
         # Reload onnx, only version 3 should be available
         try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             triton_client.load_model(onnx_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models([
-            "onnx",
-        ], (3,), model_shape)
-        self._infer_success_models([
-            "simple_onnx",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
-
-        try:
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (3,),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_onnx",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
+
+        try:
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
                 self.assertFalse(triton_client.is_model_ready(onnx_name, "1"))
@@ -2032,10 +2170,10 @@ def test_multiple_model_repository_control_startup_models(self):
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Unload non-existing model, nothing should happen
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 triton_client.unload_model("unknown_model")
             except Exception as ex:
@@ -2044,24 +2182,23 @@ def test_multiple_model_repository_control_startup_models(self):
         # Unload the onnx, as a side effect, the ensemble model
         # will be forced to be unloaded
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.unload_model(onnx_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         for model_name in [onnx_name, onnx_ensemble_name]:
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -2069,36 +2206,46 @@ def test_multiple_model_repository_control_startup_models(self):
         # depending model. The ensemble model should not be reloaded
         # because it was explicitly unloaded.
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.unload_model(onnx_ensemble_name)
             triton_client.load_model(onnx_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models([
-            "onnx",
-        ], (3,), model_shape)
-        self._infer_success_models([
-            "plan",
-        ], (1, 3), model_shape)
-        self._infer_success_models([
-            "simple_plan",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
-
-        try:
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (3,),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "plan",
+            ],
+            (1, 3),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_plan",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
+
+        try:
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(onnx_ensemble_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(onnx_ensemble_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(onnx_ensemble_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(onnx_ensemble_name, "3"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -2106,7 +2253,7 @@ def test_model_repository_index(self):
         # use model control EXPLICIT and --load-model to load a subset of models
         # in model repository
         tensor_shape = (1, 16)
-        model_bases = ['graphdef', 'savedmodel', "simple_savedmodel"]
+        model_bases = ["graphdef", "savedmodel", "simple_savedmodel"]
 
         # Sanity check on loaded models
         # 3 models should be loaded:
@@ -2115,12 +2262,13 @@ def test_model_repository_index(self):
         #     graphdef_float32_float32_float32
         for model_base in model_bases:
             try:
-                model_name = tu.get_model_name(model_base, np.float32,
-                                               np.float32, np.float32)
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                model_name = tu.get_model_name(
+                    model_base, np.float32, np.float32, np.float32
+                )
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
                     self.assertTrue(triton_client.is_model_ready(model_name))
@@ -2132,8 +2280,9 @@ def test_model_repository_index(self):
         # which appears in two repositories.
         model_bases.append("simple_graphdef")
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             index = triton_client.get_model_repository_index()
             indexed = list()
             self.assertEqual(len(index), 8)
@@ -2142,15 +2291,17 @@ def test_model_repository_index(self):
                 if i["name"] == "onnx_float32_float32_float32":
                     self.assertEqual(i["state"], "UNAVAILABLE")
                     self.assertEqual(
-                        i["reason"],
-                        "model appears in two or more repositories")
+                        i["reason"], "model appears in two or more repositories"
+                    )
             for model_base in model_bases:
-                model_name = tu.get_model_name(model_base, np.float32,
-                                               np.float32, np.float32)
+                model_name = tu.get_model_name(
+                    model_base, np.float32, np.float32, np.float32
+                )
                 self.assertTrue(model_name in indexed)
 
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             index = triton_client.get_model_repository_index()
             indexed = list()
             self.assertEqual(len(index.models), 8)
@@ -2159,10 +2310,12 @@ def test_model_repository_index(self):
                 if i.name == "onnx_float32_float32_float32":
                     self.assertEqual(i.state, "UNAVAILABLE")
                     self.assertEqual(
-                        i.reason, "model appears in two or more repositories")
+                        i.reason, "model appears in two or more repositories"
+                    )
             for model_base in model_bases:
-                model_name = tu.get_model_name(model_base, np.float32,
-                                               np.float32, np.float32)
+                model_name = tu.get_model_name(
+                    model_base, np.float32, np.float32, np.float32
+                )
                 self.assertTrue(model_name in indexed)
 
         except Exception as ex:
@@ -2171,21 +2324,19 @@ def test_model_repository_index(self):
     def test_config_override(self):
         model_shape = (1, 16)
 
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
-            for base in (('onnx', 'onnxruntime'),):
-                model_name = tu.get_model_name(base[0], np.float32, np.float32,
-                                               np.float32)
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
+            for base in (("onnx", "onnxruntime"),):
+                model_name = tu.get_model_name(
+                    base[0], np.float32, np.float32, np.float32
+                )
                 try:
                     self.assertTrue(triton_client.is_server_live())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "2"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "2"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -2194,18 +2345,23 @@ def test_config_override(self):
                 try:
                     triton_client.load_model(model_name)
                     self.assertTrue(
-                        False, "expected fail to load '{}'".format(model_name))
+                        False, "expected fail to load '{}'".format(model_name)
+                    )
                 except Exception as ex:
                     self.assertIn(
-                        "load failed for model '{}'".format(model_name),
-                        ex.message())
+                        "load failed for model '{}'".format(model_name), ex.message()
+                    )
 
                 # Request to load the model with provided "correct" config
                 try:
-                    triton_client.load_model(model_name,
-                                             config="""
+                    triton_client.load_model(
+                        model_name,
+                        config="""
 {{"backend":"{backend}","version_policy":{{"specific" : {{ "versions": [2] }} }} }}
-""".format(backend=base[1]))
+""".format(
+                            backend=base[1]
+                        ),
+                    )
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
                 self.assertFalse(triton_client.is_model_ready(model_name, "1"))
@@ -2213,67 +2369,61 @@ def test_config_override(self):
                 self.assertFalse(triton_client.is_model_ready(model_name, "3"))
 
                 # And loaded models work properly
-                self._infer_success_models([
-                    base[0],
-                ], (2,), model_shape)
+                self._infer_success_models(
+                    [
+                        base[0],
+                    ],
+                    (2,),
+                    model_shape,
+                )
 
                 # A request without additional config will load with the default
                 # config and is expected to fail; version 2 will not be unloaded.
                 try:
                     triton_client.load_model(model_name)
                     self.assertTrue(
-                        False, "expected fail to load '{}'".format(model_name))
+                        False, "expected fail to load '{}'".format(model_name)
+                    )
                 except Exception as ex:
                     self.assertIn(
-                        "load failed for model '{}'".format(model_name),
-                        ex.message())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "2"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                        "load failed for model '{}'".format(model_name), ex.message()
+                    )
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertTrue(triton_client.is_model_ready(model_name, "2"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
 
                 # Unload model for the next client iteration
                 try:
                     triton_client.unload_model(model_name)
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "2"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "2"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
     def test_file_override(self):
-
         model_shape = (1, 16)
         override_base = "override_model"
 
-        for base in (('onnx', 'onnxruntime'),):
-            model_name = tu.get_model_name(base[0], np.float32, np.float32,
-                                           np.float32)
-            override_model_name = tu.get_model_name(override_base, np.float32,
-                                                    np.float32, np.float32)
+        for base in (("onnx", "onnxruntime"),):
+            model_name = tu.get_model_name(base[0], np.float32, np.float32, np.float32)
+            override_model_name = tu.get_model_name(
+                override_base, np.float32, np.float32, np.float32
+            )
 
             # Prepare override file
-            with open("models/{}/3/model.{}".format(model_name, base[0]),
-                      'rb') as f:
+            with open("models/{}/3/model.{}".format(model_name, base[0]), "rb") as f:
                 file_content = f.read()
 
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 try:
                     self.assertTrue(triton_client.is_server_live())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "2"))
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "2"))
+                    self.assertTrue(triton_client.is_model_ready(model_name, "3"))
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -2283,14 +2433,17 @@ def test_file_override(self):
                 # not be used.
                 try:
                     triton_client.load_model(
-                        model_name, files={"file:1/model.onnx": file_content})
-                    self.assertTrue(
-                        False, "expected error on missing override config")
+                        model_name, files={"file:1/model.onnx": file_content}
+                    )
+                    self.assertTrue(False, "expected error on missing override config")
                 except InferenceServerException as ex:
                     # [FIXME] Improve error reporting to mention missing config
                     self.assertIn(
-                        "failed to load '{}', failed to poll from model repository"
-                        .format(model_name), ex.message())
+                        "failed to load '{}', failed to poll from model repository".format(
+                            model_name
+                        ),
+                        ex.message(),
+                    )
 
                 # Sanity check that the previously loaded version is still available
                 # after the failed attempt to load the model with a different version
@@ -2298,18 +2451,22 @@ def test_file_override(self):
                 self.assertFalse(triton_client.is_model_ready(model_name, "2"))
                 self.assertTrue(triton_client.is_model_ready(model_name, "3"))
 
-                self._infer_success_models([
-                    base[0],
-                ], (3,), model_shape)
+                self._infer_success_models(
+                    [
+                        base[0],
+                    ],
+                    (3,),
+                    model_shape,
+                )
 
                 # Request to load the model with override file and config in
                 # a different name
                 try:
                     triton_client.load_model(
                         override_model_name,
-                        config="""{{"backend":"{backend}" }}""".format(
-                            backend=base[1]),
-                        files={"file:1/model.onnx": file_content})
+                        config="""{{"backend":"{backend}" }}""".format(backend=base[1]),
+                        files={"file:1/model.onnx": file_content},
+                    )
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -2318,31 +2475,35 @@ def test_file_override(self):
                 self.assertFalse(triton_client.is_model_ready(model_name, "1"))
                 self.assertFalse(triton_client.is_model_ready(model_name, "2"))
                 self.assertTrue(triton_client.is_model_ready(model_name, "3"))
-                self._infer_success_models([
-                    base[0],
-                ], (3,), model_shape)
+                self._infer_success_models(
+                    [
+                        base[0],
+                    ],
+                    (3,),
+                    model_shape,
+                )
 
                 # New override model should also be available
-                self.assertTrue(
-                    triton_client.is_model_ready(override_model_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(override_model_name, "2"))
-                self.assertFalse(
-                    triton_client.is_model_ready(override_model_name, "3"))
-                self._infer_success_models([
-                    override_base,
-                ], (1,),
-                                           model_shape,
-                                           swap=True)
+                self.assertTrue(triton_client.is_model_ready(override_model_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(override_model_name, "2"))
+                self.assertFalse(triton_client.is_model_ready(override_model_name, "3"))
+                self._infer_success_models(
+                    [
+                        override_base,
+                    ],
+                    (1,),
+                    model_shape,
+                    swap=True,
+                )
 
                 # Request to load the model with override file and config in
                 # original name
                 try:
                     triton_client.load_model(
                         model_name,
-                        config="""{{"backend":"{backend}" }}""".format(
-                            backend=base[1]),
-                        files={"file:1/model.onnx": file_content})
+                        config="""{{"backend":"{backend}" }}""".format(backend=base[1]),
+                        files={"file:1/model.onnx": file_content},
+                    )
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -2351,24 +2512,27 @@ def test_file_override(self):
                 self.assertTrue(triton_client.is_model_ready(model_name, "1"))
                 self.assertFalse(triton_client.is_model_ready(model_name, "2"))
                 self.assertFalse(triton_client.is_model_ready(model_name, "3"))
-                self._infer_success_models([
-                    base[0],
-                ], (1,),
-                                           model_shape,
-                                           swap=True)
+                self._infer_success_models(
+                    [
+                        base[0],
+                    ],
+                    (1,),
+                    model_shape,
+                    swap=True,
+                )
 
                 # The model with different name should be available
-                self.assertTrue(
-                    triton_client.is_model_ready(override_model_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(override_model_name, "2"))
-                self.assertFalse(
-                    triton_client.is_model_ready(override_model_name, "3"))
-                self._infer_success_models([
-                    override_base,
-                ], (1,),
-                                           model_shape,
-                                           swap=True)
+                self.assertTrue(triton_client.is_model_ready(override_model_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(override_model_name, "2"))
+                self.assertFalse(triton_client.is_model_ready(override_model_name, "3"))
+                self._infer_success_models(
+                    [
+                        override_base,
+                    ],
+                    (1,),
+                    model_shape,
+                    swap=True,
+                )
 
                 # Reset model for the next client iteration
                 try:
@@ -2381,19 +2545,22 @@ def test_file_override(self):
                 self.assertFalse(triton_client.is_model_ready(model_name, "1"))
                 self.assertFalse(triton_client.is_model_ready(model_name, "2"))
                 self.assertTrue(triton_client.is_model_ready(model_name, "3"))
-                self._infer_success_models([
-                    base[0],
-                ], (3,), model_shape)
+                self._infer_success_models(
+                    [
+                        base[0],
+                    ],
+                    (3,),
+                    model_shape,
+                )
 
     def test_shutdown_dynamic(self):
         model_shape = (1, 1)
         input_data = np.ones(shape=(1, 1), dtype=np.float32)
 
-        inputs = [grpcclient.InferInput('INPUT0', model_shape, "FP32")]
+        inputs = [grpcclient.InferInput("INPUT0", model_shape, "FP32")]
         inputs[0].set_data_from_numpy(input_data)
 
-        triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                         verbose=True)
+        triton_client = grpcclient.InferenceServerClient("localhost:8001", verbose=True)
         model_name = "custom_zero_1_float32"
 
         # Send two requests as only requests held in scheduler are counted
@@ -2411,26 +2578,27 @@ def callback(user_data, result, error):
         request_count = 6
         async_results = []
         for _ in range(request_count):
-            triton_client.async_infer(model_name, inputs,
-                                      partial(callback, async_results))
+            triton_client.async_infer(
+                model_name, inputs, partial(callback, async_results)
+            )
         time.sleep(1)
 
         # Send signal to shutdown the server
-        os.kill(int(os.environ['SERVER_PID']), signal.SIGINT)
+        os.kill(int(os.environ["SERVER_PID"]), signal.SIGINT)
 
         # Send more requests and should be rejected
         try:
             triton_client.infer(model_name, inputs)
-            self.assertTrue(False,
-                            "expected error for new inference during shutdown")
+            self.assertTrue(False, "expected error for new inference during shutdown")
         except InferenceServerException as ex:
             self.assertIn(
                 "Server is stopping, scheduler for model has stopped accepting new inference requests",
-                ex.message())
+                ex.message(),
+            )
 
         # Wait until the results are available in user_data
         time_out = 30
-        while ((len(async_results) < request_count) and time_out > 0):
+        while (len(async_results) < request_count) and time_out > 0:
             time_out = time_out - 1
             time.sleep(1)
 
@@ -2438,21 +2606,19 @@ def callback(user_data, result, error):
         for result in async_results:
             if type(result) == InferenceServerException:
                 raise result
-            output_data = result.as_numpy('OUTPUT0')
+            output_data = result.as_numpy("OUTPUT0")
             np.testing.assert_allclose(
-                output_data,
-                input_data,
-                err_msg='Inference result is not correct')
+                output_data, input_data, err_msg="Inference result is not correct"
+            )
 
     def test_shutdown_sequence(self):
         model_shape = (1, 1)
         input_data = np.ones(shape=(1, 1), dtype=np.int32)
 
-        inputs = [grpcclient.InferInput('INPUT', model_shape, "INT32")]
+        inputs = [grpcclient.InferInput("INPUT", model_shape, "INT32")]
         inputs[0].set_data_from_numpy(input_data)
 
-        triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                         verbose=True)
+        triton_client = grpcclient.InferenceServerClient("localhost:8001", verbose=True)
         model_name = "custom_sequence_int32"
 
         # Send two requests as only requests held in scheduler are counted
@@ -2467,59 +2633,57 @@ def callback(user_data, result, error):
         request_count = 2
         async_results = []
         for i in range(request_count):
-            triton_client.async_infer(model_name,
-                                      inputs,
-                                      partial(callback, async_results),
-                                      sequence_id=(i + 1),
-                                      sequence_start=True)
+            triton_client.async_infer(
+                model_name,
+                inputs,
+                partial(callback, async_results),
+                sequence_id=(i + 1),
+                sequence_start=True,
+            )
         time.sleep(1)
 
         # Send signal to shutdown the server
-        os.kill(int(os.environ['SERVER_PID']), signal.SIGINT)
+        os.kill(int(os.environ["SERVER_PID"]), signal.SIGINT)
 
         # Send requests with different characteristics
-        # 1: New sequence with new seqeuence ID
-        try:
-            triton_client.infer(model_name,
-                                inputs,
-                                sequence_id=request_count,
-                                sequence_start=True)
-            self.assertTrue(False,
-                            "expected error for new inference during shutdown")
+        # 1: New sequence with new sequence ID
+        try:
+            triton_client.infer(
+                model_name, inputs, sequence_id=request_count, sequence_start=True
+            )
+            self.assertTrue(False, "expected error for new inference during shutdown")
         except InferenceServerException as ex:
             self.assertIn(
                 "Server is stopping, scheduler for model has stopped accepting new inference requests",
-                ex.message())
-        # 2: New sequence with existing seqeuence ID
-        try:
-            triton_client.infer(model_name,
-                                inputs,
-                                sequence_id=1,
-                                sequence_start=True)
-            self.assertTrue(False,
-                            "expected error for new inference during shutdown")
+                ex.message(),
+            )
+        # 2: New sequence with existing sequence ID
+        try:
+            triton_client.infer(model_name, inputs, sequence_id=1, sequence_start=True)
+            self.assertTrue(False, "expected error for new inference during shutdown")
         except InferenceServerException as ex:
             self.assertIn(
                 "Server is stopping, scheduler for model has stopped accepting new inference requests",
-                ex.message())
+                ex.message(),
+            )
         # 3: Continuing sequence
         try:
-            res = triton_client.infer(model_name,
-                                      inputs,
-                                      sequence_id=2,
-                                      sequence_end=True)
-            output_data = res.as_numpy('OUTPUT')
+            res = triton_client.infer(
+                model_name, inputs, sequence_id=2, sequence_end=True
+            )
+            output_data = res.as_numpy("OUTPUT")
             # Results are accumulated
             np.testing.assert_allclose(
                 output_data,
                 input_data + input_data,
-                err_msg='Inference result is not correct')
+                err_msg="Inference result is not correct",
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Wait until the results are available in user_data
         time_out = 30
-        while ((len(async_results) < request_count) and time_out > 0):
+        while (len(async_results) < request_count) and time_out > 0:
             time_out = time_out - 1
             time.sleep(1)
 
@@ -2527,11 +2691,10 @@ def callback(user_data, result, error):
         for result in async_results:
             if type(result) == InferenceServerException:
                 raise result
-            output_data = result.as_numpy('OUTPUT')
+            output_data = result.as_numpy("OUTPUT")
             np.testing.assert_allclose(
-                output_data,
-                input_data,
-                err_msg='Inference result is not correct')
+                output_data, input_data, err_msg="Inference result is not correct"
+            )
 
         # Sleep 5 seconds for the scheduler timeout to take effect, which should
         # reduce the in-flight count
@@ -2541,11 +2704,10 @@ def test_shutdown_ensemble(self):
         model_shape = (1, 1)
         input_data = np.ones(shape=(1, 1), dtype=np.float32)
 
-        inputs = [grpcclient.InferInput('INPUT0', model_shape, "FP32")]
+        inputs = [grpcclient.InferInput("INPUT0", model_shape, "FP32")]
         inputs[0].set_data_from_numpy(input_data)
 
-        triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                         verbose=True)
+        triton_client = grpcclient.InferenceServerClient("localhost:8001", verbose=True)
         model_name = "ensemble_zero_1_float32"
 
         # Send two requests as only requests held in scheduler are counted
@@ -2562,27 +2724,28 @@ def callback(user_data, result, error):
         request_count = 1
         async_results = []
         for _ in range(request_count):
-            triton_client.async_infer(model_name, inputs,
-                                      partial(callback, async_results))
+            triton_client.async_infer(
+                model_name, inputs, partial(callback, async_results)
+            )
         time.sleep(1)
 
         # Send signal to shutdown the server
-        os.kill(int(os.environ['SERVER_PID']), signal.SIGINT)
+        os.kill(int(os.environ["SERVER_PID"]), signal.SIGINT)
 
         # Send more requests and should be rejected
         try:
             triton_client.infer(model_name, inputs)
-            self.assertTrue(False,
-                            "expected error for new inference during shutdown")
+            self.assertTrue(False, "expected error for new inference during shutdown")
         except InferenceServerException as ex:
             self.assertIn("in ensemble 'ensemble_zero_1_float32'", ex.message())
             self.assertIn(
                 "Server is stopping, scheduler for model has stopped accepting new inference requests",
-                ex.message())
+                ex.message(),
+            )
 
         # Wait until the results are available in user_data
         time_out = 10
-        while ((len(async_results) < request_count) and time_out > 0):
+        while (len(async_results) < request_count) and time_out > 0:
             time_out = time_out - 1
             time.sleep(1)
 
@@ -2590,17 +2753,17 @@ def callback(user_data, result, error):
         for result in async_results:
             if type(result) == InferenceServerException:
                 raise result
-            output_data = result.as_numpy('OUTPUT0')
+            output_data = result.as_numpy("OUTPUT0")
             np.testing.assert_allclose(
-                output_data,
-                input_data,
-                err_msg='Inference result is not correct')
+                output_data, input_data, err_msg="Inference result is not correct"
+            )
 
     def test_load_gpu_limit(self):
         model_name = "cuda_memory_consumer"
         try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             triton_client.load_model(model_name + "_1")
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
@@ -2608,18 +2771,19 @@ def test_load_gpu_limit(self):
         # After the first load, the memory consumption should have exceeded
         # the specified limit, so the load will fail
         try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             triton_client.load_model(model_name + "_2")
             self.assertTrue(False, "expected error for loading model")
         except Exception as ex:
-            self.assertIn("memory limit set for GPU 0 has exceeded",
-                          ex.message())
+            self.assertIn("memory limit set for GPU 0 has exceeded", ex.message())
 
         # Load should work after explicitly unload model to free memory
         try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             triton_client.unload_model(model_name + "_1")
             triton_client.load_model(model_name + "_2")
         except Exception as ex:
@@ -2628,21 +2792,26 @@ def test_load_gpu_limit(self):
     def test_concurrent_load_speedup(self):
         # Initialize client
         try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
         # Each model should have a loading delay of 10 seconds
-        model_pairs = [["identity_zero_1_int32_1", "identity_zero_1_int32_2"],
-                       ["python_identity_fp32_1", "python_identity_fp32_2"]]
+        model_pairs = [
+            ["identity_zero_1_int32_1", "identity_zero_1_int32_2"],
+            ["python_identity_fp32_1", "python_identity_fp32_2"],
+        ]
         # Test each model pair for speed up
         for model_pair in model_pairs:
             # Load both models concurrently
             threads = []
             for model_name in model_pair:
                 threads.append(
-                    threading.Thread(target=triton_client.load_model,
-                                     args=(model_name,)))
+                    threading.Thread(
+                        target=triton_client.load_model, args=(model_name,)
+                    )
+                )
             start_time = time.time()
             for thread in threads:
                 thread.start()
@@ -2653,11 +2822,13 @@ def test_concurrent_load_speedup(self):
             # Each of the two models has a minimum loading delay of 10 seconds
             # Speedup is observed when the concurrent loading time < 20 seconds
             # but use a tighter bound of 15 seconds
-            self.assertLess(loading_time, 15.0,
-                            "Concurrent loading speedup not observed")
+            self.assertLess(
+                loading_time, 15.0, "Concurrent loading speedup not observed"
+            )
             # Concurrent loading time cannot be < 10 seconds
-            self.assertGreaterEqual(loading_time, 10.0,
-                                    "Invalid concurrent loading time")
+            self.assertGreaterEqual(
+                loading_time, 10.0, "Invalid concurrent loading time"
+            )
             # Make sure the models are loaded
             self.assertTrue(triton_client.is_server_live())
             self.assertTrue(triton_client.is_server_ready())
@@ -2667,8 +2838,9 @@ def test_concurrent_load_speedup(self):
     def test_concurrent_load(self):
         # Initialize client
         try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
         # Load same named model concurrently
@@ -2695,18 +2867,19 @@ def test_concurrent_load(self):
     def test_concurrent_load_unload(self):
         # Initialize client
         try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
         # Load identity_zero_1_int32 and unload it while loading
         # The unload operation should wait until the load is completed
         with concurrent.futures.ThreadPoolExecutor() as pool:
-            load_thread = pool.submit(triton_client.load_model,
-                                      "identity_zero_1_int32")
+            load_thread = pool.submit(triton_client.load_model, "identity_zero_1_int32")
             time.sleep(2)  # wait between load and unload
-            unload_thread = pool.submit(triton_client.unload_model,
-                                        "identity_zero_1_int32")
+            unload_thread = pool.submit(
+                triton_client.unload_model, "identity_zero_1_int32"
+            )
             load_thread.result()
             unload_thread.result()
         self.assertTrue(triton_client.is_server_live())
@@ -2715,22 +2888,25 @@ def test_concurrent_load_unload(self):
         # Load ensemble_zero_1_float32 and unload its dependency while loading
         # The unload operation should wait until the load is completed
         with concurrent.futures.ThreadPoolExecutor() as pool:
-            load_thread = pool.submit(triton_client.load_model,
-                                      "ensemble_zero_1_float32")
+            load_thread = pool.submit(
+                triton_client.load_model, "ensemble_zero_1_float32"
+            )
             time.sleep(2)  # wait between load and unload
-            unload_thread = pool.submit(triton_client.unload_model,
-                                        "custom_zero_1_float32")
+            unload_thread = pool.submit(
+                triton_client.unload_model, "custom_zero_1_float32"
+            )
             load_thread.result()
             unload_thread.result()
         self.assertTrue(triton_client.is_server_live())
         self.assertTrue(triton_client.is_server_ready())
-        self.assertFalse(
-            triton_client.is_model_ready("ensemble_zero_1_float32"))
+        self.assertFalse(triton_client.is_model_ready("ensemble_zero_1_float32"))
         self.assertFalse(triton_client.is_model_ready("custom_zero_1_float32"))
         # Load both models and unload them concurrently
         model_names = ["identity_zero_1_int32", "ensemble_zero_1_float32"]
         for is_load in [True, False]:
-            action_fn = triton_client.load_model if is_load else triton_client.unload_model
+            action_fn = (
+                triton_client.load_model if is_load else triton_client.unload_model
+            )
             with concurrent.futures.ThreadPoolExecutor() as pool:
                 threads = []
                 for model_name in model_names:
@@ -2738,9 +2914,8 @@ def test_concurrent_load_unload(self):
                 for thread in concurrent.futures.as_completed(threads):
                     thread.result()
             for model_name in model_names:
-                self.assertEqual(is_load,
-                                 triton_client.is_model_ready(model_name))
+                self.assertEqual(is_load, triton_client.is_model_ready(model_name))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_lifecycle/test.sh b/qa/L0_lifecycle/test.sh
index ebbc409c85..ab12c1c7b8 100755
--- a/qa/L0_lifecycle/test.sh
+++ b/qa/L0_lifecycle/test.sh
@@ -1010,8 +1010,8 @@ LOG_IDX=$((LOG_IDX+1))
 
 # Test loading all models on startup in EXPLICIT model control mode AND
 # an additional --load-model argument, it should fail
-rm -fr models 
-mkdir models 
+rm -fr models
+mkdir models
 for i in onnx ; do
     cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/.
     sed -i "s/max_batch_size:.*/max_batch_size: 1/" models/${i}_float32_float32_float32/config.pbtxt
@@ -1542,7 +1542,7 @@ mkdir models
 cp -r ../custom_models/custom_zero_1_float32 models/. && \
     mkdir -p models/custom_zero_1_float32/1 && \
     (cd models/custom_zero_1_float32 && \
-        echo "dynamic_batching {}" >> config.pbtxt 
+        echo "dynamic_batching {}" >> config.pbtxt
         echo "parameters [" >> config.pbtxt && \
         echo "{ key: \"execute_delay_ms\"; value: { string_value: \"5000\" }}" >> config.pbtxt && \
         echo "]" >> config.pbtxt)
@@ -1621,7 +1621,7 @@ cp -r ensemble_zero_1_float32 models/. && \
 cp -r ../custom_models/custom_zero_1_float32 models/. && \
     mkdir -p models/custom_zero_1_float32/1 && \
     (cd models/custom_zero_1_float32 && \
-        echo "dynamic_batching {}" >> config.pbtxt 
+        echo "dynamic_batching {}" >> config.pbtxt
         echo "parameters [" >> config.pbtxt && \
         echo "{ key: \"execute_delay_ms\"; value: { string_value: \"5000\" }}" >> config.pbtxt && \
         echo "]" >> config.pbtxt)
diff --git a/qa/L0_logging/logging_endpoint_test.py b/qa/L0_logging/logging_endpoint_test.py
old mode 100644
new mode 100755
index 2058d941c2..26f98de3da
--- a/qa/L0_logging/logging_endpoint_test.py
+++ b/qa/L0_logging/logging_endpoint_test.py
@@ -27,21 +27,21 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import sys
+
 sys.path.append("../common")
 
+import json
 import sys
 import unittest
-import tritonclient.http as httpclient
-import tritonclient.grpc as grpcclient
-import json
-from google.protobuf import json_format
 
 import test_util as tu
+import tritonclient.grpc as grpcclient
+import tritonclient.http as httpclient
+from google.protobuf import json_format
 
 
 # Similar set up as dynamic batcher tests
 class LogEndpointTest(tu.TestResultCollector):
-
     def tearDown(self):
         # Clear all log settings to initial state.
         # Note that the tearDown function uses HTTP client so the pass/fail
@@ -54,7 +54,7 @@ def tearDown(self):
             "log_warning": True,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         triton_client = httpclient.InferenceServerClient("localhost:8000")
         triton_client.update_log_settings(settings=clear_settings)
@@ -71,7 +71,7 @@ def check_server_initial_state(self):
             "log_warning": True,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         triton_client = httpclient.InferenceServerClient("localhost:8000")
         self.assertEqual(initial_settings, triton_client.get_log_settings())
@@ -85,42 +85,40 @@ def test_http_get_settings(self):
             "log_warning": True,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         triton_client = httpclient.InferenceServerClient("localhost:8000")
-        self.assertEqual(initial_settings, triton_client.get_log_settings(),
-                         "Unexpected initial log settings")
+        self.assertEqual(
+            initial_settings,
+            triton_client.get_log_settings(),
+            "Unexpected initial log settings",
+        )
 
     def test_grpc_get_settings(self):
         # Log settings will be the same as default settings since
         # no update has been made.
         initial_settings = grpcclient.service_pb2.LogSettingsResponse()
         json_format.Parse(
-            json.dumps({
-                "settings": {
-                    "log_file": {
-                        "stringParam": ""
-                    },
-                    "log_info": {
-                        "boolParam": True
-                    },
-                    "log_warning": {
-                        "boolParam": True
-                    },
-                    "log_error": {
-                        "boolParam": True
-                    },
-                    "log_verbose_level": {
-                        "uint32Param": 0
-                    },
-                    "log_format": {
-                        "stringParam": "default"
-                    },
+            json.dumps(
+                {
+                    "settings": {
+                        "log_file": {"stringParam": ""},
+                        "log_info": {"boolParam": True},
+                        "log_warning": {"boolParam": True},
+                        "log_error": {"boolParam": True},
+                        "log_verbose_level": {"uint32Param": 0},
+                        "log_format": {"stringParam": "default"},
+                    }
                 }
-            }), initial_settings)
+            ),
+            initial_settings,
+        )
         triton_client = grpcclient.InferenceServerClient("localhost:8001")
-        self.assertEqual(initial_settings, triton_client.get_log_settings(),
-                         "Unexpected initial log settings")
+        self.assertEqual(
+            initial_settings,
+            triton_client.get_log_settings(),
+            "Unexpected initial log settings",
+        )
 
     def test_http_update_settings(self):
         # Update each possible log configuration
@@ -134,7 +132,7 @@ def test_http_update_settings(self):
             "log_warning": True,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_2 = {
             "log_file": "log_file.log",
@@ -142,7 +140,7 @@ def test_http_update_settings(self):
             "log_warning": True,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_3 = {
             "log_file": "log_file.log",
@@ -150,7 +148,7 @@ def test_http_update_settings(self):
             "log_warning": False,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_4 = {
             "log_file": "log_file.log",
@@ -158,7 +156,7 @@ def test_http_update_settings(self):
             "log_warning": False,
             "log_error": False,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_5 = {
             "log_file": "log_file.log",
@@ -166,7 +164,7 @@ def test_http_update_settings(self):
             "log_warning": False,
             "log_error": False,
             "log_verbose_level": 1,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_6 = {
             "log_file": "log_file.log",
@@ -174,34 +172,40 @@ def test_http_update_settings(self):
             "log_warning": False,
             "log_error": False,
             "log_verbose_level": 1,
-            "log_format": "ISO8601"
+            "log_format": "ISO8601",
         }
 
         triton_client = httpclient.InferenceServerClient("localhost:8000")
         self.assertEqual(
             expected_log_settings_1,
             triton_client.update_log_settings(settings=expected_log_settings_1),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
         self.assertEqual(
             expected_log_settings_2,
             triton_client.update_log_settings(settings=expected_log_settings_2),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
         self.assertEqual(
             expected_log_settings_3,
             triton_client.update_log_settings(settings=expected_log_settings_3),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
         self.assertEqual(
             expected_log_settings_4,
             triton_client.update_log_settings(settings=expected_log_settings_4),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
         self.assertEqual(
             expected_log_settings_5,
             triton_client.update_log_settings(settings=expected_log_settings_5),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
         self.assertEqual(
             expected_log_settings_6,
             triton_client.update_log_settings(settings=expected_log_settings_6),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
 
     def test_grpc_update_settings(self):
         # Update each possible log configuration
@@ -216,37 +220,30 @@ def test_grpc_update_settings(self):
             "log_warning": True,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_1 = grpcclient.service_pb2.LogSettingsResponse()
         json_format.Parse(
-            json.dumps({
-                "settings": {
-                    "log_file": {
-                        "stringParam": "log_file.log"
-                    },
-                    "log_info": {
-                        "boolParam": True
-                    },
-                    "log_warning": {
-                        "boolParam": True
-                    },
-                    "log_error": {
-                        "boolParam": True
-                    },
-                    "log_verbose_level": {
-                        "uint32Param": 0
-                    },
-                    "log_format": {
-                        "stringParam": "default"
-                    },
+            json.dumps(
+                {
+                    "settings": {
+                        "log_file": {"stringParam": "log_file.log"},
+                        "log_info": {"boolParam": True},
+                        "log_warning": {"boolParam": True},
+                        "log_error": {"boolParam": True},
+                        "log_verbose_level": {"uint32Param": 0},
+                        "log_format": {"stringParam": "default"},
+                    }
                 }
-            }), expected_log_settings_1)
+            ),
+            expected_log_settings_1,
+        )
 
         self.assertEqual(
             expected_log_settings_1,
             triton_client.update_log_settings(settings=log_settings_1),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
 
         log_settings_2 = {
             "log_file": "log_file.log",
@@ -254,37 +251,30 @@ def test_grpc_update_settings(self):
             "log_warning": True,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_2 = grpcclient.service_pb2.LogSettingsResponse()
         json_format.Parse(
-            json.dumps({
-                "settings": {
-                    "log_file": {
-                        "stringParam": "log_file.log"
-                    },
-                    "log_info": {
-                        "boolParam": False
-                    },
-                    "log_warning": {
-                        "boolParam": True
-                    },
-                    "log_error": {
-                        "boolParam": True
-                    },
-                    "log_verbose_level": {
-                        "uint32Param": 0
-                    },
-                    "log_format": {
-                        "stringParam": "default"
-                    },
+            json.dumps(
+                {
+                    "settings": {
+                        "log_file": {"stringParam": "log_file.log"},
+                        "log_info": {"boolParam": False},
+                        "log_warning": {"boolParam": True},
+                        "log_error": {"boolParam": True},
+                        "log_verbose_level": {"uint32Param": 0},
+                        "log_format": {"stringParam": "default"},
+                    }
                 }
-            }), expected_log_settings_2)
+            ),
+            expected_log_settings_2,
+        )
 
         self.assertEqual(
             expected_log_settings_2,
             triton_client.update_log_settings(settings=log_settings_2),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
 
         log_settings_3 = {
             "log_file": "log_file.log",
@@ -292,37 +282,30 @@ def test_grpc_update_settings(self):
             "log_warning": False,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_3 = grpcclient.service_pb2.LogSettingsResponse()
         json_format.Parse(
-            json.dumps({
-                "settings": {
-                    "log_file": {
-                        "stringParam": "log_file.log"
-                    },
-                    "log_info": {
-                        "boolParam": False
-                    },
-                    "log_warning": {
-                        "boolParam": False
-                    },
-                    "log_error": {
-                        "boolParam": True
-                    },
-                    "log_verbose_level": {
-                        "uint32Param": 0
-                    },
-                    "log_format": {
-                        "stringParam": "default"
-                    },
+            json.dumps(
+                {
+                    "settings": {
+                        "log_file": {"stringParam": "log_file.log"},
+                        "log_info": {"boolParam": False},
+                        "log_warning": {"boolParam": False},
+                        "log_error": {"boolParam": True},
+                        "log_verbose_level": {"uint32Param": 0},
+                        "log_format": {"stringParam": "default"},
+                    }
                 }
-            }), expected_log_settings_3)
+            ),
+            expected_log_settings_3,
+        )
 
         self.assertEqual(
             expected_log_settings_3,
             triton_client.update_log_settings(settings=log_settings_3),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
 
         log_settings_4 = {
             "log_file": "log_file.log",
@@ -330,37 +313,30 @@ def test_grpc_update_settings(self):
             "log_warning": False,
             "log_error": False,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_4 = grpcclient.service_pb2.LogSettingsResponse()
         json_format.Parse(
-            json.dumps({
-                "settings": {
-                    "log_file": {
-                        "stringParam": "log_file.log"
-                    },
-                    "log_info": {
-                        "boolParam": False
-                    },
-                    "log_warning": {
-                        "boolParam": False
-                    },
-                    "log_error": {
-                        "boolParam": False
-                    },
-                    "log_verbose_level": {
-                        "uint32Param": 0
-                    },
-                    "log_format": {
-                        "stringParam": "default"
-                    },
+            json.dumps(
+                {
+                    "settings": {
+                        "log_file": {"stringParam": "log_file.log"},
+                        "log_info": {"boolParam": False},
+                        "log_warning": {"boolParam": False},
+                        "log_error": {"boolParam": False},
+                        "log_verbose_level": {"uint32Param": 0},
+                        "log_format": {"stringParam": "default"},
+                    }
                 }
-            }), expected_log_settings_4)
+            ),
+            expected_log_settings_4,
+        )
 
         self.assertEqual(
             expected_log_settings_4,
             triton_client.update_log_settings(settings=log_settings_4),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
 
         log_settings_5 = {
             "log_file": "log_file.log",
@@ -368,37 +344,30 @@ def test_grpc_update_settings(self):
             "log_warning": False,
             "log_error": False,
             "log_verbose_level": 1,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_5 = grpcclient.service_pb2.LogSettingsResponse()
         json_format.Parse(
-            json.dumps({
-                "settings": {
-                    "log_file": {
-                        "stringParam": "log_file.log"
-                    },
-                    "log_info": {
-                        "boolParam": False
-                    },
-                    "log_warning": {
-                        "boolParam": False
-                    },
-                    "log_error": {
-                        "boolParam": False
-                    },
-                    "log_verbose_level": {
-                        "uint32Param": 1
-                    },
-                    "log_format": {
-                        "stringParam": "default"
-                    },
+            json.dumps(
+                {
+                    "settings": {
+                        "log_file": {"stringParam": "log_file.log"},
+                        "log_info": {"boolParam": False},
+                        "log_warning": {"boolParam": False},
+                        "log_error": {"boolParam": False},
+                        "log_verbose_level": {"uint32Param": 1},
+                        "log_format": {"stringParam": "default"},
+                    }
                 }
-            }), expected_log_settings_5)
+            ),
+            expected_log_settings_5,
+        )
 
         self.assertEqual(
             expected_log_settings_5,
             triton_client.update_log_settings(settings=log_settings_5),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
 
         log_settings_6 = {
             "log_file": "log_file.log",
@@ -406,38 +375,31 @@ def test_grpc_update_settings(self):
             "log_warning": False,
             "log_error": False,
             "log_verbose_level": 1,
-            "log_format": "ISO8601"
+            "log_format": "ISO8601",
         }
         expected_log_settings_6 = grpcclient.service_pb2.LogSettingsResponse()
         json_format.Parse(
-            json.dumps({
-                "settings": {
-                    "log_file": {
-                        "stringParam": "log_file.log"
-                    },
-                    "log_info": {
-                        "boolParam": False
-                    },
-                    "log_warning": {
-                        "boolParam": False
-                    },
-                    "log_error": {
-                        "boolParam": False
-                    },
-                    "log_verbose_level": {
-                        "uint32Param": 1
-                    },
-                    "log_format": {
-                        "stringParam": "ISO8601"
-                    },
+            json.dumps(
+                {
+                    "settings": {
+                        "log_file": {"stringParam": "log_file.log"},
+                        "log_info": {"boolParam": False},
+                        "log_warning": {"boolParam": False},
+                        "log_error": {"boolParam": False},
+                        "log_verbose_level": {"uint32Param": 1},
+                        "log_format": {"stringParam": "ISO8601"},
+                    }
                 }
-            }), expected_log_settings_6)
+            ),
+            expected_log_settings_6,
+        )
 
         self.assertEqual(
             expected_log_settings_6,
             triton_client.update_log_settings(settings=log_settings_6),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_logging/test.sh b/qa/L0_logging/test.sh
index 47c1e081cd..d83e0b76a4 100755
--- a/qa/L0_logging/test.sh
+++ b/qa/L0_logging/test.sh
@@ -70,7 +70,7 @@ RET=0
 
 function verify_correct_settings () {
   log_file_expected=$1
-  log_info_expected=$2 
+  log_info_expected=$2
   log_warn_expected=$3
   log_error_expected=$4
   log_verbose_expected=$5
@@ -142,7 +142,7 @@ set -e
 kill $SERVER_PID
 wait $SERVER_PID
 
-# Test Log File (Arguement)
+# Test Log File (Argument)
 SERVER_ARGS="--log-file=log_file.log --model-repository=$MODELSDIR"
 SERVER_LOG="./inference_server_log_file.log"
 run_server
@@ -214,7 +214,7 @@ if [ $? -ne 0 ]; then
     RET=1
 fi
 
-# Check redirection worked properly (server log has tolerance of 40 due to 
+# Check redirection worked properly (server log has a tolerance of 40 due to
 # unavoidable onnx framework logging)
 expected_log_count=75
 actual_log_count=$(grep -c ^[IWEV][0-9][0-9][0-9][0-9].* ./log_file.log)
@@ -245,7 +245,7 @@ set -e
 kill $SERVER_PID
 wait $SERVER_PID
 
-# Test Log Info (Arguement)
+# Test Log Info (Argument)
 rm -f log_file.log
 SERVER_ARGS="--log-file=log_file.log --log-info=false --log-verbose=1 --model-repository=$MODELSDIR"
 SERVER_LOG="./inference_server_log_file.log"
@@ -375,7 +375,7 @@ set -e
 kill $SERVER_PID
 wait $SERVER_PID
 
-# Test Log Verbose Level (Arguement)
+# Test Log Verbose Level (Argument)
 rm -f log_file.log
 SERVER_ARGS="--log-file=log_file.log --log-verbose=1 --model-repository=$MODELSDIR"
 SERVER_LOG="./inference_server_log_file.log"
@@ -423,7 +423,7 @@ set -e
 kill $SERVER_PID
 wait $SERVER_PID
 
-# Test Log Format (Arguement)
+# Test Log Format (Argument)
 rm -f log_file.log
 SERVER_ARGS="--log-file=log_file.log --log-verbose=1 --log-format=ISO8601 --model-repository=$MODELSDIR"
 SERVER_LOG="./inference_server_log_file.log"
@@ -453,7 +453,7 @@ line=$(head -n 1 log_file.log)
 date=$(date '+%m%d')
 final_date="I${date}"
 format_date=$(echo $line | head -n1 | awk '{print $1;}')
-if [[ $final_date == $format_date ]]; then 
+if [[ $final_date == $format_date ]]; then
     echo -e "\n***\n*** Test Failed: Unexpected Log Format $LINENO\n***"
     RET=1
 fi
diff --git a/qa/L0_long_running_stress/crashing_client.py b/qa/L0_long_running_stress/crashing_client.py
old mode 100644
new mode 100755
index bb9faab45a..d9c727a3d3
--- a/qa/L0_long_running_stress/crashing_client.py
+++ b/qa/L0_long_running_stress/crashing_client.py
@@ -1,4 +1,6 @@
-# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -28,27 +30,24 @@
 
 sys.path.append("../common")
 
-import numpy as np
-from multiprocessing import Process, shared_memory
+import argparse
 import time
+from multiprocessing import Process, shared_memory
+
+import numpy as np
 import test_util as tu
-import argparse
 import tritonclient.grpc as grpcclient
 from tritonclient.utils import np_to_triton_dtype
 
 
-def crashing_client(model_name,
-                    dtype,
-                    tensor_shape,
-                    shm_name,
-                    triton_client,
-                    input_name="INPUT0"):
+def crashing_client(
+    model_name, dtype, tensor_shape, shm_name, triton_client, input_name="INPUT0"
+):
     in0 = np.random.random(tensor_shape).astype(dtype)
     if "libtorch" in model_name:
         input_name = "INPUT__0"
     inputs = [
-        grpcclient.InferInput(input_name, tensor_shape,
-                              np_to_triton_dtype(dtype)),
+        grpcclient.InferInput(input_name, tensor_shape, np_to_triton_dtype(dtype)),
     ]
     inputs[0].set_data_from_numpy(in0)
 
@@ -62,13 +61,15 @@ def crashing_client(model_name,
         results = triton_client.infer(model_name, inputs)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-t',
-                        '--trial',
-                        type=str,
-                        required=True,
-                        help='Set trial for the crashing client')
+    parser.add_argument(
+        "-t",
+        "--trial",
+        type=str,
+        required=True,
+        help="Set trial for the crashing client",
+    )
     FLAGS = parser.parse_args()
     trial = FLAGS.trial
 
@@ -76,22 +77,23 @@ def crashing_client(model_name,
     model_name = tu.get_zero_model_name(trial, 1, dtype)
     tensor_shape = (1,) if "nobatch" in trial else (1, 1)
 
-    triton_client = grpcclient.InferenceServerClient(url="localhost:8001",
-                                                     verbose=True)
+    triton_client = grpcclient.InferenceServerClient(url="localhost:8001", verbose=True)
 
     shm = shared_memory.SharedMemory(create=True, size=8)
     count = np.ndarray((1,), dtype=np.int32, buffer=shm.buf)
     count[0] = 0
 
-    p = Process(target=crashing_client,
-                name="crashing_client",
-                args=(
-                    model_name,
-                    dtype,
-                    tensor_shape,
-                    shm.name,
-                    triton_client,
-                ))
+    p = Process(
+        target=crashing_client,
+        name="crashing_client",
+        args=(
+            model_name,
+            dtype,
+            tensor_shape,
+            shm.name,
+            triton_client,
+        ),
+    )
 
     p.start()
 
diff --git a/qa/L0_long_running_stress/scenarios.py b/qa/L0_long_running_stress/scenarios.py
old mode 100644
new mode 100755
index 7e91968ccb..abb0004e90
--- a/qa/L0_long_running_stress/scenarios.py
+++ b/qa/L0_long_running_stress/scenarios.py
@@ -1,4 +1,6 @@
-# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -29,26 +31,28 @@
 
 sys.path.append("../common")
 
-import numpy as np
-import time
-import test_util as tu
-import tritonclient.grpc as grpcclient
-from tritonclient.utils import np_to_triton_dtype
 import math
-from PIL import Image
 import os
 import subprocess
 import threading
+import time
+
+import numpy as np
+import test_util as tu
+import tritonclient.grpc as grpcclient
+from PIL import Image
+from tritonclient.utils import np_to_triton_dtype
+
 if sys.version_info >= (3, 0):
     import queue
 else:
     import Queue as queue
-from functools import partial
 
 import abc
 import csv
 import json
 import re
+from functools import partial
 
 DEFAULT_TIMEOUT_MS = 25000
 SEQUENCE_LENGTH_MEAN = 16
@@ -66,7 +70,6 @@ def completion_callback(user_data, result, error):
 
 
 class Scenario(metaclass=abc.ABCMeta):
-
     def __init__(self, name, trials, verbose=False, out_stream=sys.stdout):
         self.name_ = name
         self.trials_ = trials
@@ -109,13 +112,15 @@ class ModelOption:
         # 'queue_latency_range_us' specifies the range where queue latency
         # reported should be, otherwise, model concurrency will be adjusted
         # within 'concurrency_range' to influence the queue latency.
-        def __init__(self,
-                     model_name,
-                     batch_size,
-                     concurrency_range,
-                     queue_latency_range_us,
-                     input_shapes=[],
-                     input_file=None):
+        def __init__(
+            self,
+            model_name,
+            batch_size,
+            concurrency_range,
+            queue_latency_range_us,
+            input_shapes=[],
+            input_file=None,
+        ):
             self.model_name_ = model_name
             self.concurrency_range_ = list(concurrency_range)
             self.batch_size_ = batch_size
@@ -125,8 +130,11 @@ def __init__(self,
 
         def run(self, name, sequence_id_range, out_stream):
             csv_file = os.path.join(
-                "csv_dir", "{}_{}_{}.csv".format(name, self.model_name_,
-                                                 self.concurrency_range_[2]))
+                "csv_dir",
+                "{}_{}_{}.csv".format(
+                    name, self.model_name_, self.concurrency_range_[2]
+                ),
+            )
 
             arg_list = [PerfAnalyzerScenario.command_]
             # Always use GRPC streaming feature to ensure requests are handled
@@ -136,8 +144,9 @@ def run(self, name, sequence_id_range, out_stream):
             arg_list += ["-b", "{}".format(self.batch_size_)]
             arg_list += [
                 "--concurrency-range",
-                "{}:{}:1".format(self.concurrency_range_[2],
-                                 self.concurrency_range_[2])
+                "{}:{}:1".format(
+                    self.concurrency_range_[2], self.concurrency_range_[2]
+                ),
             ]
             arg_list += ["-f", csv_file]
             for name, shape in self.input_shapes_:
@@ -147,43 +156,44 @@ def run(self, name, sequence_id_range, out_stream):
             if sequence_id_range is not None:
                 arg_list += [
                     "--sequence-id-range",
-                    "{}:{}".format(sequence_id_range[0], sequence_id_range[1])
+                    "{}:{}".format(sequence_id_range[0], sequence_id_range[1]),
                 ]
 
-            completed_process = subprocess.run(arg_list,
-                                               text=True,
-                                               stdout=subprocess.PIPE,
-                                               stderr=subprocess.STDOUT)
+            completed_process = subprocess.run(
+                arg_list, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
+            )
             # Write output to file before checking return code
             print(completed_process.stdout, file=out_stream)
             completed_process.check_returncode()
 
             # Read queue time and adjust concurrency
-            with open(csv_file, newline='') as csvfile:
+            with open(csv_file, newline="") as csvfile:
                 reader = csv.DictReader(csvfile)
                 for row in reader:
-                    current_queue_us = int(row['Server Queue'])
+                    current_queue_us = int(row["Server Queue"])
                     if current_queue_us < self.queue_latency_range_us_[0]:
                         self.concurrency_range_[2] = min(
-                            self.concurrency_range_[2] + 1,
-                            self.concurrency_range_[1])
+                            self.concurrency_range_[2] + 1, self.concurrency_range_[1]
+                        )
                     elif current_queue_us > self.queue_latency_range_us_[0]:
                         self.concurrency_range_[2] = max(
-                            self.concurrency_range_[2] - 1,
-                            self.concurrency_range_[0])
+                            self.concurrency_range_[2] - 1, self.concurrency_range_[0]
+                        )
                     break
-            m = re.search(r'Request count: ([0-9]+)', completed_process.stdout)
+            m = re.search(r"Request count: ([0-9]+)", completed_process.stdout)
             return int(m.group(1))
 
-    def __init__(self,
-                 name,
-                 rng,
-                 sequence_trials,
-                 identity_trials,
-                 queue_latency_range_us=(10000, 100000),
-                 sequence_id_range=None,
-                 verbose=False,
-                 out_stream=sys.stdout):
+    def __init__(
+        self,
+        name,
+        rng,
+        sequence_trials,
+        identity_trials,
+        queue_latency_range_us=(10000, 100000),
+        sequence_id_range=None,
+        verbose=False,
+        out_stream=sys.stdout,
+    ):
         super().__init__(name, [], verbose, out_stream)
         self.rng_ = rng
         self.sequence_id_range_ = sequence_id_range
@@ -194,8 +204,10 @@ def __init__(self,
 
         # Add no validation models
         self.options_.append(
-            PerfAnalyzerScenario.ModelOption("resnet_v1_50_graphdef_def", 32,
-                                             (1, 4, 1), queue_latency_range_us))
+            PerfAnalyzerScenario.ModelOption(
+                "resnet_v1_50_graphdef_def", 32, (1, 4, 1), queue_latency_range_us
+            )
+        )
         for trial in sequence_trials:
             dtype = self.get_datatype(trial)
             # Skip string sequence model for now, it is hard for PA to generate
@@ -204,8 +216,10 @@ def __init__(self,
                 continue
             model_name = tu.get_sequence_model_name(trial, dtype)
             self.options_.append(
-                PerfAnalyzerScenario.ModelOption(model_name, 1, (1, 4, 1),
-                                                 queue_latency_range_us))
+                PerfAnalyzerScenario.ModelOption(
+                    model_name, 1, (1, 4, 1), queue_latency_range_us
+                )
+            )
         for trial in identity_trials:
             dtype = np.float32
             model_name = tu.get_zero_model_name(trial, 1, dtype)
@@ -214,9 +228,10 @@ def __init__(self,
             else:
                 input_shapes = [("INPUT0", "16")]
             self.options_.append(
-                PerfAnalyzerScenario.ModelOption(model_name, 1, (1, 4, 1),
-                                                 queue_latency_range_us,
-                                                 input_shapes))
+                PerfAnalyzerScenario.ModelOption(
+                    model_name, 1, (1, 4, 1), queue_latency_range_us, input_shapes
+                )
+            )
 
         # Add output validation version of the models
         # Skip resnet as the output data has variation which makes exact
@@ -224,25 +239,31 @@ def __init__(self,
         for trial in sequence_trials:
             dtype = self.get_datatype(trial)
             model_name = tu.get_sequence_model_name(trial, dtype)
-            data_file = os.path.join("validation_data",
-                                     "{}.json".format(model_name))
+            data_file = os.path.join("validation_data", "{}.json".format(model_name))
             self.generate_sequence_data(trial, dtype, data_file)
             self.options_.append(
-                PerfAnalyzerScenario.ModelOption(model_name,
-                                                 1, (1, 4, 1),
-                                                 queue_latency_range_us,
-                                                 input_file=data_file))
+                PerfAnalyzerScenario.ModelOption(
+                    model_name,
+                    1,
+                    (1, 4, 1),
+                    queue_latency_range_us,
+                    input_file=data_file,
+                )
+            )
         for trial in identity_trials:
             dtype = np.float32
             model_name = tu.get_zero_model_name(trial, 1, dtype)
-            data_file = os.path.join("validation_data",
-                                     "{}.json".format(model_name))
+            data_file = os.path.join("validation_data", "{}.json".format(model_name))
             self.generate_identity_data(trial, dtype, data_file)
             self.options_.append(
-                PerfAnalyzerScenario.ModelOption(model_name,
-                                                 1, (1, 4, 1),
-                                                 queue_latency_range_us,
-                                                 input_file=data_file))
+                PerfAnalyzerScenario.ModelOption(
+                    model_name,
+                    1,
+                    (1, 4, 1),
+                    queue_latency_range_us,
+                    input_file=data_file,
+                )
+            )
 
     def generate_sequence_data(self, trial, dtype, data_filename):
         input0 = "INPUT" if "libtorch" not in trial else "INPUT__0"
@@ -255,8 +276,7 @@ def generate_sequence_data(self, trial, dtype, data_filename):
             elif dtype == np.dtype(object):
                 res = str(i)
             else:
-                raise Exception(
-                    "unexpected sequence data type {}".format(dtype))
+                raise Exception("unexpected sequence data type {}".format(dtype))
             input_data.append({input0: [res]})
         output0 = "OUTPUT" if "libtorch" not in trial else "OUTPUT__0"
         output_data = []
@@ -272,8 +292,7 @@ def generate_sequence_data(self, trial, dtype, data_filename):
                 elif dtype == np.dtype(object):
                     res = str(sum)
                 else:
-                    raise Exception(
-                        "unexpected sequence data type {}".format(dtype))
+                    raise Exception("unexpected sequence data type {}".format(dtype))
                 output_data.append({output0: [res]})
         else:
             for i in range(3):
@@ -285,17 +304,17 @@ def generate_sequence_data(self, trial, dtype, data_filename):
                 elif dtype == np.dtype(object):
                     res = str(res)
                 else:
-                    raise Exception(
-                        "unexpected sequence data type {}".format(dtype))
+                    raise Exception("unexpected sequence data type {}".format(dtype))
                 output_data.append(
-                    {output0: [res if dtype != np.dtype(object) else str(res)]})
+                    {output0: [res if dtype != np.dtype(object) else str(res)]}
+                )
         data = {"data": [input_data]}
         data["validation_data"] = [output_data]
 
         # Only write to a file if there isn't validation file for the model
         PerfAnalyzerScenario.generation_mutex_.acquire()
         if not os.path.exists(data_filename):
-            with open(data_filename, 'w') as f:
+            with open(data_filename, "w") as f:
                 json.dump(data, f)
         PerfAnalyzerScenario.generation_mutex_.release()
 
@@ -311,43 +330,26 @@ def generate_identity_data(self, trial, dtype, data_filename):
             elif dtype == np.dtype(object):
                 res = str(i)
             else:
-                raise Exception(
-                    "unexpected identity data type {}".format(dtype))
+                raise Exception("unexpected identity data type {}".format(dtype))
             io_data.append(res)
         data = {
-            "data": [{
-                input0: {
-                    "content": io_data,
-                    "shape": [16]
-                }
-            }],
-            "validation_data": [{
-                output0: {
-                    "content": io_data,
-                    "shape": [16]
-                }
-            }]
+            "data": [{input0: {"content": io_data, "shape": [16]}}],
+            "validation_data": [{output0: {"content": io_data, "shape": [16]}}],
         }
         # Only write to a file if there isn't validation file for the model
         PerfAnalyzerScenario.generation_mutex_.acquire()
         if not os.path.exists(data_filename):
-            with open(data_filename, 'w') as f:
+            with open(data_filename, "w") as f:
                 json.dump(data, f)
         PerfAnalyzerScenario.generation_mutex_.release()
 
     def run(self, client_metadata):
         model_option = np.random.choice(self.options_)
-        return model_option.run(self.name_, self.sequence_id_range_,
-                                self.out_stream_)
+        return model_option.run(self.name_, self.sequence_id_range_, self.out_stream_)
 
 
 class ResNetScenario(Scenario):
-
-    def __init__(self,
-                 name,
-                 batch_size=32,
-                 verbose=False,
-                 out_stream=sys.stdout):
+    def __init__(self, name, batch_size=32, verbose=False, out_stream=sys.stdout):
         super().__init__(name, [], verbose, out_stream)
         self.model_name_ = "resnet_v1_50_graphdef_def"
         self.batch_size_ = batch_size
@@ -360,7 +362,7 @@ def __init__(self,
 
     def preprocess(self, filename):
         img = Image.open(filename)
-        resized_img = img.convert('RGB').resize((224, 224), Image.BILINEAR)
+        resized_img = img.convert("RGB").resize((224, 224), Image.BILINEAR)
         np_img = np.array(resized_img).astype(np.float32)
         if np_img.ndim == 2:
             np_img = np_img[:, :, np.newaxis]
@@ -370,31 +372,35 @@ def preprocess(self, filename):
     def postprocess(self, results):
         output_array = results.as_numpy("resnet_v1_50/predictions/Softmax")
         if len(output_array) != self.batch_size_:
-            raise Exception("expected {} results, got {}".format(
-                self.batch_size_, len(output_array)))
+            raise Exception(
+                "expected {} results, got {}".format(
+                    self.batch_size_, len(output_array)
+                )
+            )
 
         for results in output_array:
             for result in results:
                 if output_array.dtype.type == np.object_:
-                    cls = "".join(chr(x) for x in result).split(':')
+                    cls = "".join(chr(x) for x in result).split(":")
                 else:
-                    cls = result.split(':')
+                    cls = result.split(":")
                 if cls[2] != "VULTURE":
                     raise Exception(
-                        "expected VULTURE as classification result, got {}".
-                        format(cls[2]))
+                        "expected VULTURE as classification result, got {}".format(
+                            cls[2]
+                        )
+                    )
 
     def run(self, client_metadata):
         triton_client = client_metadata[0]
 
-        inputs = [
-            grpcclient.InferInput("input", self.image_data_.shape, "FP32")
-        ]
+        inputs = [grpcclient.InferInput("input", self.image_data_.shape, "FP32")]
         inputs[0].set_data_from_numpy(self.image_data_)
 
         outputs = [
-            grpcclient.InferRequestedOutput("resnet_v1_50/predictions/Softmax",
-                                            class_count=1)
+            grpcclient.InferRequestedOutput(
+                "resnet_v1_50/predictions/Softmax", class_count=1
+            )
         ]
         res = triton_client.infer(self.model_name_, inputs, outputs=outputs)
         self.postprocess(res)
@@ -402,14 +408,15 @@ def run(self, client_metadata):
 
 
 class TimeoutScenario(Scenario):
-
-    def __init__(self,
-                 name,
-                 trials,
-                 input_dtype=np.float32,
-                 input_name="INPUT0",
-                 verbose=False,
-                 out_stream=sys.stdout):
+    def __init__(
+        self,
+        name,
+        trials,
+        input_dtype=np.float32,
+        input_name="INPUT0",
+        verbose=False,
+        out_stream=sys.stdout,
+    ):
         super().__init__(name, trials, verbose, out_stream)
         self.input_dtype_ = input_dtype
         self.input_name_ = input_name
@@ -422,12 +429,16 @@ def run(self, client_metadata):
         if "librotch" in trial:
             input_name = "INPUT__0"
 
-        tensor_shape = (math.trunc(1 * (1024 * 1024 * 1024) //
-                                   np.dtype(self.input_dtype_).itemsize),)
+        tensor_shape = (
+            math.trunc(
+                1 * (1024 * 1024 * 1024) // np.dtype(self.input_dtype_).itemsize
+            ),
+        )
         in0 = np.random.random(tensor_shape).astype(self.input_dtype_)
         inputs = [
-            grpcclient.InferInput(input_name, tensor_shape,
-                                  np_to_triton_dtype(self.input_dtype_)),
+            grpcclient.InferInput(
+                input_name, tensor_shape, np_to_triton_dtype(self.input_dtype_)
+            ),
         ]
         inputs[0].set_data_from_numpy(in0)
 
@@ -443,12 +454,11 @@ def run(self, client_metadata):
 
 
 class CrashingScenario(Scenario):
-
     def __init__(self, name, verbose=False, out_stream=sys.stdout):
         super().__init__(name, [], verbose, out_stream)
 
     def run(self, client_metadata):
-        # Only use "custom" model as it simulates exectuion delay which
+        # Only use "custom" model as it simulates execution delay which
         # simplifies "crashing simulation" (client exits while request is being
         # executed)
         trial = "custom"
@@ -456,8 +466,7 @@ def run(self, client_metadata):
         # Call the client as subprocess to avoid crashing stress test
         # and gather logging as string variable
         crashing_client = "crashing_client.py"
-        log = subprocess.check_output(
-            [sys.executable, crashing_client, "-t", trial])
+        log = subprocess.check_output([sys.executable, crashing_client, "-t", trial])
         result = self.parse_result(log.decode("utf-8"))
         if not result[1]:
             assert False, "crashing_client failed {}".format(self.name_)
@@ -472,22 +481,20 @@ def parse_result(self, log):
         if "request_count:" in log:
             idx_start = log.rindex("request_count:")
             idx_start = log.find(" ", idx_start)
-            idx_end = log.find('\n', idx_start)
-            request_count = int(log[idx_start + 1:idx_end])
+            idx_end = log.find("\n", idx_start)
+            request_count = int(log[idx_start + 1 : idx_end])
 
         if "live:" in log:
             idx_start = log.rindex("live:")
             idx_start = log.find(" ", idx_start)
-            idx_end = log.find('\n', idx_start)
-            is_server_live = log[idx_start + 1:idx_end]
+            idx_end = log.find("\n", idx_start)
+            is_server_live = log[idx_start + 1 : idx_end]
 
         return (request_count, is_server_live == "true")
 
 
 class SequenceScenario(Scenario):
-
     class UserData:
-
         def __init__(self):
             self._completed_requests = queue.Queue()
 
@@ -498,51 +505,63 @@ def __init__(self):
     def check_constraints(self, model_name, sequence_id):
         pass
 
-    def __init__(self,
-                 name,
-                 trials,
-                 rng,
-                 sequence_constraints,
-                 verbose=False,
-                 out_stream=sys.stdout):
+    def __init__(
+        self,
+        name,
+        trials,
+        rng,
+        sequence_constraints,
+        verbose=False,
+        out_stream=sys.stdout,
+    ):
         super().__init__(name, trials, verbose, out_stream)
         self.rng_ = rng
         self.sequence_constraints_ = sequence_constraints
 
     def get_expected_result(self, expected_result, value, trial, flag_str=None):
         # Adjust the expected_result for models that
-        # couldn't implement the full accumulator. See
+        # could not implement the full accumulator. See
         # qa/common/gen_qa_sequence_models.py for more
         # information.
-        if (("nobatch" not in trial and
-             ("custom" not in trial)) or ("graphdef" in trial) or
-            ("plan" in trial) or ("onnx" in trial)) or ("libtorch" in trial):
+        if (
+            ("nobatch" not in trial and ("custom" not in trial))
+            or ("graphdef" in trial)
+            or ("plan" in trial)
+            or ("onnx" in trial)
+        ) or ("libtorch" in trial):
             expected_result = value
             if (flag_str is not None) and ("start" in flag_str):
                 expected_result += 1
         return expected_result
 
-    def check_sequence_async(self,
-                             client_metadata,
-                             trial,
-                             model_name,
-                             input_dtype,
-                             steps,
-                             timeout_ms=DEFAULT_TIMEOUT_MS,
-                             batch_size=1,
-                             sequence_name="",
-                             tensor_shape=(1,),
-                             input_name="INPUT",
-                             output_name="OUTPUT"):
+    def check_sequence_async(
+        self,
+        client_metadata,
+        trial,
+        model_name,
+        input_dtype,
+        steps,
+        timeout_ms=DEFAULT_TIMEOUT_MS,
+        batch_size=1,
+        sequence_name="",
+        tensor_shape=(1,),
+        input_name="INPUT",
+        output_name="OUTPUT",
+    ):
         """Perform sequence of inferences using async run. The 'steps' holds
         a list of tuples, one for each inference with format:
 
         (flag_str, value, expected_result, delay_ms)
 
         """
-        if (("savedmodel" not in trial) and ("graphdef" not in trial) and
-            ("custom" not in trial) and ("onnx" not in trial) and
-            ("libtorch" not in trial) and ("plan" not in trial)):
+        if (
+            ("savedmodel" not in trial)
+            and ("graphdef" not in trial)
+            and ("custom" not in trial)
+            and ("onnx" not in trial)
+            and ("libtorch" not in trial)
+            and ("plan" not in trial)
+        ):
             assert False, "unknown trial type: " + trial
 
         if "nobatch" not in trial:
@@ -566,28 +585,30 @@ def check_sequence_async(self,
             seq_start = False
             seq_end = False
             if flag_str is not None:
-                seq_start = ("start" in flag_str)
-                seq_end = ("end" in flag_str)
+                seq_start = "start" in flag_str
+                seq_end = "end" in flag_str
 
             if input_dtype == np.object_:
                 in0 = np.full(tensor_shape, value, dtype=np.int32)
-                in0n = np.array([str(x) for x in in0.reshape(in0.size)],
-                                dtype=object)
+                in0n = np.array([str(x) for x in in0.reshape(in0.size)], dtype=object)
                 in0 = in0n.reshape(tensor_shape)
             else:
                 in0 = np.full(tensor_shape, value, dtype=input_dtype)
 
             inputs = [
-                grpcclient.InferInput(input_name, tensor_shape,
-                                      np_to_triton_dtype(input_dtype)),
+                grpcclient.InferInput(
+                    input_name, tensor_shape, np_to_triton_dtype(input_dtype)
+                ),
             ]
             inputs[0].set_data_from_numpy(in0)
 
-            triton_client.async_stream_infer(model_name,
-                                             inputs,
-                                             sequence_id=sequence_id,
-                                             sequence_start=seq_start,
-                                             sequence_end=seq_end)
+            triton_client.async_stream_infer(
+                model_name,
+                inputs,
+                sequence_id=sequence_id,
+                sequence_start=seq_start,
+                sequence_end=seq_end,
+            )
             sent_count += 1
 
             if delay_ms is not None:
@@ -608,49 +629,62 @@ def check_sequence_async(self,
                 if (now_ms - seq_start_ms) > timeout_ms:
                     raise TimeoutException(
                         "Timeout expired for {}, got {} ms".format(
-                            sequence_name, (now_ms - seq_start_ms)))
-
-            result = results.as_numpy(
-                output_name)[0] if "nobatch" in trial else results.as_numpy(
-                    output_name)[0][0]
+                            sequence_name, (now_ms - seq_start_ms)
+                        )
+                    )
+
+            result = (
+                results.as_numpy(output_name)[0]
+                if "nobatch" in trial
+                else results.as_numpy(output_name)[0][0]
+            )
             if self.verbose_:
-                print("{} {}: + {} = {}".format(sequence_name, sequence_id,
-                                                value, result),
-                      file=self.out_stream_)
+                print(
+                    "{} {}: + {} = {}".format(
+                        sequence_name, sequence_id, value, result
+                    ),
+                    file=self.out_stream_,
+                )
 
             if expected is not None:
                 if input_dtype == np.object_:
-                    assert int(
-                        result
-                    ) == expected, "{}: expected result {}, got {} {} {}".format(
-                        sequence_name, expected, int(result), trial, model_name)
+                    assert (
+                        int(result) == expected
+                    ), "{}: expected result {}, got {} {} {}".format(
+                        sequence_name, expected, int(result), trial, model_name
+                    )
                 else:
-                    assert result == expected, "{}: expected result {}, got {} {} {}".format(
-                        sequence_name, expected, result, trial, model_name)
+                    assert (
+                        result == expected
+                    ), "{}: expected result {}, got {} {} {}".format(
+                        sequence_name, expected, result, trial, model_name
+                    )
         triton_client.stop_stream()
         return sent_count
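
As a usage sketch (not part of the test itself), this is roughly how the run() methods below assemble the (flag_str, value, expected_result, delay_ms) tuples that check_sequence_async consumes; the sequence length and values are illustrative and assume the full-accumulator expectation, before any get_expected_result adjustment:

    import numpy as np

    rng = np.random.RandomState(5)
    values = rng.randint(0, 1024 * 1024, size=3).astype(np.int32)

    steps = []
    expected_result = 0
    for idx, val in enumerate(values):
        # "start" on the first inference, "end" on the last one
        flags = []
        if idx == 0:
            flags.append("start")
        if idx == len(values) - 1:
            flags.append("end")
        flag_str = ",".join(flags) if flags else None

        expected_result += int(val)
        # (flag_str, value, expected_result, delay_ms)
        steps.append((flag_str, int(val), expected_result, None))

    print(steps)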
 
 
 class SequenceNoEndScenario(SequenceScenario):
-
-    def __init__(self,
-                 name,
-                 trials,
-                 rng,
-                 sequence_constraints,
-                 verbose=False,
-                 out_stream=sys.stdout):
-        super().__init__(name, trials, rng, sequence_constraints, verbose,
-                         out_stream)
+    def __init__(
+        self,
+        name,
+        trials,
+        rng,
+        sequence_constraints,
+        verbose=False,
+        out_stream=sys.stdout,
+    ):
+        super().__init__(name, trials, rng, sequence_constraints, verbose, out_stream)
 
     def check_constraints(self, model_name, sequence_id):
         # The scenario can always be run regardless of the previous runs
         return True
 
-    def run(self,
-            client_metadata,
-            len_mean=SEQUENCE_LENGTH_MEAN,
-            len_stddev=SEQUENCE_LENGTH_STDEV):
+    def run(
+        self,
+        client_metadata,
+        len_mean=SEQUENCE_LENGTH_MEAN,
+        len_stddev=SEQUENCE_LENGTH_STDEV,
+    ):
         trial = self.get_trial()
         dtype = self.get_datatype(trial)
         model_name = tu.get_sequence_model_name(trial, dtype)
@@ -666,9 +700,10 @@ def run(self,
         # never ends. The sequence should be aborted by the server and its
         # slot reused for another sequence.
         seqlen = max(1, int(self.rng_.normal(len_mean, len_stddev)))
-        print("{} {}: no-end seqlen = {}".format(self.name_, client_metadata[1],
-                                                 seqlen),
-              file=self.out_stream_)
+        print(
+            "{} {}: no-end seqlen = {}".format(self.name_, client_metadata[1], seqlen),
+            file=self.out_stream_,
+        )
 
         values = self.rng_.randint(0, 1024 * 1024, size=seqlen).astype(dtype)
 
@@ -683,40 +718,42 @@ def run(self,
             val = values[idx]
             delay_ms = None
             expected_result += val
-            expected_result = self.get_expected_result(expected_result, val,
-                                                       trial, flags)
+            expected_result = self.get_expected_result(
+                expected_result, val, trial, flags
+            )
 
             # (flag_str, value, expected_result, delay_ms)
-            steps.append((flags, val, expected_result, delay_ms),)
+            steps.append(
+                (flags, val, expected_result, delay_ms),
+            )
 
-        return self.check_sequence_async(client_metadata,
-                                         trial,
-                                         model_name,
-                                         dtype,
-                                         steps,
-                                         sequence_name=self.name_)
+        return self.check_sequence_async(
+            client_metadata, trial, model_name, dtype, steps, sequence_name=self.name_
+        )
 
 
 class SequenceValidNoEndScenario(SequenceScenario):
-
-    def __init__(self,
-                 name,
-                 trials,
-                 rng,
-                 sequence_constraints,
-                 verbose=False,
-                 out_stream=sys.stdout):
-        super().__init__(name, trials, rng, sequence_constraints, verbose,
-                         out_stream)
+    def __init__(
+        self,
+        name,
+        trials,
+        rng,
+        sequence_constraints,
+        verbose=False,
+        out_stream=sys.stdout,
+    ):
+        super().__init__(name, trials, rng, sequence_constraints, verbose, out_stream)
 
     def check_constraints(self, model_name, sequence_id):
         # The scenario can always be run regardless of the previous runs
         return True
 
-    def run(self,
-            client_metadata,
-            len_mean=SEQUENCE_LENGTH_MEAN,
-            len_stddev=SEQUENCE_LENGTH_STDEV):
+    def run(
+        self,
+        client_metadata,
+        len_mean=SEQUENCE_LENGTH_MEAN,
+        len_stddev=SEQUENCE_LENGTH_STDEV,
+    ):
         trial = self.get_trial()
         dtype = self.get_datatype(trial)
         model_name = tu.get_sequence_model_name(trial, dtype)
@@ -733,15 +770,18 @@ def run(self,
         # sequences use the same correlation ID and are sent back-to-back.
         seqlen = [
             max(1, int(self.rng_.normal(len_mean, len_stddev))),
-            max(1, int(self.rng_.normal(len_mean, len_stddev)))
+            max(1, int(self.rng_.normal(len_mean, len_stddev))),
         ]
-        print("{} {}: valid-no-end seqlen[0] = {}, seqlen[1] = {}".format(
-            self.name_, client_metadata[1], seqlen[0], seqlen[1]),
-              file=self.out_stream_)
+        print(
+            "{} {}: valid-no-end seqlen[0] = {}, seqlen[1] = {}".format(
+                self.name_, client_metadata[1], seqlen[0], seqlen[1]
+            ),
+            file=self.out_stream_,
+        )
 
         values = [
             self.rng_.randint(0, 1024 * 1024, size=seqlen[0]).astype(dtype),
-            self.rng_.randint(0, 1024 * 1024, size=seqlen[1]).astype(dtype)
+            self.rng_.randint(0, 1024 * 1024, size=seqlen[1]).astype(dtype),
         ]
 
         for p in [0, 1]:
@@ -759,39 +799,41 @@ def run(self,
                 delay_ms = None
                 expected_result += val
                 expected_result = self.get_expected_result(
-                    expected_result, val, trial, flags)
+                    expected_result, val, trial, flags
+                )
 
                 # (flag_str, value, expected_result, delay_ms)
-                steps.append((flags, val, expected_result, delay_ms),)
+                steps.append(
+                    (flags, val, expected_result, delay_ms),
+                )
 
-        return self.check_sequence_async(client_metadata,
-                                         trial,
-                                         model_name,
-                                         dtype,
-                                         steps,
-                                         sequence_name=self.name_)
+        return self.check_sequence_async(
+            client_metadata, trial, model_name, dtype, steps, sequence_name=self.name_
+        )
 
 
 class SequenceValidValidScenario(SequenceScenario):
-
-    def __init__(self,
-                 name,
-                 trials,
-                 rng,
-                 sequence_constraints,
-                 verbose=False,
-                 out_stream=sys.stdout):
-        super().__init__(name, trials, rng, sequence_constraints, verbose,
-                         out_stream)
+    def __init__(
+        self,
+        name,
+        trials,
+        rng,
+        sequence_constraints,
+        verbose=False,
+        out_stream=sys.stdout,
+    ):
+        super().__init__(name, trials, rng, sequence_constraints, verbose, out_stream)
 
     def check_constraints(self, model_name, sequence_id):
         # The scenario can always be run regardless of the previous runs
         return True
 
-    def run(self,
-            client_metadata,
-            len_mean=SEQUENCE_LENGTH_MEAN,
-            len_stddev=SEQUENCE_LENGTH_STDEV):
+    def run(
+        self,
+        client_metadata,
+        len_mean=SEQUENCE_LENGTH_MEAN,
+        len_stddev=SEQUENCE_LENGTH_STDEV,
+    ):
         trial = self.get_trial()
         dtype = self.get_datatype(trial)
         model_name = tu.get_sequence_model_name(trial, dtype)
@@ -808,15 +850,18 @@ def run(self,
         # sent back-to-back.
         seqlen = [
             max(1, int(self.rng_.normal(len_mean, len_stddev))),
-            max(1, int(self.rng_.normal(len_mean, len_stddev)))
+            max(1, int(self.rng_.normal(len_mean, len_stddev))),
         ]
-        print("{} {}: valid-valid seqlen[0] = {}, seqlen[1] = {}".format(
-            self.name_, client_metadata[1], seqlen[0], seqlen[1]),
-              file=self.out_stream_)
+        print(
+            "{} {}: valid-valid seqlen[0] = {}, seqlen[1] = {}".format(
+                self.name_, client_metadata[1], seqlen[0], seqlen[1]
+            ),
+            file=self.out_stream_,
+        )
 
         values = [
             self.rng_.randint(0, 1024 * 1024, size=seqlen[0]).astype(dtype),
-            self.rng_.randint(0, 1024 * 1024, size=seqlen[1]).astype(dtype)
+            self.rng_.randint(0, 1024 * 1024, size=seqlen[1]).astype(dtype),
         ]
 
         for p in [0, 1]:
@@ -834,30 +879,30 @@ def run(self,
                 delay_ms = None
                 expected_result += val
                 expected_result = self.get_expected_result(
-                    expected_result, val, trial, flags)
+                    expected_result, val, trial, flags
+                )
 
                 # (flag_str, value, expected_result, delay_ms)
-                steps.append((flags, val, expected_result, delay_ms),)
+                steps.append(
+                    (flags, val, expected_result, delay_ms),
+                )
 
-        return self.check_sequence_async(client_metadata,
-                                         trial,
-                                         model_name,
-                                         dtype,
-                                         steps,
-                                         sequence_name=self.name_)
+        return self.check_sequence_async(
+            client_metadata, trial, model_name, dtype, steps, sequence_name=self.name_
+        )
 
 
 class SequenceNoStartScenario(SequenceScenario):
-
-    def __init__(self,
-                 name,
-                 trials,
-                 rng,
-                 sequence_constraints,
-                 verbose=False,
-                 out_stream=sys.stdout):
-        super().__init__(name, trials, rng, sequence_constraints, verbose,
-                         out_stream)
+    def __init__(
+        self,
+        name,
+        trials,
+        rng,
+        sequence_constraints,
+        verbose=False,
+        out_stream=sys.stdout,
+    ):
+        super().__init__(name, trials, rng, sequence_constraints, verbose, out_stream)
 
     def check_constraints(self, model_name, sequence_id):
         # no-start cannot follow no-end since the server will
@@ -865,7 +910,8 @@ def check_constraints(self, model_name, sequence_id):
         # the no-end sequence instead of being a sequence
         # missing start flag.
         if (model_name in self.sequence_constraints_) and (
-                sequence_id in self.sequence_constraints_[model_name]):
+            sequence_id in self.sequence_constraints_[model_name]
+        ):
             return not self.sequence_constraints_[model_name][sequence_id]
         return True
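
To make the bookkeeping concrete, here is a small self-contained sketch of the nested dictionary this check consults; the shape is inferred from the lookup above, and the model name and correlation IDs are hypothetical:

    # model name -> correlation ID -> "last run left this ID in a no-end sequence"
    is_last_used_no_end = {"plan_sequence_int32": {1001: True, 1002: False}}

    def can_run_no_start(constraints, model_name, sequence_id):
        if (model_name in constraints) and (sequence_id in constraints[model_name]):
            return not constraints[model_name][sequence_id]
        return True

    print(can_run_no_start(is_last_used_no_end, "plan_sequence_int32", 1001))  # False
    print(can_run_no_start(is_last_used_no_end, "plan_sequence_int32", 1002))  # True
    print(can_run_no_start(is_last_used_no_end, "plan_sequence_int32", 9999))  # True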
 
@@ -884,9 +930,12 @@ def run(self, client_metadata):
         # Create a sequence without a "start" flag. Sequence should get an
         # error from the server.
         seqlen = 1
-        print("{} {}: no-start seqlen = {}".format(self.name_,
-                                                   client_metadata[1], seqlen),
-              file=self.out_stream_)
+        print(
+            "{} {}: no-start seqlen = {}".format(
+                self.name_, client_metadata[1], seqlen
+            ),
+            file=self.out_stream_,
+        )
 
         values = self.rng_.randint(0, 1024 * 1024, size=seqlen).astype(dtype)
 
@@ -898,11 +947,12 @@ def run(self, client_metadata):
             delay_ms = None
 
             # (flag_str, value, expected_result, delay_ms)
-            steps.append((flags, val, None, delay_ms),)
+            steps.append(
+                (flags, val, None, delay_ms),
+            )
 
         try:
-            self.check_sequence_async(client_metadata, trial, model_name, dtype,
-                                      steps)
+            self.check_sequence_async(client_metadata, trial, model_name, dtype, steps)
             # Hit this point if sending no-start sequence to sequence id that
             # was used for no-end sequence and that means the constraints check
             # is inaccurate
@@ -915,25 +965,27 @@ def run(self, client_metadata):
 
 
 class SequenceValidScenario(SequenceScenario):
-
-    def __init__(self,
-                 name,
-                 trials,
-                 rng,
-                 sequence_constraints,
-                 verbose=False,
-                 out_stream=sys.stdout):
-        super().__init__(name, trials, rng, sequence_constraints, verbose,
-                         out_stream)
+    def __init__(
+        self,
+        name,
+        trials,
+        rng,
+        sequence_constraints,
+        verbose=False,
+        out_stream=sys.stdout,
+    ):
+        super().__init__(name, trials, rng, sequence_constraints, verbose, out_stream)
 
     def check_constraints(self, model_name, sequence_id):
         # The scenario can always be run regardless of the previous runs
         return True
 
-    def run(self,
-            client_metadata,
-            len_mean=SEQUENCE_LENGTH_MEAN,
-            len_stddev=SEQUENCE_LENGTH_STDEV):
+    def run(
+        self,
+        client_metadata,
+        len_mean=SEQUENCE_LENGTH_MEAN,
+        len_stddev=SEQUENCE_LENGTH_STDEV,
+    ):
         trial = self.get_trial()
         dtype = self.get_datatype(trial)
         model_name = tu.get_sequence_model_name(trial, dtype)
@@ -947,9 +999,10 @@ def run(self,
 
         # Create a variable length sequence with "start" and "end" flags.
         seqlen = max(1, int(self.rng_.normal(len_mean, len_stddev)))
-        print("{} {}: valid seqlen = {}".format(self.name_, client_metadata[1],
-                                                seqlen),
-              file=self.out_stream_)
+        print(
+            "{} {}: valid seqlen = {}".format(self.name_, client_metadata[1], seqlen),
+            file=self.out_stream_,
+        )
 
         values = self.rng_.randint(0, 1024 * 1024, size=seqlen).astype(dtype)
 
@@ -966,15 +1019,15 @@ def run(self,
             val = values[idx]
             delay_ms = None
             expected_result += val
-            expected_result = self.get_expected_result(expected_result, val,
-                                                       trial, flags)
+            expected_result = self.get_expected_result(
+                expected_result, val, trial, flags
+            )
 
             # (flag_str, value, expected_result, delay_ms)
-            steps.append((flags, val, expected_result, delay_ms),)
-
-        return self.check_sequence_async(client_metadata,
-                                         trial,
-                                         model_name,
-                                         dtype,
-                                         steps,
-                                         sequence_name=self.name_)
+            steps.append(
+                (flags, val, expected_result, delay_ms),
+            )
+
+        return self.check_sequence_async(
+            client_metadata, trial, model_name, dtype, steps, sequence_name=self.name_
+        )
diff --git a/qa/L0_long_running_stress/stress.py b/qa/L0_long_running_stress/stress.py
old mode 100644
new mode 100755
index a3713b4b0e..978f204ee6
--- a/qa/L0_long_running_stress/stress.py
+++ b/qa/L0_long_running_stress/stress.py
@@ -1,4 +1,6 @@
-# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -32,20 +34,20 @@
 
 import argparse
 import bisect
-from builtins import range
-from builtins import str
 import os
-import time
 import threading
+import time
 import traceback
-import numpy as np
+from builtins import range, str
 from functools import partial
-import tritonclient.grpc as grpcclient
+
+import numpy as np
 import prettytable
+import tritonclient.grpc as grpcclient
 
 FLAGS = None
 CORRELATION_ID_BLOCK_SIZE = 1024 * 1024
-BACKENDS = os.environ.get('BACKENDS', "graphdef savedmodel onnx plan")
+BACKENDS = os.environ.get("BACKENDS", "graphdef savedmodel onnx plan")
 
 _thread_exceptions = []
 _thread_exceptions_mutex = threading.Lock()
@@ -63,24 +65,26 @@
 def get_trials(is_sequence=True):
     _trials = ()
     if is_sequence:
-        for backend in BACKENDS.split(' '):
-            if (backend != "libtorch") and (backend != 'savedmodel'):
+        for backend in BACKENDS.split(" "):
+            if (backend != "libtorch") and (backend != "savedmodel"):
                 _trials += (backend + "_nobatch",)
             _trials += (backend,)
     else:
         _trials = ()
-        for backend in BACKENDS.split(' '):
-            if (backend != "libtorch"):
+        for backend in BACKENDS.split(" "):
+            if backend != "libtorch":
                 _trials += (backend + "_nobatch",)
     return _trials
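
For the default BACKENDS value above, the helper expands to the following tuples (a worked example of the loop, not additional test code):

    # BACKENDS = "graphdef savedmodel onnx plan"
    #
    # get_trials(True)  -> ("graphdef_nobatch", "graphdef",
    #                       "savedmodel",
    #                       "onnx_nobatch", "onnx",
    #                       "plan_nobatch", "plan")
    #
    # get_trials(False) -> ("graphdef_nobatch", "savedmodel_nobatch",
    #                       "onnx_nobatch", "plan_nobatch")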
 
 
-def update_test_count(test_case_count,
-                      failed_test_case_count,
-                      request_count,
-                      test_case_name,
-                      success=True,
-                      count=1):
+def update_test_count(
+    test_case_count,
+    failed_test_case_count,
+    request_count,
+    test_case_name,
+    success=True,
+    count=1,
+):
     if success:
         # Count the times each test case runs
         if test_case_name in test_case_count:
@@ -102,7 +106,6 @@ def update_test_count(test_case_count,
 
 
 class ScenarioSelector:
-
     def __init__(self, probs, rng):
         self.rng_ = rng
         self.probs_range_ = []
@@ -119,20 +122,24 @@ def __init__(self, probs, rng):
             self.probs_range_[i] /= total_weight
 
     def get_scenario(self):
-        return self.scenarios_[bisect.bisect_left(self.probs_range_,
-                                                  self.rng_.rand())]
+        return self.scenarios_[bisect.bisect_left(self.probs_range_, self.rng_.rand())]
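
A standalone sketch of the same cumulative-weight lookup, assuming the normalized running totals that the constructor appears to build; the weights and scenario labels below are illustrative only:

    import bisect

    import numpy as np

    # probs_range holds normalized running totals, so bisect_left maps a
    # uniform draw in [0, 1) to the matching scenario slot.
    weights = [60, 80, 300]
    scenarios = ["timeout", "resnet", "perf_analyzer"]

    total = float(sum(weights))
    probs_range = []
    running = 0
    for w in weights:
        running += w
        probs_range.append(running / total)

    rng = np.random.RandomState(0)
    r = rng.rand()
    print(r, scenarios[bisect.bisect_left(probs_range, r)])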
 
 
-def stress_thread(name, seed, correlation_id_base, test_case_count,
-                  failed_test_case_count, sequence_request_count):
+def stress_thread(
+    name,
+    seed,
+    correlation_id_base,
+    test_case_count,
+    failed_test_case_count,
+    sequence_request_count,
+):
     # Thread responsible for generating sequences of inference
     # requests.
     global _thread_exceptions
 
     # Write any thread output to dedicated file
-    with open("{}.log".format(name), 'w') as out_file:
-        print("Starting thread {} with seed {}".format(name, seed),
-              file=out_file)
+    with open("{}.log".format(name), "w") as out_file:
+        print("Starting thread {} with seed {}".format(name, seed), file=out_file)
         rng = np.random.RandomState(seed)
 
         # FIXME revisit to check if it is necessary
@@ -151,74 +158,111 @@ def stress_thread(name, seed, correlation_id_base, test_case_count,
         rare_cnt = 8
         is_last_used_no_end = {}
 
-        update_counter_fn = partial(update_test_count, test_case_count,
-                                    failed_test_case_count,
-                                    sequence_request_count)
+        update_counter_fn = partial(
+            update_test_count,
+            test_case_count,
+            failed_test_case_count,
+            sequence_request_count,
+        )
         for c in range(common_cnt + rare_cnt):
             client_metadata_list.append(
-                (grpcclient.InferenceServerClient("localhost:8001",
-                                                  verbose=FLAGS.verbose),
-                 correlation_id_base + c))
+                (
+                    grpcclient.InferenceServerClient(
+                        "localhost:8001", verbose=FLAGS.verbose
+                    ),
+                    correlation_id_base + c,
+                )
+            )
         pa_start_seq_id = correlation_id_base + common_cnt + rare_cnt
         pa_end_seq_id = correlation_id_base + CORRELATION_ID_BLOCK_SIZE
 
         # Weight roughly in thousandth percent
-        ss = ScenarioSelector([
-            (60,
-             TimeoutScenario(name,
-                             get_trials(False),
-                             verbose=FLAGS.verbose,
-                             out_stream=out_file)),
-            (80, ResNetScenario(
-                name, verbose=FLAGS.verbose, out_stream=out_file)),
-            (60,
-             CrashingScenario(name, verbose=FLAGS.verbose,
-                              out_stream=out_file)),
-            (62,
-             SequenceNoEndScenario(name,
-                                   get_trials(),
-                                   rng,
-                                   is_last_used_no_end,
-                                   verbose=FLAGS.verbose,
-                                   out_stream=out_file)),
-            (68,
-             SequenceValidNoEndScenario(name,
-                                        get_trials(),
-                                        rng,
-                                        is_last_used_no_end,
-                                        verbose=FLAGS.verbose,
-                                        out_stream=out_file)),
-            (68,
-             SequenceValidValidScenario(name,
-                                        get_trials(),
-                                        rng,
-                                        is_last_used_no_end,
-                                        verbose=FLAGS.verbose,
-                                        out_stream=out_file)),
-            (7,
-             SequenceNoStartScenario(name,
-                                     get_trials(),
-                                     rng,
-                                     is_last_used_no_end,
-                                     verbose=FLAGS.verbose,
-                                     out_stream=out_file)),
-            (295,
-             SequenceValidScenario(name,
-                                   get_trials(),
-                                   rng,
-                                   is_last_used_no_end,
-                                   verbose=FLAGS.verbose,
-                                   out_stream=out_file)),
-            (300,
-             PerfAnalyzerScenario(
-                 name,
-                 rng,
-                 get_trials(),
-                 get_trials(False),
-                 sequence_id_range=(pa_start_seq_id, pa_end_seq_id),
-                 verbose=FLAGS.verbose,
-                 out_stream=out_file)),
-        ], rng)
+        ss = ScenarioSelector(
+            [
+                (
+                    60,
+                    TimeoutScenario(
+                        name,
+                        get_trials(False),
+                        verbose=FLAGS.verbose,
+                        out_stream=out_file,
+                    ),
+                ),
+                (80, ResNetScenario(name, verbose=FLAGS.verbose, out_stream=out_file)),
+                (
+                    60,
+                    CrashingScenario(name, verbose=FLAGS.verbose, out_stream=out_file),
+                ),
+                (
+                    62,
+                    SequenceNoEndScenario(
+                        name,
+                        get_trials(),
+                        rng,
+                        is_last_used_no_end,
+                        verbose=FLAGS.verbose,
+                        out_stream=out_file,
+                    ),
+                ),
+                (
+                    68,
+                    SequenceValidNoEndScenario(
+                        name,
+                        get_trials(),
+                        rng,
+                        is_last_used_no_end,
+                        verbose=FLAGS.verbose,
+                        out_stream=out_file,
+                    ),
+                ),
+                (
+                    68,
+                    SequenceValidValidScenario(
+                        name,
+                        get_trials(),
+                        rng,
+                        is_last_used_no_end,
+                        verbose=FLAGS.verbose,
+                        out_stream=out_file,
+                    ),
+                ),
+                (
+                    7,
+                    SequenceNoStartScenario(
+                        name,
+                        get_trials(),
+                        rng,
+                        is_last_used_no_end,
+                        verbose=FLAGS.verbose,
+                        out_stream=out_file,
+                    ),
+                ),
+                (
+                    295,
+                    SequenceValidScenario(
+                        name,
+                        get_trials(),
+                        rng,
+                        is_last_used_no_end,
+                        verbose=FLAGS.verbose,
+                        out_stream=out_file,
+                    ),
+                ),
+                (
+                    300,
+                    PerfAnalyzerScenario(
+                        name,
+                        rng,
+                        get_trials(),
+                        get_trials(False),
+                        sequence_id_range=(pa_start_seq_id, pa_end_seq_id),
+                        verbose=FLAGS.verbose,
+                        out_stream=out_file,
+                    ),
+                ),
+            ],
+            rng,
+        )
 
         rare_idx = 0
         common_idx = 0
@@ -241,8 +285,9 @@ def stress_thread(name, seed, correlation_id_base, test_case_count,
                 update_counter_fn(scenario.scenario_name(), False)
                 _thread_exceptions_mutex.acquire()
                 try:
-                    _thread_exceptions.append((name, scenario.scenario_name(),
-                                               traceback.format_exc()))
+                    _thread_exceptions.append(
+                        (name, scenario.scenario_name(), traceback.format_exc())
+                    )
                 finally:
                     _thread_exceptions_mutex.release()
 
@@ -256,36 +301,52 @@ def stress_thread(name, seed, correlation_id_base, test_case_count,
         print("Exiting thread {}".format(name), file=out_file)
 
 
-def load_thread(name, seed, correlation_id_base, test_case_count,
-                failed_test_case_count, sequence_request_count):
+def load_thread(
+    name,
+    seed,
+    correlation_id_base,
+    test_case_count,
+    failed_test_case_count,
+    sequence_request_count,
+):
     # Thread responsible for generating sequences of inference
     # requests.
     global _thread_exceptions
 
     # Write any thread output to dedicated file
-    with open("{}.log".format(name), 'w') as out_file:
-        print("Starting thread {} with seed {}".format(name, seed),
-              file=out_file)
+    with open("{}.log".format(name), "w") as out_file:
+        print("Starting thread {} with seed {}".format(name, seed), file=out_file)
         rng = np.random.RandomState(seed)
 
-        update_counter_fn = partial(update_test_count, test_case_count,
-                                    failed_test_case_count,
-                                    sequence_request_count)
+        update_counter_fn = partial(
+            update_test_count,
+            test_case_count,
+            failed_test_case_count,
+            sequence_request_count,
+        )
         pa_start_seq_id = correlation_id_base
         pa_end_seq_id = correlation_id_base + CORRELATION_ID_BLOCK_SIZE
 
         # Create PerfAnalyzerScenario with no additional trial,
         # the default model 'resnet', more compute intense than the simple
         # models, will be the only choice for generating load
-        ss = ScenarioSelector([
-            (1,
-             PerfAnalyzerScenario(
-                 name,
-                 rng, [], [],
-                 sequence_id_range=(pa_start_seq_id, pa_end_seq_id),
-                 verbose=FLAGS.verbose,
-                 out_stream=out_file)),
-        ], rng)
+        ss = ScenarioSelector(
+            [
+                (
+                    1,
+                    PerfAnalyzerScenario(
+                        name,
+                        rng,
+                        [],
+                        [],
+                        sequence_id_range=(pa_start_seq_id, pa_end_seq_id),
+                        verbose=FLAGS.verbose,
+                        out_stream=out_file,
+                    ),
+                ),
+            ],
+            rng,
+        )
 
         while not STOP_STRESS_THREAD:
             scenario = ss.get_scenario()
@@ -297,8 +358,9 @@ def load_thread(name, seed, correlation_id_base, test_case_count,
                 update_counter_fn(scenario.scenario_name(), False)
                 _thread_exceptions_mutex.acquire()
                 try:
-                    _thread_exceptions.append((name, scenario.scenario_name(),
-                                               traceback.format_exc()))
+                    _thread_exceptions.append(
+                        (name, scenario.scenario_name(), traceback.format_exc())
+                    )
                 finally:
                     _thread_exceptions_mutex.release()
 
@@ -333,47 +395,45 @@ def accumulate_count(dict_list, test_case_name):
     return count
 
 
-def generate_report(elapsed_time, _test_case_count, _failed_test_case_count,
-                    _sequence_request_count):
+def generate_report(
+    elapsed_time, _test_case_count, _failed_test_case_count, _sequence_request_count
+):
     hrs = elapsed_time // 3600
     mins = (elapsed_time / 60) % 60
     secs = elapsed_time % 60
 
     test_case_description = {
-        'SequenceValidScenario':
-            'Send a sequence with "start" and "end" flags.',
-        'SequenceValidValidScenario':
-            'Send two sequences back to back using the same correlation ID'
-            ' with "start" and "end" flags.',
-        'SequenceValidNoEndScenario':
-            'Send two sequences back to back using the same correlation ID.'
-            ' The first with "start" and "end" flags, and the second with no'
-            ' "end" flag.',
-        'SequenceNoStartScenario':
-            'Send a sequence without a "start" flag. Sequence should get an'
-            ' error from the server.',
-        'SequenceNoEndScenario':
-            'Send a sequence with "start" flag but that never ends. The'
-            ' sequence should be aborted by the server and its slot reused'
-            ' for another sequence.',
-        'TimeoutScenario':
-            'Expect an exception for small timeout values.',
-        'ResNetScenario':
-            'Send a request using resnet model.',
-        'CrashingScenario':
-            'Client crashes in the middle of inferences.',
-        'PerfAnalyzerScenario':
-            'Client that maintains a specific load.',
+        "SequenceValidScenario": 'Send a sequence with "start" and "end" flags.',
+        "SequenceValidValidScenario": "Send two sequences back to back using the same correlation ID"
+        ' with "start" and "end" flags.',
+        "SequenceValidNoEndScenario": "Send two sequences back to back using the same correlation ID."
+        ' The first with "start" and "end" flags, and the second with no'
+        ' "end" flag.',
+        "SequenceNoStartScenario": 'Send a sequence without a "start" flag. Sequence should get an'
+        " error from the server.",
+        "SequenceNoEndScenario": 'Send a sequence with "start" flag but that never ends. The'
+        " sequence should be aborted by the server and its slot reused"
+        " for another sequence.",
+        "TimeoutScenario": "Expect an exception for small timeout values.",
+        "ResNetScenario": "Send a request using resnet model.",
+        "CrashingScenario": "Client crashes in the middle of inferences.",
+        "PerfAnalyzerScenario": "Client that maintains a specific load.",
     }
 
     f = open("stress_report.txt", "w")
-    f.write("Test Duration: {:0>2}:{:0>2}:{:0>2} (HH:MM:SS)\n".format(
-        int(hrs), int(mins), int(secs)))
+    f.write(
+        "Test Duration: {:0>2}:{:0>2}:{:0>2} (HH:MM:SS)\n".format(
+            int(hrs), int(mins), int(secs)
+        )
+    )
 
     t = prettytable.PrettyTable(hrules=prettytable.ALL)
     t.field_names = [
-        'Test Case', 'Number of Failures', 'Test Count', 'Request Count',
-        'Test Case Description'
+        "Test Case",
+        "Number of Failures",
+        "Test Count",
+        "Request Count",
+        "Test Case Description",
     ]
 
     t.align["Test Case"] = "l"
@@ -389,33 +449,38 @@ def generate_report(elapsed_time, _test_case_count, _failed_test_case_count,
     for c in test_case_description:
         # Accumulate all the individual thread counts
         acc_test_case_count[c] = accumulate_count(_test_case_count, c)
-        acc_failed_test_case_count[c] = accumulate_count(
-            _failed_test_case_count, c)
-        acc_sequence_request_count[c] = accumulate_count(
-            _sequence_request_count, c)
+        acc_failed_test_case_count[c] = accumulate_count(_failed_test_case_count, c)
+        acc_sequence_request_count[c] = accumulate_count(_sequence_request_count, c)
 
         description = test_case_description[c]
         # Add additional description on scenarios that allow failure
         if c in ALLOW_FAILURE_SCENARIO:
-            description += " Note that this scenario is marked to allow " \
-                           "failure due to subtle edge cases that will be " \
-                           "investigated in the future. However, only a " \
-                           "minimal failure count is expected and we should " \
-                           "take action if the number is concerning."
-        t.add_row([
-            c, acc_failed_test_case_count[c] if c in acc_failed_test_case_count
-            else 0, acc_test_case_count[c] if c in acc_test_case_count else 0,
-            acc_sequence_request_count[c]
-            if c in acc_sequence_request_count else 0,
-            format_content(description, 50)
-        ])
-
-    t.add_row([
-        'TOTAL',
-        sum(acc_failed_test_case_count.values()),
-        sum(acc_test_case_count.values()),
-        sum(acc_sequence_request_count.values()), 'X'
-    ])
+            description += (
+                " Note that this scenario is marked to allow "
+                "failure due to subtle edge cases that will be "
+                "investigated in the future. However, only a "
+                "minimal failure count is expected and we should "
+                "take action if the number is concerning."
+            )
+        t.add_row(
+            [
+                c,
+                acc_failed_test_case_count[c] if c in acc_failed_test_case_count else 0,
+                acc_test_case_count[c] if c in acc_test_case_count else 0,
+                acc_sequence_request_count[c] if c in acc_sequence_request_count else 0,
+                format_content(description, 50),
+            ]
+        )
+
+    t.add_row(
+        [
+            "TOTAL",
+            sum(acc_failed_test_case_count.values()),
+            sum(acc_test_case_count.values()),
+            sum(acc_sequence_request_count.values()),
+            "X",
+        ]
+    )
 
     print(t)
     f.write(str(t))
@@ -423,43 +488,48 @@ def generate_report(elapsed_time, _test_case_count, _failed_test_case_count,
     f.close()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-v',
-                        '--verbose',
-                        action="store_true",
-                        required=False,
-                        default=False,
-                        help='Enable verbose output')
-    parser.add_argument('-r',
-                        '--random-seed',
-                        type=int,
-                        required=False,
-                        help='Random seed.')
-    parser.add_argument('-t',
-                        '--concurrency',
-                        type=int,
-                        required=False,
-                        default=8,
-                        help='Request concurrency. Default is 8.')
-    parser.add_argument('--load-thread',
-                        type=int,
-                        required=False,
-                        default=0,
-                        help='Number of dedicated threads that keep compute '
-                        'device (i.e. GPU/CPUs) under load. The load generated '
-                        'from "--concurrency" often behaves as request spike, '
-                        ' this argument may be used to produce consistent load '
-                        ' to keep devices at high utilization. Default is 0, '
-                        'which means no dedicated load thread will be created.')
     parser.add_argument(
-        '-d',
-        '--test-duration',
+        "-v",
+        "--verbose",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Enable verbose output",
+    )
+    parser.add_argument(
+        "-r", "--random-seed", type=int, required=False, help="Random seed."
+    )
+    parser.add_argument(
+        "-t",
+        "--concurrency",
+        type=int,
+        required=False,
+        default=8,
+        help="Request concurrency. Default is 8.",
+    )
+    parser.add_argument(
+        "--load-thread",
+        type=int,
+        required=False,
+        default=0,
+        help="Number of dedicated threads that keep compute "
+        "device (i.e. GPU/CPUs) under load. The load generated "
+        'from "--concurrency" often behaves as request spike, '
+        " this argument may be used to produce consistent load "
+        " to keep devices at high utilization. Default is 0, "
+        "which means no dedicated load thread will be created.",
+    )
+    parser.add_argument(
+        "-d",
+        "--test-duration",
         type=int,
         required=False,
         default=25000,
-        help='Duration of stress test to run. Default is 25000 seconds ' +
-        '(approximately 7 hours).')
+        help="Duration of stress test to run. Default is 25000 seconds "
+        + "(approximately 7 hours).",
+    )
     FLAGS = parser.parse_args()
 
     # Initialize the random seed. For reproducibility each thread
@@ -476,9 +546,7 @@ def generate_report(elapsed_time, _test_case_count, _failed_test_case_count,
     print("test duration = {}".format(FLAGS.test_duration))
 
     # Create hashes for each thread for generating report
-    _test_case_count = [
-        dict() for _ in range(FLAGS.concurrency + FLAGS.load_thread)
-    ]
+    _test_case_count = [dict() for _ in range(FLAGS.concurrency + FLAGS.load_thread)]
     _failed_test_case_count = [
         dict() for _ in range(FLAGS.concurrency + FLAGS.load_thread)
     ]
@@ -501,11 +569,18 @@ def generate_report(elapsed_time, _test_case_count, _failed_test_case_count,
         correlation_id_base = 1 + (idx * CORRELATION_ID_BLOCK_SIZE)
 
         threads.append(
-            threading.Thread(target=stress_thread,
-                             args=(thread_name, seed, correlation_id_base,
-                                   _test_case_count[idx],
-                                   _failed_test_case_count[idx],
-                                   _sequence_request_count[idx])))
+            threading.Thread(
+                target=stress_thread,
+                args=(
+                    thread_name,
+                    seed,
+                    correlation_id_base,
+                    _test_case_count[idx],
+                    _failed_test_case_count[idx],
+                    _sequence_request_count[idx],
+                ),
+            )
+        )
 
     for idx in range(FLAGS.load_thread):
         thread_name = "load_thread_{}".format(idx)
@@ -518,14 +593,22 @@ def generate_report(elapsed_time, _test_case_count, _failed_test_case_count,
         # Each thread is reserved a block of correlation IDs or size
         # CORRELATION_ID_BLOCK_SIZE
         correlation_id_base = 1 + (
-            (FLAGS.concurrency + idx) * CORRELATION_ID_BLOCK_SIZE)
+            (FLAGS.concurrency + idx) * CORRELATION_ID_BLOCK_SIZE
+        )
 
         threads.append(
-            threading.Thread(target=load_thread,
-                             args=(thread_name, seed, correlation_id_base,
-                                   _test_case_count[idx],
-                                   _failed_test_case_count[idx],
-                                   _sequence_request_count[idx])))
+            threading.Thread(
+                target=load_thread,
+                args=(
+                    thread_name,
+                    seed,
+                    correlation_id_base,
+                    _test_case_count[idx],
+                    _failed_test_case_count[idx],
+                    _sequence_request_count[idx],
+                ),
+            )
+        )
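
A worked example of the correlation ID partitioning used above, with illustrative values for --concurrency and --load-thread:

    CORRELATION_ID_BLOCK_SIZE = 1024 * 1024

    concurrency, load_threads = 8, 2
    stress_bases = [1 + i * CORRELATION_ID_BLOCK_SIZE for i in range(concurrency)]
    load_bases = [
        1 + (concurrency + i) * CORRELATION_ID_BLOCK_SIZE for i in range(load_threads)
    ]

    print(stress_bases[0], stress_bases[1])  # 1, 1048577
    print(load_bases[0])                     # 1 + 8 * 1048576 = 8388609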
 
     exit_code = 0
 
@@ -551,15 +634,18 @@ def generate_report(elapsed_time, _test_case_count, _failed_test_case_count,
         if t.is_alive() and (exit_code == 0):
             exit_code = 1
 
-    generate_report(time.time() - start_time, _test_case_count,
-                    _failed_test_case_count, _sequence_request_count)
+    generate_report(
+        time.time() - start_time,
+        _test_case_count,
+        _failed_test_case_count,
+        _sequence_request_count,
+    )
 
     _thread_exceptions_mutex.acquire()
     try:
         if len(_thread_exceptions) > 0:
             for thread, scenario, ex in _thread_exceptions:
-                print("*********\n* {} {}\n{}*********\n".format(
-                    thread, scenario, ex))
+                print("*********\n* {} {}\n{}*********\n".format(thread, scenario, ex))
                 if scenario not in ALLOW_FAILURE_SCENARIO:
                     exit_code = 1
     finally:
diff --git a/qa/L0_long_running_stress/stress_mail.py b/qa/L0_long_running_stress/stress_mail.py
old mode 100644
new mode 100755
index e240e2a354..36f347c2ac
--- a/qa/L0_long_running_stress/stress_mail.py
+++ b/qa/L0_long_running_stress/stress_mail.py
@@ -30,21 +30,33 @@
 sys.path.append("../common")
 
 import os
-import nightly_email_helper
-
 from datetime import date
 
-CI_JOB_ID = os.environ.get('CI_JOB_ID', '')
+import nightly_email_helper
+
+CI_JOB_ID = os.environ.get("CI_JOB_ID", "")
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     today = date.today().strftime("%Y-%m-%d")
-    subject = "Triton Long-Running Stress Test " + \
-        ((sys.argv[1] + " ") if len(sys.argv) >= 2 else "") + "Summary: " + today
+    subject = (
+        "Triton Long-Running Stress Test "
+        + ((sys.argv[1] + " ") if len(sys.argv) >= 2 else "")
+        + "Summary: "
+        + today
+    )
     stress_report = "stress_report.txt"
     link = "https://gitlab-master.nvidia.com/dl/dgx/tritonserver/-/jobs/" + CI_JOB_ID
     write_up = "

The table below includes results from long-running stress test. Please refer to the description of each test case to see what different kinds of inference requests were sent. Request concurrency is set to 8.

" - write_up += "

Please check the CI output webpage for the details of the failures: " + link + "

" - html_content = "
" + write_up + "
"
+    write_up += (
+        "

Please check the CI output webpage for the details of the failures: " + + link + + "

" + ) + html_content = ( + '
'
+        + write_up
+        + '
'
+    )
     with open(stress_report, "r") as f:
         html_content += f.read() + "\n"
     html_content += "
" diff --git a/qa/L0_memory/test.sh b/qa/L0_memory/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_memory_growth/busy_op_test.py b/qa/L0_memory_growth/busy_op_test.py old mode 100644 new mode 100755 index 537c328047..2814f38d8c --- a/qa/L0_memory_growth/busy_op_test.py +++ b/qa/L0_memory_growth/busy_op_test.py @@ -27,56 +27,63 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse -import numpy as np from builtins import range + +import numpy as np import tritongrpcclient as grpcclient import tritonhttpclient as httpclient from tritonclientutils import np_to_triton_dtype FLAGS = None -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - default='localhost:8000', - help='Inference server URL. Default is localhost:8000.') parser.add_argument( - '-i', - '--protocol', + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-u", + "--url", type=str, required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') - parser.add_argument('-m', - '--model', - type=str, - required=True, - help='Name of model.') - parser.add_argument('-n', - '--num-requests', - type=int, - required=True, - help='Number of asynchronous requests to launch.') - parser.add_argument('-d', - '--delay', - type=int, - required=True, - help='Number of delay cycles to use as input to model.') + default="localhost:8000", + help="Inference server URL. Default is localhost:8000.", + ) + parser.add_argument( + "-i", + "--protocol", + type=str, + required=False, + default="http", + help='Protocol ("http"/"grpc") used to ' + + 'communicate with inference service. 
Default is "http".', + ) + parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.") + parser.add_argument( + "-n", + "--num-requests", + type=int, + required=True, + help="Number of asynchronous requests to launch.", + ) + parser.add_argument( + "-d", + "--delay", + type=int, + required=True, + help="Number of delay cycles to use as input to model.", + ) FLAGS = parser.parse_args() if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) + print( + 'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol) + ) exit(1) client_util = httpclient if FLAGS.protocol == "http" else grpcclient @@ -94,8 +101,9 @@ input_data = np.array([FLAGS.delay], dtype=np.int32) inputs = [ - client_util.InferInput("in", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + client_util.InferInput( + "in", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) diff --git a/qa/L0_memory_growth/server_memory_mail.py b/qa/L0_memory_growth/server_memory_mail.py old mode 100644 new mode 100755 index ef57c8732e..d1307d97a6 --- a/qa/L0_memory_growth/server_memory_mail.py +++ b/qa/L0_memory_growth/server_memory_mail.py @@ -29,19 +29,23 @@ sys.path.append("../common") -import nightly_email_helper - import glob from datetime import date -if __name__ == '__main__': +import nightly_email_helper + +if __name__ == "__main__": today = date.today().strftime("%Y-%m-%d") subject = "Triton Server Memory Growth " + sys.argv[1] + " Summary: " + today memory_graphs_resnet = glob.glob("memory_growth_resnet*.log") memory_graphs_busyop = glob.glob("memory_growth_busyop.log") write_up = "

This test uses perf_analyzer as clients running on 4 different models. The max allowed difference between mean and maximum memory usage is set to 150MB."
     write_up += "• What to look for A linear memory growth in the beginning of the graph is acceptable only when it is followed by a flat memory usage. If a linear memory growth is observed during the entire test then there is possibly a memory leak."
-    html_content = "" + write_up + ""
+    html_content = (
+        ''
+        + write_up
+        + ''
+    )
     for mem_graph in sorted(memory_graphs_resnet):
         html_content += "\n" + mem_graph + "\n"
         with open(mem_graph, "r") as f:
@@ -52,12 +56,18 @@
     # When we see PTX failures in CI, the busyop memory graph is not created.
     if len(memory_graphs_busyop):
         write_up = "

• What to look for
The memory usage should increase continually over time, and a linear growth should be observed in the graph below.

" - html_content += "
" + write_up + "
"
+        html_content += (
+            '
'
+            + write_up
+            + '
'
+        )
         for mem_graph in sorted(memory_graphs_busyop):
             html_content += "\n" + mem_graph + "\n"
             with open(mem_graph, "r") as f:
                 html_content += f.read() + "\n"
     else:
-        html_content += "

The busyop model caused PTX failures when running the CI.

" + html_content += ( + "

The busyop model caused PTX failures when running the CI.

" + ) html_content += "
" nightly_email_helper.send(subject, html_content, is_html=True) diff --git a/qa/L0_metrics/metrics_test.py b/qa/L0_metrics/metrics_test.py index 36d732cdfa..13efdb0d10 100755 --- a/qa/L0_metrics/metrics_test.py +++ b/qa/L0_metrics/metrics_test.py @@ -27,32 +27,38 @@ import os import sys + sys.path.append("../common") -import requests import unittest + +import requests import test_util as tu INF_COUNTER_PATTERNS = [ - 'nv_inference_request_duration', 'nv_inference_queue_duration', - 'nv_inference_compute_input_duration', - 'nv_inference_compute_infer_duration', - 'nv_inference_compute_output_duration' + "nv_inference_request_duration", + "nv_inference_queue_duration", + "nv_inference_compute_input_duration", + "nv_inference_compute_infer_duration", + "nv_inference_compute_output_duration", ] INF_SUMMARY_PATTERNS = [ - 'nv_inference_request_summary', 'nv_inference_queue_summary', - 'nv_inference_compute_input_summary', 'nv_inference_compute_infer_summary', - 'nv_inference_compute_output_summary' + "nv_inference_request_summary", + "nv_inference_queue_summary", + "nv_inference_compute_input_summary", + "nv_inference_compute_infer_summary", + "nv_inference_compute_output_summary", ] CACHE_COUNTER_PATTERNS = [ - 'nv_cache_num_hits_per_model', 'nv_cache_num_misses_per_model', - 'nv_cache_hit_duration_per_model', 'nv_cache_miss_duration_per_model' + "nv_cache_num_hits_per_model", + "nv_cache_num_misses_per_model", + "nv_cache_hit_duration_per_model", + "nv_cache_miss_duration_per_model", ] -CACHE_SUMMARY_PATTERNS = ['nv_cache_hit_summary', 'nv_cache_miss_summary'] +CACHE_SUMMARY_PATTERNS = ["nv_cache_hit_summary", "nv_cache_miss_summary"] class MetricsTest(tu.TestResultCollector): - def _get_metrics(self): metrics_url = "http://localhost:8002/metrics" r = requests.get(metrics_url) @@ -111,7 +117,7 @@ def test_summaries_custom_quantiles(self): print(metrics) for quantile in quantiles: print(quantile) - self.assertIn(f"quantile=\"{quantile}\"", metrics) + self.assertIn(f'quantile="{quantile}"', metrics) # DLIS-4762: Disable request summary when caching enabled for now def test_inf_summaries_exist_with_cache(self): @@ -124,5 +130,5 @@ def test_inf_summaries_exist_with_cache(self): self.assertNotIn(metric, metrics) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_metrics/test.sh b/qa/L0_metrics/test.sh index 6ad17ec6ed..d0cf0193ae 100755 --- a/qa/L0_metrics/test.sh +++ b/qa/L0_metrics/test.sh @@ -290,7 +290,7 @@ python3 ${PYTHON_TEST} MetricsTest.test_summaries_custom_quantiles 2>&1 | tee ${ check_unit_test kill $SERVER_PID wait $SERVER_PID - + if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else diff --git a/qa/L0_mlflow/plugin_test.py b/qa/L0_mlflow/plugin_test.py old mode 100644 new mode 100755 index 65a7cbb248..a5d87a3c19 --- a/qa/L0_mlflow/plugin_test.py +++ b/qa/L0_mlflow/plugin_test.py @@ -30,49 +30,49 @@ sys.path.append("../common") +import json import unittest + +import numpy as np import test_util as tu from mlflow.deployments import get_deploy_client -import json -import numpy as np class PluginTest(tu.TestResultCollector): - def setUp(self): - self.client_ = get_deploy_client('triton') + self.client_ = get_deploy_client("triton") def _validate_deployment(self, model_name): # create - self.client_.create_deployment(model_name, - "models:/{}/1".format(model_name), - flavor="onnx") + self.client_.create_deployment( + model_name, "models:/{}/1".format(model_name), flavor="onnx" + ) # list deployment_list = 
self.client_.list_deployments() self.assertEqual(len(deployment_list), 1) - self.assertEqual(deployment_list[0]['name'], model_name) + self.assertEqual(deployment_list[0]["name"], model_name) # get deployment = self.client_.get_deployment(model_name) - self.assertEqual(deployment['name'], model_name) + self.assertEqual(deployment["name"], model_name) # predict inputs = {} with open("./mlflow-triton-plugin/examples/input.json", "r") as f: input_json = json.load(f) - for key, value in input_json['inputs'].items(): + for key, value in input_json["inputs"].items(): inputs[key] = np.array(value, dtype=np.float32) output = self.client_.predict(model_name, inputs) - with open("./mlflow-triton-plugin/examples/expected_output.json", - "r") as f: + with open("./mlflow-triton-plugin/examples/expected_output.json", "r") as f: output_json = json.load(f) - for key, value in output_json['outputs'].items(): + for key, value in output_json["outputs"].items(): np.testing.assert_allclose( - output['outputs'][key], + output["outputs"][key], np.array(value, dtype=np.int32), - err_msg='Inference result is not correct') + err_msg="Inference result is not correct", + ) # delete self.client_.delete_deployment(model_name) @@ -81,13 +81,12 @@ def test_onnx_flavor(self): # Log the ONNX model to MLFlow import mlflow.onnx import onnx + model = onnx.load( "./mlflow-triton-plugin/examples/onnx_float32_int32_int32/1/model.onnx" ) # Use a different name to ensure the plugin operates on correct model - mlflow.onnx.log_model(model, - "triton", - registered_model_name="onnx_model") + mlflow.onnx.log_model(model, "triton", registered_model_name="onnx_model") self._validate_deployment("onnx_model") @@ -95,24 +94,28 @@ def test_onnx_flavor_with_files(self): # Log the ONNX model and additional Triton config file to MLFlow import mlflow.onnx import onnx + model = onnx.load( "./mlflow-triton-plugin/examples/onnx_float32_int32_int32/1/model.onnx" ) - config_path = "./mlflow-triton-plugin/examples/onnx_float32_int32_int32/config.pbtxt" + config_path = ( + "./mlflow-triton-plugin/examples/onnx_float32_int32_int32/config.pbtxt" + ) # Use a different name to ensure the plugin operates on correct model - mlflow.onnx.log_model(model, - "triton", - registered_model_name="onnx_model_with_files") + mlflow.onnx.log_model( + model, "triton", registered_model_name="onnx_model_with_files" + ) mlflow.log_artifact(config_path, "triton") self._validate_deployment("onnx_model_with_files") # Check if the additional files are properly copied import filecmp + self.assertTrue( - filecmp.cmp(config_path, - "./models/onnx_model_with_files/config.pbtxt")) + filecmp.cmp(config_path, "./models/onnx_model_with_files/config.pbtxt") + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_mlflow/test.sh b/qa/L0_mlflow/test.sh index 2ea7980735..4b5205ba25 100755 --- a/qa/L0_mlflow/test.sh +++ b/qa/L0_mlflow/test.sh @@ -32,12 +32,12 @@ source ../common/util.sh rm -fr *.log *.json # The default version of python 3.10.6 included in -# Ubuntu 22.04 installs blinker 1.4. This doesn't -# work with the awscli which we try to install. -# Uninstalling blinker and allowing pip to install blinker 1.6 -# fixes this issue. The alternative to this is to +# Ubuntu 22.04 installs blinker 1.4. This doesn't +# work with the awscli which we try to install. +# Uninstalling blinker and allowing pip to install blinker 1.6 +# fixes this issue. 
The alternative to this is to # install a higher version of python which uses blinker 1.6, -# but it is unknown whether this test should rely on +# but it is unknown whether this test should rely on # the default installation of python. apt remove -y python3-blinker diff --git a/qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/model.py b/qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/model.py old mode 100644 new mode 100755 index 2810cd9b90..9c5e99e49e --- a/qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(4) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/model.py b/qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/model.py old mode 100644 new mode 100755 index 80e9f9d59c..f617ac6faf --- a/qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(4) auto_complete_model_config.set_dynamic_batching() diff --git a/qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/model.py b/qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/model.py old mode 100644 new mode 100755 index a5e02161f6..ef915705e6 --- a/qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/input_missing_dims/model.py b/qa/L0_model_config/autofill_noplatform/python/input_missing_dims/model.py old mode 100644 new mode 100755 index 02a29b9a16..b5f3a0c9fc --- a/qa/L0_model_config/autofill_noplatform/python/input_missing_dims/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/input_missing_dims/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32'} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32"} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/input_missing_name/model.py b/qa/L0_model_config/autofill_noplatform/python/input_missing_name/model.py old mode 100644 new mode 100755 index 10492cc438..78ba70742c --- a/qa/L0_model_config/autofill_noplatform/python/input_missing_name/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/input_missing_name/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/input_wrong_property/model.py b/qa/L0_model_config/autofill_noplatform/python/input_wrong_property/model.py old mode 100644 new mode 100755 index 037339a091..6a83d9fcbd --- a/qa/L0_model_config/autofill_noplatform/python/input_wrong_property/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/input_wrong_property/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,18 +28,17 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} input1 = { - 'name': 'INPUT1', - 'data_type': 'TYPE_FP32', - 'dims': [4], - 'is_shape_tensor:': True + "name": "INPUT1", + "data_type": "TYPE_FP32", + "dims": [4], + "is_shape_tensor:": True, } - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/no_return/model.py b/qa/L0_model_config/autofill_noplatform/python/no_return/model.py old mode 100644 new mode 100755 index 5c90b2bcfb..6bb52bc152 --- a/qa/L0_model_config/autofill_noplatform/python/no_return/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/no_return/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/model.py b/qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/model.py old mode 100644 new mode 100755 index e1af57e747..64a08ca859 --- a/qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/output_missing_dims/model.py b/qa/L0_model_config/autofill_noplatform/python/output_missing_dims/model.py old mode 100644 new mode 100755 index 88294cdb97..0ee2d01f1a --- a/qa/L0_model_config/autofill_noplatform/python/output_missing_dims/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/output_missing_dims/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32'} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32"} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/output_missing_name/model.py b/qa/L0_model_config/autofill_noplatform/python/output_missing_name/model.py old mode 100644 new mode 100755 index 130e854e05..12c777c613 --- a/qa/L0_model_config/autofill_noplatform/python/output_missing_name/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/output_missing_name/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/output_wrong_property/model.py b/qa/L0_model_config/autofill_noplatform/python/output_wrong_property/model.py old mode 100644 new mode 100755 index 4d3298f866..40874ab404 --- a/qa/L0_model_config/autofill_noplatform/python/output_wrong_property/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/output_wrong_property/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,17 +28,16 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} output1 = { - 'name': 'OUTPUT1', - 'data_type': 'TYPE_FP32', - 'dims': [4], - 'is_shape_tensor:': True + "name": "OUTPUT1", + "data_type": "TYPE_FP32", + "dims": [4], + "is_shape_tensor:": True, } auto_complete_model_config.set_max_batch_size(0) diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/cpu_instance/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/onnx/cpu_instance/config.pbtxt old mode 100755 new mode 100644 diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/partial_config/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/openvino/partial_config/config.pbtxt old mode 100755 new mode 100644 diff --git a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/model.py b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/model.py old mode 100644 new mode 100755 index 723c343702..14ca01ee47 --- a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/model.py +++ b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,11 +28,10 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(4) auto_complete_model_config.set_dynamic_batching() diff --git a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_first_step/model.py b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_first_step/model.py old mode 100644 new mode 100755 index 723c343702..14ca01ee47 --- a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_first_step/model.py +++ b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_first_step/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,11 +28,10 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(4) auto_complete_model_config.set_dynamic_batching() diff --git a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_second_step/model.py b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_second_step/model.py old mode 100644 new mode 100755 index 723c343702..14ca01ee47 --- a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_second_step/model.py +++ b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_second_step/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,11 +28,10 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(4) auto_complete_model_config.set_dynamic_batching() diff --git a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/model.py b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/model.py old mode 100644 new mode 100755 index 80e9f9d59c..f617ac6faf --- a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/model.py +++ b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(4) auto_complete_model_config.set_dynamic_batching() diff --git a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/model.py b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/model.py old mode 100644 new mode 100755 index 80e9f9d59c..f617ac6faf --- a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/model.py +++ b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(4) auto_complete_model_config.set_dynamic_batching() diff --git a/qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/model.py b/qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/model.py old mode 100644 new mode 100755 index fc150ff497..e951a2ef35 --- a/qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/model.py +++ b/qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,12 +28,11 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/config.pbtxt old mode 100755 new mode 100644 diff --git a/qa/L0_model_config/compare_status.py b/qa/L0_model_config/compare_status.py old mode 100644 new mode 100755 index f1548e6de4..dbed05772a --- a/qa/L0_model_config/compare_status.py +++ b/qa/L0_model_config/compare_status.py @@ -1,4 +1,6 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,43 +27,46 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import argparse +import json import os import sys -import json + import tritonclient.grpc as grpcclient +import tritonclient.grpc.model_config_pb2 as mc import tritonclient.http as httpclient +from google.protobuf import json_format, text_format from tritonclient.utils import * -from google.protobuf import text_format -from google.protobuf import json_format -import tritonclient.grpc.model_config_pb2 as mc FLAGS = None -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--expected_dir', - type=str, - required=True, - help='Directory containing expected output files') - parser.add_argument('--model', type=str, required=True, help='Model name') + parser.add_argument( + "--expected_dir", + type=str, + required=True, + help="Directory containing expected output files", + ) + parser.add_argument("--model", type=str, required=True, help="Model name") FLAGS, unparsed = parser.parse_known_args() for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: model_name = FLAGS.model if pair[1] == "http": - triton_client = httpclient.InferenceServerClient(url=pair[0], - verbose=False) + triton_client = httpclient.InferenceServerClient(url=pair[0], verbose=False) model_config = triton_client.get_model_config(model_name) else: - triton_client = grpcclient.InferenceServerClient(url=pair[0], - verbose=False) + triton_client = grpcclient.InferenceServerClient(url=pair[0], verbose=False) model_config = triton_client.get_model_config(model_name) nonmatch = list() expected_files = [ - f for f in os.listdir(FLAGS.expected_dir) - if (os.path.isfile(os.path.join(FLAGS.expected_dir, f)) and - (f.startswith("expected"))) + f + for f in os.listdir(FLAGS.expected_dir) + if ( + os.path.isfile(os.path.join(FLAGS.expected_dir, f)) + and (f.startswith("expected")) + ) ] for efile in expected_files: with open(os.path.join(FLAGS.expected_dir, efile)) as f: @@ -69,8 +74,8 @@ if pair[1] == "http": config_json = json.loads( - json_format.MessageToJson(config, - preserving_proto_field_name=True)) + json_format.MessageToJson(config, preserving_proto_field_name=True) + ) if config_json == model_config: sys.exit(0) else: diff --git a/qa/L0_model_config/noautofill_test.py b/qa/L0_model_config/noautofill_test.py old mode 100644 new mode 100755 index 926e4d850e..d89e306eb8 --- a/qa/L0_model_config/noautofill_test.py +++ b/qa/L0_model_config/noautofill_test.py @@ -30,13 +30,13 @@ sys.path.append("../common") import unittest + import test_util as tu import tritonclient.http as httpclient from tritonclient.utils import InferenceServerException class NoAutoFillTest(tu.TestResultCollector): - def setUp(self): self._model_name = "noautofill_noconfig" self._triton_client = httpclient.InferenceServerClient("localhost:8000") @@ -45,12 +45,12 @@ def tearDown(self): self._triton_client.unload_model(self._model_name) def test_load_no_autofill_model_with_config(self): - config = "{\"max_batch_size\":\"16\"}" + config = '{"max_batch_size":"16"}' self._triton_client.load_model(self._model_name, config=config) # Check if the model config is correct model_config = self._triton_client.get_model_config(self._model_name) - self.assertEqual(model_config['max_batch_size'], 16) + self.assertEqual(model_config["max_batch_size"], 16) def test_load_no_autofill_model_with_no_config(self): with self.assertRaises(InferenceServerException) as ex: @@ -58,5 +58,5 @@ def test_load_no_autofill_model_with_no_config(self): self.assertIn("model configuration is not provided", str(ex.exception)) -if 
__name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_model_config/test.sh b/qa/L0_model_config/test.sh index 8b1a318c51..f3bc98fe87 100755 --- a/qa/L0_model_config/test.sh +++ b/qa/L0_model_config/test.sh @@ -291,14 +291,14 @@ cp /data/inferenceserver/${REPO_VERSION}/qa_model_repository/openvino_int8_int8_ rm -f $SERVER_LOG_BASE* $CLIENT_LOG RET=0 -# Run tests for logs which do not have a timestamp on them +# Run tests for logs which do not have a timestamp on them for TARGET in `ls cli_messages`; do case $TARGET in "cli_override") EXTRA_ARGS="--disable-auto-complete-config --strict-model-config=false" ;; - "cli_deprecation") + "cli_deprecation") EXTRA_ARGS="--strict-model-config=true" ;; - *) + *) EXTRA_ARGS="" ;; esac diff --git a/qa/L0_model_namespacing/python_addsub/__init__.py b/qa/L0_model_namespacing/python_addsub/__init__.py old mode 100644 new mode 100755 index e14880ceba..a664eafef0 --- a/qa/L0_model_namespacing/python_addsub/__init__.py +++ b/qa/L0_model_namespacing/python_addsub/__init__.py @@ -1,4 +1,6 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,8 +26,9 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import numpy as np import json + +import numpy as np import triton_python_backend_utils as pb_utils @@ -36,67 +39,85 @@ class TritonPythonModel: def auto_complete_config(auto_complete_model_config): # Only use packaged config if config is not explicitly provided config = auto_complete_model_config.as_dict() - if (len(config['input']) != 0) or (len(config['output']) != 0): + if (len(config["input"]) != 0) or (len(config["output"]) != 0): return auto_complete_model_config - auto_complete_model_config.add_input({ - 'name': 'INPUT0', - 'data_type': 'TYPE_INT32', - 'dims': [16,] - }) - auto_complete_model_config.add_input({ - 'name': 'INPUT1', - 'data_type': 'TYPE_INT32', - 'dims': [16,] - }) - auto_complete_model_config.add_output({ - 'name': 'OUTPUT0', - 'data_type': 'TYPE_INT32', - 'dims': [16,] - }) - auto_complete_model_config.add_output({ - 'name': 'OUTPUT1', - 'data_type': 'TYPE_INT32', - 'dims': [16,] - }) + auto_complete_model_config.add_input( + { + "name": "INPUT0", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_input( + { + "name": "INPUT1", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_output( + { + "name": "OUTPUT0", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_output( + { + "name": "OUTPUT1", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) return auto_complete_model_config def initialize(self, args): - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT0") - output1_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT1") + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") self.output0_dtype = pb_utils.triton_string_to_numpy( - 
output0_config['data_type']) + output0_config["data_type"] + ) self.output1_dtype = pb_utils.triton_string_to_numpy( - output1_config['data_type']) + output1_config["data_type"] + ) def execute(self, requests): - """ This function is called on inference request. - """ + """This function is called on inference request.""" responses = [] for request in requests: in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") - responses.append(pb_utils.InferenceResponse(self.addsub(in_0, - in_1))) + responses.append(pb_utils.InferenceResponse(self.addsub(in_0, in_1))) return responses def addsub(self, in_0, in_1): - if in_0.as_numpy().dtype.type is np.bytes_ or in_0.as_numpy( - ).dtype == np.object_: - out_0, out_1 = (in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32),\ - in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32)) + if ( + in_0.as_numpy().dtype.type is np.bytes_ + or in_0.as_numpy().dtype == np.object_ + ): + out_0, out_1 = ( + in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32), + in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32), + ) else: - out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(), - in_0.as_numpy() - in_1.as_numpy()) + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) - out_tensor_0 = pb_utils.Tensor("OUTPUT0", - out_0.astype(self.output0_dtype)) - out_tensor_1 = pb_utils.Tensor("OUTPUT1", - out_1.astype(self.output1_dtype)) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(self.output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(self.output1_dtype)) return [out_tensor_0, out_tensor_1] diff --git a/qa/L0_model_namespacing/python_subadd/__init__.py b/qa/L0_model_namespacing/python_subadd/__init__.py old mode 100644 new mode 100755 index 6d38542bf0..bd3ddefe9e --- a/qa/L0_model_namespacing/python_subadd/__init__.py +++ b/qa/L0_model_namespacing/python_subadd/__init__.py @@ -1,4 +1,6 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,8 +26,9 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import numpy as np import json + +import numpy as np import triton_python_backend_utils as pb_utils @@ -36,67 +39,85 @@ class TritonPythonModel: def auto_complete_config(auto_complete_model_config): # Only use packaged config if config is not explicitly provided config = auto_complete_model_config.as_dict() - if (len(config['input']) != 0) or (len(config['output']) != 0): + if (len(config["input"]) != 0) or (len(config["output"]) != 0): return auto_complete_model_config - auto_complete_model_config.add_input({ - 'name': 'INPUT0', - 'data_type': 'TYPE_INT32', - 'dims': [16,] - }) - auto_complete_model_config.add_input({ - 'name': 'INPUT1', - 'data_type': 'TYPE_INT32', - 'dims': [16,] - }) - auto_complete_model_config.add_output({ - 'name': 'OUTPUT0', - 'data_type': 'TYPE_INT32', - 'dims': [16,] - }) - auto_complete_model_config.add_output({ - 'name': 'OUTPUT1', - 'data_type': 'TYPE_INT32', - 'dims': [16,] - }) + auto_complete_model_config.add_input( + { + "name": "INPUT0", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_input( + { + "name": "INPUT1", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_output( + { + "name": "OUTPUT0", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_output( + { + "name": "OUTPUT1", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) return auto_complete_model_config def initialize(self, args): - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT0") - output1_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT1") + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) + output0_config["data_type"] + ) self.output1_dtype = pb_utils.triton_string_to_numpy( - output1_config['data_type']) + output1_config["data_type"] + ) def execute(self, requests): - """ This function is called on inference request. 
- """ + """This function is called on inference request.""" responses = [] for request in requests: in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") - responses.append(pb_utils.InferenceResponse(self.subadd(in_0, - in_1))) + responses.append(pb_utils.InferenceResponse(self.subadd(in_0, in_1))) return responses def subadd(self, in_0, in_1): - if in_0.as_numpy().dtype.type is np.bytes_ or in_0.as_numpy( - ).dtype == np.object_: - out_0, out_1 = (in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32),\ - in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32)) + if ( + in_0.as_numpy().dtype.type is np.bytes_ + or in_0.as_numpy().dtype == np.object_ + ): + out_0, out_1 = ( + in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32), + in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32), + ) else: - out_0, out_1 = (in_0.as_numpy() - in_1.as_numpy(), - in_0.as_numpy() + in_1.as_numpy()) + out_0, out_1 = ( + in_0.as_numpy() - in_1.as_numpy(), + in_0.as_numpy() + in_1.as_numpy(), + ) - out_tensor_0 = pb_utils.Tensor("OUTPUT0", - out_0.astype(self.output0_dtype)) - out_tensor_1 = pb_utils.Tensor("OUTPUT1", - out_1.astype(self.output1_dtype)) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(self.output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(self.output1_dtype)) return [out_tensor_0, out_tensor_1] diff --git a/qa/L0_model_namespacing/test.py b/qa/L0_model_namespacing/test.py old mode 100644 new mode 100755 index 9de6ac749c..f45300d4fd --- a/qa/L0_model_namespacing/test.py +++ b/qa/L0_model_namespacing/test.py @@ -25,17 +25,17 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import sys import os +import sys sys.path.append(os.path.join(os.environ["TRITON_QA_ROOT_DIR"], "common")) -import numpy as np -import unittest -import time import shutil -import test_util as tu +import time +import unittest +import numpy as np +import test_util as tu import tritonclient.http as httpclient from tritonclient.utils import InferenceServerException @@ -57,16 +57,14 @@ def __init__(self, checker_client=None): if checker_client is None: import tritonclient.http as checker_client if "http" in checker_client.__name__: - self.client_ = checker_client.InferenceServerClient( - "localhost:8000") + self.client_ = checker_client.InferenceServerClient("localhost:8000") else: - self.client_ = checker_client.InferenceServerClient( - "localhost:8001") + self.client_ = checker_client.InferenceServerClient("localhost:8001") # Create infer input tensors self.inputs_ = [] - self.inputs_.append(checker_client.InferInput('INPUT0', [16], "INT32")) - self.inputs_.append(checker_client.InferInput('INPUT1', [16], "INT32")) + self.inputs_.append(checker_client.InferInput("INPUT0", [16], "INT32")) + self.inputs_.append(checker_client.InferInput("INPUT1", [16], "INT32")) # Initialize the data and expected output input_data = np.arange(start=0, stop=16, dtype=np.int32) @@ -74,15 +72,17 @@ def __init__(self, checker_client=None): self.inputs_[1].set_data_from_numpy(input_data) self.expected_outputs_ = { "add": (input_data + input_data), - "sub": (input_data - input_data) + "sub": (input_data - input_data), } def infer(self, model): res = self.client_.infer(model, self.inputs_) - np.testing.assert_allclose(res.as_numpy('OUTPUT0'), - self.expected_outputs_["add"]) - np.testing.assert_allclose(res.as_numpy('OUTPUT1'), - self.expected_outputs_["sub"]) + np.testing.assert_allclose( + res.as_numpy("OUTPUT0"), self.expected_outputs_["add"] + ) + np.testing.assert_allclose( + res.as_numpy("OUTPUT1"), self.expected_outputs_["sub"] + ) # Checker to perform inference on given model, expecting model to have @@ -90,13 +90,14 @@ def infer(self, model): # OUTPUT0 = INPUT0 - INPUT1 # OUTPUT1 = INPUT0 + INPUT1 class SubAddChecker(AddSubChecker): - def infer(self, model): res = self.client_.infer(model, self.inputs_) - np.testing.assert_allclose(res.as_numpy('OUTPUT0'), - self.expected_outputs_["sub"]) - np.testing.assert_allclose(res.as_numpy('OUTPUT1'), - self.expected_outputs_["add"]) + np.testing.assert_allclose( + res.as_numpy("OUTPUT0"), self.expected_outputs_["sub"] + ) + np.testing.assert_allclose( + res.as_numpy("OUTPUT1"), self.expected_outputs_["add"] + ) # @@ -105,7 +106,6 @@ def infer(self, model): class ModelNamespacePoll(tu.TestResultCollector): - def setUp(self): self.addsub_ = AddSubChecker() self.subadd_ = SubAddChecker() @@ -138,19 +138,18 @@ def test_duplication(self): # infer check for model in [ - "simple_addsub", + "simple_addsub", ]: self.addsub_.infer(model) for model in [ - "simple_subadd", + "simple_subadd", ]: self.subadd_.infer(model) # error check try: self.addsub_.infer("composing_model") - self.assertTrue( - False, "expected error for inferring ambiguous named model") + self.assertTrue(False, "expected error for inferring ambiguous named model") except InferenceServerException as ex: self.assertIn("ambiguity", ex.message()) @@ -165,34 +164,32 @@ def test_ensemble_duplication(self): # infer for model in [ - "composing_addsub", + "composing_addsub", ]: self.addsub_.infer(model) for model in [ - "composing_subadd", + "composing_subadd", ]: self.subadd_.infer(model) # error check try: 
self.addsub_.infer("simple_ensemble") - self.assertTrue( - False, "expected error for inferring ambiguous named model") + self.assertTrue(False, "expected error for inferring ambiguous named model") except InferenceServerException as ex: self.assertIn("ambiguity", ex.message()) def test_dynamic_resolution(self): # Same model setup as 'test_duplication', will remove / add one of the # composing model at runtime and expect the ensemble to be properly - # linked to exisiting composing model at different steps. + # linked to existing composing model at different steps. # 1. Remove 'composing_model' in addsub_repo, expect both ensembles use # 'composing_model' in subadd_repo and act as subadd # 2. Add back 'composing_model' in addsub_repo, expect the ensembles to behave the # same as before the removal. self.assertTrue("NAMESPACE_TESTING_DIRCTORY" in os.environ) td = os.environ["NAMESPACE_TESTING_DIRCTORY"] - composing_before_path = os.path.join(td, "addsub_repo", - "composing_model") + composing_before_path = os.path.join(td, "addsub_repo", "composing_model") composing_after_path = os.path.join(td, "composing_model") self.check_health() @@ -210,25 +207,23 @@ def test_dynamic_resolution(self): # infer for model in [ - "simple_addsub", + "simple_addsub", ]: self.addsub_.infer(model) for model in [ - "simple_subadd", + "simple_subadd", ]: self.subadd_.infer(model) # error check try: self.addsub_.infer("composing_model") - self.assertTrue( - False, "expected error for inferring ambiguous named model") + self.assertTrue(False, "expected error for inferring ambiguous named model") except InferenceServerException as ex: self.assertIn("ambiguity", ex.message()) class ModelNamespaceExplicit(tu.TestResultCollector): - def setUp(self): self.addsub_ = AddSubChecker() self.subadd_ = SubAddChecker() @@ -267,19 +262,18 @@ def test_duplication(self): # infer for model in [ - "simple_addsub", + "simple_addsub", ]: self.addsub_.infer(model) for model in [ - "simple_subadd", + "simple_subadd", ]: self.subadd_.infer(model) # error check try: self.addsub_.infer("composing_model") - self.assertTrue( - False, "expected error for inferring ambiguous named model") + self.assertTrue(False, "expected error for inferring ambiguous named model") except InferenceServerException as ex: self.assertIn("ambiguity", ex.message()) @@ -297,34 +291,32 @@ def test_ensemble_duplication(self): # infer for model in [ - "composing_addsub", + "composing_addsub", ]: self.addsub_.infer(model) for model in [ - "composing_subadd", + "composing_subadd", ]: self.subadd_.infer(model) # error check try: self.addsub_.infer("simple_ensemble") - self.assertTrue( - False, "expected error for inferring ambiguous named model") + self.assertTrue(False, "expected error for inferring ambiguous named model") except InferenceServerException as ex: self.assertIn("ambiguity", ex.message()) def test_dynamic_resolution(self): # Same model setup as 'test_duplication', will remove / add one of the # composing model at runtime and expect the ensemble to be properly - # linked to exisiting composing model at different steps. + # linked to existing composing model at different steps. # 1. Remove 'composing_model' in addsub_repo, expect both ensembles use # 'composing_model' in subadd_repo and act as subadd. # 2. Add back 'composing_model' in addsub_repo, expect the ensembles to behave the # same as before the removal. 
self.assertTrue("NAMESPACE_TESTING_DIRCTORY" in os.environ) td = os.environ["NAMESPACE_TESTING_DIRCTORY"] - composing_before_path = os.path.join(td, "addsub_repo", - "composing_model") + composing_before_path = os.path.join(td, "addsub_repo", "composing_model") composing_after_path = os.path.join(td, "composing_model") self.check_health() @@ -343,28 +335,27 @@ def test_dynamic_resolution(self): # Explicitly load one of the ensembel, should still trigger cascading # (re-)load for model in [ - "simple_addsub", + "simple_addsub", ]: self.client_.load_model(model) # infer for model in [ - "simple_addsub", + "simple_addsub", ]: self.addsub_.infer(model) for model in [ - "simple_subadd", + "simple_subadd", ]: self.subadd_.infer(model) # error check try: self.addsub_.infer("composing_model") - self.assertTrue( - False, "expected error for inferring ambiguous named model") + self.assertTrue(False, "expected error for inferring ambiguous named model") except InferenceServerException as ex: self.assertIn("ambiguity", ex.message()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_model_namespacing/test.sh b/qa/L0_model_namespacing/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_model_namespacing/test_duplication/addsub_repo/composing_model/1/model.py b/qa/L0_model_namespacing/test_duplication/addsub_repo/composing_model/1/model.py old mode 100644 new mode 100755 index 8a184619b0..71f89a1659 --- a/qa/L0_model_namespacing/test_duplication/addsub_repo/composing_model/1/model.py +++ b/qa/L0_model_namespacing/test_duplication/addsub_repo/composing_model/1/model.py @@ -1,5 +1,7 @@ -import sys +#!/usr/bin/env python3 + import os +import sys # load pre-defined QA model sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) diff --git a/qa/L0_model_namespacing/test_duplication/addsub_repo/simple_addsub/config.pbtxt b/qa/L0_model_namespacing/test_duplication/addsub_repo/simple_addsub/config.pbtxt old mode 100755 new mode 100644 index 944adcecc2..245e256976 --- a/qa/L0_model_namespacing/test_duplication/addsub_repo/simple_addsub/config.pbtxt +++ b/qa/L0_model_namespacing/test_duplication/addsub_repo/simple_addsub/config.pbtxt @@ -35,7 +35,7 @@ input [ name: "INPUT0" data_type: TYPE_INT32 dims: [ 16 ] - + } ] input [ @@ -43,7 +43,7 @@ input [ name: "INPUT1" data_type: TYPE_INT32 dims: [ 16 ] - + } ] output [ @@ -51,8 +51,8 @@ output [ name: "OUTPUT0" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] output [ @@ -60,8 +60,8 @@ output [ name: "OUTPUT1" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] ensemble_scheduling { diff --git a/qa/L0_model_namespacing/test_duplication/subadd_repo/composing_model/1/model.py b/qa/L0_model_namespacing/test_duplication/subadd_repo/composing_model/1/model.py old mode 100644 new mode 100755 index b21b24fd4e..4eed1f9a40 --- a/qa/L0_model_namespacing/test_duplication/subadd_repo/composing_model/1/model.py +++ b/qa/L0_model_namespacing/test_duplication/subadd_repo/composing_model/1/model.py @@ -1,5 +1,7 @@ -import sys +#!/usr/bin/env python3 + import os +import sys # load pre-defined QA model sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) diff --git a/qa/L0_model_namespacing/test_duplication/subadd_repo/simple_subadd/config.pbtxt b/qa/L0_model_namespacing/test_duplication/subadd_repo/simple_subadd/config.pbtxt old mode 100755 new mode 100644 index fc9fe34081..85d8ec0051 --- a/qa/L0_model_namespacing/test_duplication/subadd_repo/simple_subadd/config.pbtxt +++ 
b/qa/L0_model_namespacing/test_duplication/subadd_repo/simple_subadd/config.pbtxt @@ -33,7 +33,7 @@ input [ name: "INPUT0" data_type: TYPE_INT32 dims: [ 16 ] - + } ] input [ @@ -41,7 +41,7 @@ input [ name: "INPUT1" data_type: TYPE_INT32 dims: [ 16 ] - + } ] output [ @@ -49,8 +49,8 @@ output [ name: "OUTPUT0" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] output [ @@ -58,8 +58,8 @@ output [ name: "OUTPUT1" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] ensemble_scheduling { diff --git a/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/composing_model/1/model.py b/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/composing_model/1/model.py old mode 100644 new mode 100755 index 8a184619b0..71f89a1659 --- a/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/composing_model/1/model.py +++ b/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/composing_model/1/model.py @@ -1,5 +1,7 @@ -import sys +#!/usr/bin/env python3 + import os +import sys # load pre-defined QA model sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) diff --git a/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/simple_addsub/config.pbtxt b/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/simple_addsub/config.pbtxt old mode 100755 new mode 100644 index 944adcecc2..245e256976 --- a/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/simple_addsub/config.pbtxt +++ b/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/simple_addsub/config.pbtxt @@ -35,7 +35,7 @@ input [ name: "INPUT0" data_type: TYPE_INT32 dims: [ 16 ] - + } ] input [ @@ -43,7 +43,7 @@ input [ name: "INPUT1" data_type: TYPE_INT32 dims: [ 16 ] - + } ] output [ @@ -51,8 +51,8 @@ output [ name: "OUTPUT0" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] output [ @@ -60,8 +60,8 @@ output [ name: "OUTPUT1" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] ensemble_scheduling { diff --git a/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/composing_model/1/model.py b/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/composing_model/1/model.py old mode 100644 new mode 100755 index b21b24fd4e..4eed1f9a40 --- a/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/composing_model/1/model.py +++ b/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/composing_model/1/model.py @@ -1,5 +1,7 @@ -import sys +#!/usr/bin/env python3 + import os +import sys # load pre-defined QA model sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) diff --git a/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/simple_subadd/config.pbtxt b/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/simple_subadd/config.pbtxt old mode 100755 new mode 100644 index 944adcecc2..245e256976 --- a/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/simple_subadd/config.pbtxt +++ b/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/simple_subadd/config.pbtxt @@ -35,7 +35,7 @@ input [ name: "INPUT0" data_type: TYPE_INT32 dims: [ 16 ] - + } ] input [ @@ -43,7 +43,7 @@ input [ name: "INPUT1" data_type: TYPE_INT32 dims: [ 16 ] - + } ] output [ @@ -51,8 +51,8 @@ output [ name: "OUTPUT0" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] output [ @@ -60,8 +60,8 @@ output [ name: "OUTPUT1" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] ensemble_scheduling { diff --git a/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/composing_addsub/1/model.py b/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/composing_addsub/1/model.py old mode 100644 new 
mode 100755 index 8a184619b0..71f89a1659 --- a/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/composing_addsub/1/model.py +++ b/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/composing_addsub/1/model.py @@ -1,5 +1,7 @@ -import sys +#!/usr/bin/env python3 + import os +import sys # load pre-defined QA model sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) diff --git a/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/simple_ensemble/config.pbtxt b/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/simple_ensemble/config.pbtxt old mode 100755 new mode 100644 index 2bf341b364..2a9f0003a3 --- a/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/simple_ensemble/config.pbtxt +++ b/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/simple_ensemble/config.pbtxt @@ -35,7 +35,7 @@ input [ name: "INPUT0" data_type: TYPE_INT32 dims: [ 16 ] - + } ] input [ @@ -43,7 +43,7 @@ input [ name: "INPUT1" data_type: TYPE_INT32 dims: [ 16 ] - + } ] output [ @@ -51,8 +51,8 @@ output [ name: "OUTPUT0" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] output [ @@ -60,8 +60,8 @@ output [ name: "OUTPUT1" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] ensemble_scheduling { diff --git a/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/composing_subadd/1/model.py b/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/composing_subadd/1/model.py old mode 100644 new mode 100755 index b21b24fd4e..4eed1f9a40 --- a/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/composing_subadd/1/model.py +++ b/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/composing_subadd/1/model.py @@ -1,5 +1,7 @@ -import sys +#!/usr/bin/env python3 + import os +import sys # load pre-defined QA model sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) diff --git a/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/simple_ensemble/config.pbtxt b/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/simple_ensemble/config.pbtxt old mode 100755 new mode 100644 index aa79a7bd08..0ee1015f25 --- a/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/simple_ensemble/config.pbtxt +++ b/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/simple_ensemble/config.pbtxt @@ -35,7 +35,7 @@ input [ name: "INPUT0" data_type: TYPE_INT32 dims: [ 16 ] - + } ] input [ @@ -43,7 +43,7 @@ input [ name: "INPUT1" data_type: TYPE_INT32 dims: [ 16 ] - + } ] output [ @@ -51,8 +51,8 @@ output [ name: "OUTPUT0" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] output [ @@ -60,8 +60,8 @@ output [ name: "OUTPUT1" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] ensemble_scheduling { diff --git a/qa/L0_model_namespacing/test_no_duplication/addsub_repo/composing_addsub/1/model.py b/qa/L0_model_namespacing/test_no_duplication/addsub_repo/composing_addsub/1/model.py old mode 100644 new mode 100755 index 8a184619b0..71f89a1659 --- a/qa/L0_model_namespacing/test_no_duplication/addsub_repo/composing_addsub/1/model.py +++ b/qa/L0_model_namespacing/test_no_duplication/addsub_repo/composing_addsub/1/model.py @@ -1,5 +1,7 @@ -import sys +#!/usr/bin/env python3 + import os +import sys # load pre-defined QA model sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) diff --git a/qa/L0_model_namespacing/test_no_duplication/addsub_repo/simple_addsub/config.pbtxt b/qa/L0_model_namespacing/test_no_duplication/addsub_repo/simple_addsub/config.pbtxt old mode 100755 new mode 100644 index 2bf341b364..2a9f0003a3 --- 
a/qa/L0_model_namespacing/test_no_duplication/addsub_repo/simple_addsub/config.pbtxt +++ b/qa/L0_model_namespacing/test_no_duplication/addsub_repo/simple_addsub/config.pbtxt @@ -35,7 +35,7 @@ input [ name: "INPUT0" data_type: TYPE_INT32 dims: [ 16 ] - + } ] input [ @@ -43,7 +43,7 @@ input [ name: "INPUT1" data_type: TYPE_INT32 dims: [ 16 ] - + } ] output [ @@ -51,8 +51,8 @@ output [ name: "OUTPUT0" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] output [ @@ -60,8 +60,8 @@ output [ name: "OUTPUT1" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] ensemble_scheduling { diff --git a/qa/L0_model_namespacing/test_no_duplication/subadd_repo/composing_subadd/1/model.py b/qa/L0_model_namespacing/test_no_duplication/subadd_repo/composing_subadd/1/model.py old mode 100644 new mode 100755 index b21b24fd4e..4eed1f9a40 --- a/qa/L0_model_namespacing/test_no_duplication/subadd_repo/composing_subadd/1/model.py +++ b/qa/L0_model_namespacing/test_no_duplication/subadd_repo/composing_subadd/1/model.py @@ -1,5 +1,7 @@ -import sys +#!/usr/bin/env python3 + import os +import sys # load pre-defined QA model sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) diff --git a/qa/L0_model_namespacing/test_no_duplication/subadd_repo/simple_subadd/config.pbtxt b/qa/L0_model_namespacing/test_no_duplication/subadd_repo/simple_subadd/config.pbtxt old mode 100755 new mode 100644 index aa79a7bd08..0ee1015f25 --- a/qa/L0_model_namespacing/test_no_duplication/subadd_repo/simple_subadd/config.pbtxt +++ b/qa/L0_model_namespacing/test_no_duplication/subadd_repo/simple_subadd/config.pbtxt @@ -35,7 +35,7 @@ input [ name: "INPUT0" data_type: TYPE_INT32 dims: [ 16 ] - + } ] input [ @@ -43,7 +43,7 @@ input [ name: "INPUT1" data_type: TYPE_INT32 dims: [ 16 ] - + } ] output [ @@ -51,8 +51,8 @@ output [ name: "OUTPUT0" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] output [ @@ -60,8 +60,8 @@ output [ name: "OUTPUT1" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] ensemble_scheduling { diff --git a/qa/L0_model_queue/model_queue_test.py b/qa/L0_model_queue/model_queue_test.py old mode 100644 new mode 100755 index e0875205ff..14d2349c8c --- a/qa/L0_model_queue/model_queue_test.py +++ b/qa/L0_model_queue/model_queue_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,15 +30,16 @@ sys.path.append("../common") -from builtins import range -import time import threading +import time import unittest -import numpy as np +from builtins import range +from ctypes import * + import infer_util as iu +import numpy as np import test_util as tu from tritonclientutils import InferenceServerException -from ctypes import * _max_queue_delay_ms = 10000 @@ -45,15 +48,11 @@ class ModelQueueTest(tu.TestResultCollector): - def setUp(self): self.trials_ = [] for base in ["custom", "ensemble"]: for is_http_trial in [True, False]: - self.trials_.append({ - "base": base, - "is_http_trial": is_http_trial - }) + self.trials_.append({"base": base, "is_http_trial": is_http_trial}) global _deferred_exceptions _deferred_exceptions = [] @@ -70,33 +69,41 @@ def check_deferred_exception(self): _deferred_exceptions.pop(0) raise first_exception - def check_response(self, - bs, - dtype, - shapes, - priority, - timeout_us, - thresholds, - base="custom", - is_http_trial=True): - full_shapes = [[ - bs, - ] + shape for shape in shapes] + def check_response( + self, + bs, + dtype, + shapes, + priority, + timeout_us, + thresholds, + base="custom", + is_http_trial=True, + ): + full_shapes = [ + [ + bs, + ] + + shape + for shape in shapes + ] try: start_ms = int(round(time.time() * 1000)) - iu.infer_zero(self, - base, - bs, - dtype, - full_shapes, - full_shapes, - model_version=1, - use_http_json_tensors=False, - use_http=is_http_trial, - use_grpc=(not is_http_trial), - use_streaming=False, - priority=priority, - timeout_us=timeout_us) + iu.infer_zero( + self, + base, + bs, + dtype, + full_shapes, + full_shapes, + model_version=1, + use_http_json_tensors=False, + use_http=is_http_trial, + use_grpc=(not is_http_trial), + use_streaming=False, + priority=priority, + timeout_us=timeout_us, + ) end_ms = int(round(time.time() * 1000)) @@ -105,13 +112,21 @@ def check_response(self, if lt_ms is not None: self.assertTrue( (end_ms - start_ms) < lt_ms, - "expected less than " + str(lt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected less than " + + str(lt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) if gt_ms is not None: self.assertTrue( (end_ms - start_ms) > gt_ms, - "expected greater than " + str(gt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected greater than " + + str(gt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) except Exception as ex: self.add_deferred_exception(ex) @@ -130,10 +145,12 @@ def test_max_queue_size(self): threads = [] for i in range(10): threads.append( - threading.Thread(target=self.check_response, - args=(1, dtype, shapes, 0, 0, (None, - None)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 0, (None, None)), + kwargs=trial, + ) + ) preceding_thread.start() time.sleep(0.5) for t in threads: @@ -150,8 +167,10 @@ def test_max_queue_size(self): except InferenceServerException as ex: self.assertTrue( "Exceeds maximum queue size" in ex.message(), - "Expected error message \"Exceeds maximum queue size\", got: {}" - .format(ex)) + 'Expected error message "Exceeds maximum queue size", got: {}'.format( + ex + ), + ) try: self.check_deferred_exception() except InferenceServerException as ex: @@ -170,18 +189,26 @@ def test_policy_delay(self): try: threads = [] threads.append( - 
threading.Thread(target=self.check_response, - args=(1, dtype, shapes, 0, 0, (15000, - 10000)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 0, (15000, 10000)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (100, 0)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (100, 0)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) threads[0].start() time.sleep(0.2) threads[1].start() @@ -203,17 +230,26 @@ def test_policy_reject(self): for trial in self.trials_: threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(1, dtype, shapes, 0, 0, (None, None)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 0, (None, None)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (100, 0)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (100, 0)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) threads[0].start() time.sleep(0.2) threads[1].start() @@ -228,8 +264,10 @@ def test_policy_reject(self): except InferenceServerException as ex: self.assertTrue( "Request timeout expired" in ex.message(), - "Expected error message \"Request timeout expired\", got: {}" - .format(ex)) + 'Expected error message "Request timeout expired", got: {}'.format( + ex + ), + ) try: self.check_deferred_exception() @@ -238,7 +276,7 @@ def test_policy_reject(self): def test_timeout_override(self): # Send requests with batch sizes 1, 1, 3 where the first request - # overrides the timout to be less than 'default_timeout_microseconds', + # overrides the timeout to be less than 'default_timeout_microseconds', # and the second and third requests are sent after the overridden # timeout. 
Expect the first request is timed-out and rejected before # 'default_timeout_microseconds', which makes the second and third @@ -250,18 +288,26 @@ def test_timeout_override(self): for trial in self.trials_: threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(1, dtype, shapes, 0, 100000, (None, - None)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 100000, (None, None)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (100, 0)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (100, 0)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) threads[0].start() time.sleep(0.2) threads[1].start() @@ -276,8 +322,10 @@ def test_timeout_override(self): except InferenceServerException as ex: self.assertTrue( "Request timeout expired" in ex.message(), - "Expected error message \"Request timeout expired\", got: {}" - .format(ex)) + 'Expected error message "Request timeout expired", got: {}'.format( + ex + ), + ) try: self.check_deferred_exception() @@ -289,18 +337,26 @@ def test_timeout_override(self): # 'default_timeout_microseconds' and before queue delay. threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(1, dtype, shapes, 0, 10000000, (None, - None)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 10000000, (None, None)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (1100, 700)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (1100, 700)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (1100, 700)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (1100, 700)), + kwargs=trial, + ) + ) threads[0].start() time.sleep(0.2) threads[1].start() @@ -315,8 +371,10 @@ def test_timeout_override(self): except InferenceServerException as ex: self.assertTrue( "Request timeout expired" in ex.message(), - "Expected error message \"Request timeout expired\", got: {}" - .format(ex)) + 'Expected error message "Request timeout expired", got: {}'.format( + ex + ), + ) try: self.check_deferred_exception() @@ -327,17 +385,26 @@ def test_timeout_override(self): # processed only after 'default_timeout_microseconds' threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(1, dtype, shapes, 0, 0, (None, None)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 0, (None, None)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (1100, 700)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (1100, 700)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (1100, 700)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (1100, 700)), + kwargs=trial, + ) + ) threads[0].start() time.sleep(0.2) 
threads[1].start() @@ -352,8 +419,10 @@ def test_timeout_override(self): except InferenceServerException as ex: self.assertTrue( "Request timeout expired" in ex.message(), - "Expected error message \"Request timeout expired\", got: {}" - .format(ex)) + 'Expected error message "Request timeout expired", got: {}'.format( + ex + ), + ) try: self.check_deferred_exception() @@ -370,17 +439,26 @@ def test_priority_levels(self): for trial in self.trials_: threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (500, 200)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (500, 200)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(1, dtype, shapes, 0, 0, (15000, 10000)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 0, (15000, 10000)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 1, 0, (100, 0)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 1, 0, (100, 0)), + kwargs=trial, + ) + ) threads[0].start() # wait to make sure the order is correct time.sleep(0.1) @@ -407,18 +485,26 @@ def test_max_priority_levels(self): for trial in self.trials_: threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (500, 200)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (500, 200)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(1, dtype, shapes, MAX_UINT32_PLUS_1, 0, - (15000, 10000)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, MAX_UINT32_PLUS_1, 0, (15000, 10000)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 1, 0, (100, 0)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 1, 0, (100, 0)), + kwargs=trial, + ) + ) threads[0].start() # wait to make sure the order is correct time.sleep(0.1) @@ -464,31 +550,47 @@ def test_priority_with_policy(self): # The expected ranges may not be rounded to accommodate # the sleep between sending requests threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 1, 0, (2000, 1000)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 1, 0, (2000, 1000)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(1, dtype, shapes, 1, 1000000, (3400, - 2400)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 1, 1000000, (3400, 2400)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 1, 0, (1700, 700)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 1, 0, (1700, 700)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 2, 2000000, (None, - None)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 2, 2000000, (None, None)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(3, dtype, shapes, 2, 0, (2700, 1700)), - kwargs=trial)) + threading.Thread( + 
target=self.check_response, + args=(3, dtype, shapes, 2, 0, (2700, 1700)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(6, dtype, shapes, 2, 0, (15000, 10000)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(6, dtype, shapes, 2, 0, (15000, 10000)), + kwargs=trial, + ) + ) for t in threads: t.start() time.sleep(0.2) @@ -502,8 +604,10 @@ def test_priority_with_policy(self): except InferenceServerException as ex: self.assertTrue( "Request timeout expired" in ex.message(), - "Expected error message \"Request timeout expired\", got: {}" - .format(ex)) + 'Expected error message "Request timeout expired", got: {}'.format( + ex + ), + ) try: self.check_deferred_exception() @@ -511,5 +615,5 @@ def test_priority_with_policy(self): self.assertTrue(False, "unexpected error {}".format(ex)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_model_update/instance_update_test.py b/qa/L0_model_update/instance_update_test.py old mode 100644 new mode 100755 index 39f5bfc8d4..27a09486d9 --- a/qa/L0_model_update/instance_update_test.py +++ b/qa/L0_model_update/instance_update_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -24,23 +26,28 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import unittest +import concurrent.futures +import json import os import random import time -import concurrent.futures -import json +import unittest + import numpy as np import tritonclient.grpc as grpcclient +from models.model_init_del.util import ( + disable_batching, + enable_batching, + get_count, + reset_count, + set_delay, + update_instance_group, + update_model_file, +) from tritonclient.utils import InferenceServerException -from models.model_init_del.util import (get_count, reset_count, set_delay, - update_instance_group, - update_model_file, enable_batching, - disable_batching) class TestInstanceUpdate(unittest.TestCase): - __model_name = "model_init_del" def setUp(self): @@ -99,17 +106,18 @@ def __load_model(self, instance_count, instance_config="", batching=False): set_delay("initialize", 0) set_delay("infer", 0) # Load model - self.__update_instance_count(instance_count, - 0, - instance_config, - batching=batching) - - def __update_instance_count(self, - add_count, - del_count, - instance_config="", - wait_for_finalize=False, - batching=False): + self.__update_instance_count( + instance_count, 0, instance_config, batching=batching + ) + + def __update_instance_count( + self, + add_count, + del_count, + instance_config="", + wait_for_finalize=False, + batching=False, + ): self.assertIsInstance(add_count, int) self.assertGreaterEqual(add_count, 0) self.assertIsInstance(del_count, int) @@ -122,8 +130,7 @@ def __update_instance_count(self, if len(instance_config) == 0: prev_count = prev_initialize_count - prev_finalize_count new_count = prev_count + add_count - del_count - instance_config = ("{\ncount: " + str(new_count) + - "\nkind: KIND_CPU\n}") + instance_config = "{\ncount: " + str(new_count) + "\nkind: KIND_CPU\n}" update_instance_group(instance_config) self.__triton.load_model(self.__model_name) self.__check_count("initialize", new_initialize_count) @@ -190,20 +197,20 @@ def test_gpu_instance_update(self): def test_gpu_cpu_instance_update(self): # Load model 
with 1 GPU instance and 2 CPU instance self.__load_model( - 3, - "{\ncount: 2\nkind: KIND_CPU\n},\n{\ncount: 1\nkind: KIND_GPU\n}") + 3, "{\ncount: 2\nkind: KIND_CPU\n},\n{\ncount: 1\nkind: KIND_GPU\n}" + ) # Add 2 GPU instance and remove 1 CPU instance self.__update_instance_count( - 2, 1, - "{\ncount: 1\nkind: KIND_CPU\n},\n{\ncount: 3\nkind: KIND_GPU\n}") + 2, 1, "{\ncount: 1\nkind: KIND_CPU\n},\n{\ncount: 3\nkind: KIND_GPU\n}" + ) # Shuffle the instances self.__update_instance_count( - 0, 0, - "{\ncount: 3\nkind: KIND_GPU\n},\n{\ncount: 1\nkind: KIND_CPU\n}") + 0, 0, "{\ncount: 3\nkind: KIND_GPU\n},\n{\ncount: 1\nkind: KIND_CPU\n}" + ) # Remove 1 GPU instance and add 1 CPU instance self.__update_instance_count( - 1, 1, - "{\ncount: 2\nkind: KIND_GPU\n},\n{\ncount: 2\nkind: KIND_CPU\n}") + 1, 1, "{\ncount: 2\nkind: KIND_GPU\n},\n{\ncount: 2\nkind: KIND_CPU\n}" + ) # Unload model self.__unload_model() @@ -212,12 +219,13 @@ def test_instance_name_update(self): # Load 3 instances with 2 different names self.__load_model( 3, - "{\nname: \"old_1\"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: \"old_2\"\ncount: 2\nkind: KIND_GPU\n}" + '{\nname: "old_1"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: "old_2"\ncount: 2\nkind: KIND_GPU\n}', ) # Change the instance names self.__update_instance_count( - 0, 0, - "{\nname: \"new_1\"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: \"new_2\"\ncount: 2\nkind: KIND_GPU\n}" + 0, + 0, + '{\nname: "new_1"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: "new_2"\ncount: 2\nkind: KIND_GPU\n}', ) # Unload model self.__unload_model() @@ -227,24 +235,27 @@ def test_instance_signature(self): # Load 2 GPU instances and 3 CPU instances self.__load_model( 5, - "{\nname: \"GPU_group\"\ncount: 2\nkind: KIND_GPU\n},\n{\nname: \"CPU_group\"\ncount: 3\nkind: KIND_CPU\n}" + '{\nname: "GPU_group"\ncount: 2\nkind: KIND_GPU\n},\n{\nname: "CPU_group"\ncount: 3\nkind: KIND_CPU\n}', ) # Flatten the instances representation self.__update_instance_count( - 0, 0, - "{\nname: \"CPU_1\"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: \"CPU_2_3\"\ncount: 2\nkind: KIND_CPU\n},\n{\nname: \"GPU_1\"\ncount: 1\nkind: KIND_GPU\n},\n{\nname: \"GPU_2\"\ncount: 1\nkind: KIND_GPU\n}" + 0, + 0, + '{\nname: "CPU_1"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: "CPU_2_3"\ncount: 2\nkind: KIND_CPU\n},\n{\nname: "GPU_1"\ncount: 1\nkind: KIND_GPU\n},\n{\nname: "GPU_2"\ncount: 1\nkind: KIND_GPU\n}', ) time.sleep(0.1) # larger the gap for config.pbtxt timestamp to update # Consolidate different representations self.__update_instance_count( - 0, 0, - "{\nname: \"CPU_group\"\ncount: 3\nkind: KIND_CPU\n},\n{\nname: \"GPU_group\"\ncount: 2\nkind: KIND_GPU\n}" + 0, + 0, + '{\nname: "CPU_group"\ncount: 3\nkind: KIND_CPU\n},\n{\nname: "GPU_group"\ncount: 2\nkind: KIND_GPU\n}', ) time.sleep(0.1) # larger the gap for config.pbtxt timestamp to update # Flatten the instances representation self.__update_instance_count( - 0, 0, - "{\nname: \"GPU_1\"\ncount: 1\nkind: KIND_GPU\n},\n{\nname: \"GPU_2\"\ncount: 1\nkind: KIND_GPU\n},\n{\nname: \"CPU_1\"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: \"CPU_2\"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: \"CPU_3\"\ncount: 1\nkind: KIND_CPU\n}" + 0, + 0, + '{\nname: "GPU_1"\ncount: 1\nkind: KIND_GPU\n},\n{\nname: "GPU_2"\ncount: 1\nkind: KIND_GPU\n},\n{\nname: "CPU_1"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: "CPU_2"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: "CPU_3"\ncount: 1\nkind: KIND_CPU\n}', ) # Unload model self.__unload_model() @@ -266,21 +277,22 @@ def test_invalid_config(self): def test_model_file_update(self): 
self.__load_model(5) update_model_file() - self.__update_instance_count(6, - 5, - "{\ncount: 6\nkind: KIND_CPU\n}", - wait_for_finalize=True) + self.__update_instance_count( + 6, 5, "{\ncount: 6\nkind: KIND_CPU\n}", wait_for_finalize=True + ) self.__unload_model() # Test instance update with non instance config changed in config.pbtxt def test_non_instance_config_update(self): self.__load_model(4, batching=False) enable_batching() - self.__update_instance_count(2, - 4, - "{\ncount: 2\nkind: KIND_CPU\n}", - wait_for_finalize=True, - batching=True) + self.__update_instance_count( + 2, + 4, + "{\ncount: 2\nkind: KIND_CPU\n}", + wait_for_finalize=True, + batching=True, + ) self.__unload_model(batching=True) # Test passing new instance config via load API @@ -320,8 +332,7 @@ def test_update_while_inferencing(self): infer_thread = pool.submit(self.__infer) time.sleep(2) # make sure inference has started update_start_time = time.time() - update_thread = pool.submit(self.__triton.load_model, - self.__model_name) + update_thread = pool.submit(self.__triton.load_model, self.__model_name) update_thread.result() update_end_time = time.time() infer_thread.result() @@ -347,8 +358,7 @@ def test_infer_while_updating(self): update_instance_group("{\ncount: 2\nkind: KIND_CPU\n}") with concurrent.futures.ThreadPoolExecutor() as pool: update_start_time = time.time() - update_thread = pool.submit(self.__triton.load_model, - self.__model_name) + update_thread = pool.submit(self.__triton.load_model, self.__model_name) time.sleep(2) # make sure update has started infer_start_time = time.time() infer_thread = pool.submit(self.__infer) @@ -369,18 +379,21 @@ def test_infer_while_updating(self): self.__unload_model() # Test instance resource requirement increase - @unittest.skipUnless("execution_count" in os.environ["RATE_LIMIT_MODE"], - "Rate limiter precondition not met for this test") + @unittest.skipUnless( + "execution_count" in os.environ["RATE_LIMIT_MODE"], + "Rate limiter precondition not met for this test", + ) def test_instance_resource_increase(self): # Load model self.__load_model( 1, - "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 2\n}\n]\n}\n}" + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 2\n}\n]\n}\n}', ) # Increase resource requirement self.__update_instance_count( - 1, 1, - "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 8\n}\n]\n}\n}" + 1, + 1, + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 8\n}\n]\n}\n}', ) # Check the model is not blocked from infer due to the default resource # possibly not updated to the larger resource requirement. 
@@ -401,42 +414,48 @@ def infer(): self.__unload_model() # Test instance resource requirement increase above explicit resource - @unittest.skipUnless(os.environ["RATE_LIMIT_MODE"] == - "execution_count_with_explicit_resource", - "Rate limiter precondition not met for this test") + @unittest.skipUnless( + os.environ["RATE_LIMIT_MODE"] == "execution_count_with_explicit_resource", + "Rate limiter precondition not met for this test", + ) def test_instance_resource_increase_above_explicit(self): # Load model self.__load_model( 1, - "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 2\n}\n]\n}\n}" + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 2\n}\n]\n}\n}', ) # Increase resource requirement with self.assertRaises(InferenceServerException): self.__update_instance_count( - 0, 0, - "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 32\n}\n]\n}\n}" + 0, + 0, + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 32\n}\n]\n}\n}', ) # Correct the resource requirement to match the explicit resource self.__update_instance_count( - 1, 1, - "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 10\n}\n]\n}\n}" + 1, + 1, + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 10\n}\n]\n}\n}', ) # Unload model self.__unload_model() # Test instance resource requirement decrease - @unittest.skipUnless("execution_count" in os.environ["RATE_LIMIT_MODE"], - "Rate limiter precondition not met for this test") + @unittest.skipUnless( + "execution_count" in os.environ["RATE_LIMIT_MODE"], + "Rate limiter precondition not met for this test", + ) def test_instance_resource_decrease(self): # Load model self.__load_model( 1, - "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 4\n}\n]\n}\n}" + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 4\n}\n]\n}\n}', ) # Decrease resource requirement self.__update_instance_count( - 1, 1, - "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 3\n}\n]\n}\n}" + 1, + 1, + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 3\n}\n]\n}\n}', ) # Unload model self.__unload_model() @@ -445,8 +464,11 @@ def test_instance_resource_decrease(self): # max resource is actually decreased. time.sleep(1) # make sure the log file is updated log_path = os.path.join( - os.environ["MODEL_LOG_DIR"], "instance_update_test.rate_limit_" + - os.environ["RATE_LIMIT_MODE"] + ".server.log") + os.environ["MODEL_LOG_DIR"], + "instance_update_test.rate_limit_" + + os.environ["RATE_LIMIT_MODE"] + + ".server.log", + ) with open(log_path, mode="r", encoding="utf-8", errors="strict") as f: if os.environ["RATE_LIMIT_MODE"] == "execution_count": # Make sure the previous max resource limit of 4 is reduced to 3 diff --git a/qa/L0_multi_server/test.sh b/qa/L0_multi_server/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_nan_inf/models/nan_inf_output/1/model.py b/qa/L0_nan_inf/models/nan_inf_output/1/model.py old mode 100644 new mode 100755 index df269edf52..d85c3b4702 --- a/qa/L0_nan_inf/models/nan_inf_output/1/model.py +++ b/qa/L0_nan_inf/models/nan_inf_output/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,24 +27,22 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import json + import numpy as np import triton_python_backend_utils as pb_utils class TritonPythonModel: - def initialize(self, args): - self.model_config = json.loads(args['model_config']) + self.model_config = json.loads(args["model_config"]) def execute(self, requests): - """ This function is called on inference request. - """ + """This function is called on inference request.""" responses = [] for _ in requests: # Include one of each specially parsed JSON value: nan, inf, and -inf - out_0 = np.array([np.nan, np.inf, np.NINF, 1, 2, 3], - dtype=np.float32) + out_0 = np.array([np.nan, np.inf, np.NINF, 1, 2, 3], dtype=np.float32) out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0) responses.append(pb_utils.InferenceResponse([out_tensor_0])) diff --git a/qa/L0_nan_inf/nan_inf_test.py b/qa/L0_nan_inf/nan_inf_test.py old mode 100644 new mode 100755 index 630c573a1b..3013b03850 --- a/qa/L0_nan_inf/nan_inf_test.py +++ b/qa/L0_nan_inf/nan_inf_test.py @@ -27,37 +27,34 @@ import sys -sys.path.append('../common') +sys.path.append("../common") import json -import unittest import traceback +import unittest -import requests import numpy as np -import tritonclient.http as tritonhttpclient +import requests +import test_util as tu import tritonclient.grpc as tritongrpcclient +import tritonclient.http as tritonhttpclient from tritonclient.utils import InferenceServerException -import test_util as tu class NanInfTest(tu.TestResultCollector): - expected_output = np.array([np.nan, np.inf, np.NINF, 1, 2, 3], - dtype=np.float32) + expected_output = np.array([np.nan, np.inf, np.NINF, 1, 2, 3], dtype=np.float32) model_name = "nan_inf_output" def test_http_raw(self): payload = { - "inputs": [{ - "name": "INPUT0", - "datatype": "FP32", - "shape": [1], - "data": [1] - }] + "inputs": [ + {"name": "INPUT0", "datatype": "FP32", "shape": [1], "data": [1]} + ] } response = requests.post( "http://localhost:8000/v2/models/nan_inf_output/infer", - data=json.dumps(payload)) + data=json.dumps(payload), + ) if not response.ok: self.assertTrue(False, "Response not OK: {}".format(response.text)) @@ -65,40 +62,40 @@ def test_http_raw(self): print(response.json()) except: self.assertTrue( - False, "Response was not valid JSON:\n{}".format(response.text)) + False, "Response was not valid JSON:\n{}".format(response.text) + ) def test_http(self): triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") inputs = [] - inputs.append(tritonhttpclient.InferInput('INPUT0', [1], "FP32")) + inputs.append(tritonhttpclient.InferInput("INPUT0", [1], "FP32")) self.infer_helper(triton_client, inputs) def test_grpc(self): triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") inputs = [] - inputs.append(tritongrpcclient.InferInput('INPUT0', [1], "FP32")) + inputs.append(tritongrpcclient.InferInput("INPUT0", [1], "FP32")) self.infer_helper(triton_client, inputs) def infer_helper(self, triton_client, inputs): inputs[0].set_data_from_numpy(np.arange(1, dtype=np.float32)) try: - results = triton_client.infer(model_name=self.model_name, - inputs=inputs) - output0_data = results.as_numpy('OUTPUT0') + results = triton_client.infer(model_name=self.model_name, inputs=inputs) + output0_data = results.as_numpy("OUTPUT0") # Verify output is as expected # Make sure nan's are equivalent when compared - 
output_correct = np.array_equal(output0_data, - self.expected_output, - equal_nan=True) + output_correct = np.array_equal( + output0_data, self.expected_output, equal_nan=True + ) self.assertTrue( - output_correct, - "didn't get expected output0: {}".format(output0_data)) + output_correct, "didn't get expected output0: {}".format(output0_data) + ) except InferenceServerException as ex: self.assertTrue(False, ex.message()) except: self.assertTrue(False, traceback.format_exc()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_nullchar_string/nullchar_string_client.py b/qa/L0_nullchar_string/nullchar_string_client.py old mode 100644 new mode 100755 index d90304856d..2d69b41b3d --- a/qa/L0_nullchar_string/nullchar_string_client.py +++ b/qa/L0_nullchar_string/nullchar_string_client.py @@ -26,47 +26,51 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse -import numpy as np +import numpy as np import tritongrpcclient as grpcclient import tritonhttpclient as httpclient from tritonclientutils import np_to_triton_dtype FLAGS = None -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-m', - '--model-name', - type=str, - required=True, - help='Name of model') - parser.add_argument('-u', - '--url', - type=str, - required=False, - default='localhost:8000', - help='Inference server URL. Default is localhost:8000.') parser.add_argument( - '-i', - '--protocol', + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-m", "--model-name", type=str, required=True, help="Name of model" + ) + parser.add_argument( + "-u", + "--url", + type=str, + required=False, + default="localhost:8000", + help="Inference server URL. Default is localhost:8000.", + ) + parser.add_argument( + "-i", + "--protocol", type=str, required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') + default="http", + help='Protocol ("http"/"grpc") used to ' + + 'communicate with inference service. Default is "http".', + ) FLAGS = parser.parse_args() if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) + print( + 'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol) + ) exit(1) client_util = httpclient if FLAGS.protocol == "http" else grpcclient @@ -86,8 +90,9 @@ # Send inference request to the inference server. Get results for # output tensor. inputs = [ - client_util.InferInput("INPUT0", input0_data.shape, - np_to_triton_dtype(np.object_)) + client_util.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(np.object_) + ) ] inputs[0].set_data_from_numpy(input0_data) @@ -95,7 +100,7 @@ # We expect there to be 1 result (with batch-size 1). Compare the input # and output tensor calculated by the model. They must be the same. 
- output0_data = results.as_numpy('OUTPUT0') + output0_data = results.as_numpy("OUTPUT0") print(input0_data, "?=?", output0_data) assert np.equal(input0_data.astype(np.bytes_), output0_data).all() diff --git a/qa/L0_nullchar_string/test.sh b/qa/L0_nullchar_string/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_optional_input/models/ensemble_identity_2_float32/config.pbtxt b/qa/L0_optional_input/models/ensemble_identity_2_float32/config.pbtxt old mode 100755 new mode 100644 diff --git a/qa/L0_optional_input/models/identity_2_float32/config.pbtxt b/qa/L0_optional_input/models/identity_2_float32/config.pbtxt old mode 100755 new mode 100644 diff --git a/qa/L0_optional_input/models/pipeline_identity_2_float32/config.pbtxt b/qa/L0_optional_input/models/pipeline_identity_2_float32/config.pbtxt old mode 100755 new mode 100644 diff --git a/qa/L0_optional_input/optional_input_test.py b/qa/L0_optional_input/optional_input_test.py old mode 100644 new mode 100755 index c813146ecd..308efebf45 --- a/qa/L0_optional_input/optional_input_test.py +++ b/qa/L0_optional_input/optional_input_test.py @@ -30,13 +30,14 @@ sys.path.append("../common") -import numpy as np import sys -import time import threading +import time import unittest -import tritonclient.grpc as grpcclient + +import numpy as np import test_util as tu +import tritonclient.grpc as grpcclient _deferred_exceptions_lock = threading.Lock() _deferred_exceptions = [] @@ -44,31 +45,30 @@ # Similar set up as dynamic batcher tests class OptionalInputTest(tu.TestResultCollector): - def setUp(self): global _deferred_exceptions _deferred_exceptions = [] # The helper client for setup will be GRPC for simplicity. self.triton_client_ = grpcclient.InferenceServerClient("localhost:8001") - self.model_name_ = 'identity_2_float32' + self.model_name_ = "identity_2_float32" # This will not be changed even when ensemble is under test, # as the dynamic batching is performed within the composing model - self.check_status_model = 'identity_2_float32' + self.check_status_model = "identity_2_float32" self.tensor_shape_ = (1, 1) self.inputs_ = { - "INPUT0": grpcclient.InferInput('INPUT0', [1, 1], "FP32"), - "INPUT1": grpcclient.InferInput('INPUT1', [1, 1], "FP32") + "INPUT0": grpcclient.InferInput("INPUT0", [1, 1], "FP32"), + "INPUT1": grpcclient.InferInput("INPUT1", [1, 1], "FP32"), } self.input_data_ = { "INPUT0": np.ones(shape=(1, 1), dtype=np.float32), - "INPUT1": np.zeros(shape=(1, 1), dtype=np.float32) + "INPUT1": np.zeros(shape=(1, 1), dtype=np.float32), } self.inputs_["INPUT0"].set_data_from_numpy(self.input_data_["INPUT0"]) self.inputs_["INPUT1"].set_data_from_numpy(self.input_data_["INPUT1"]) self.outputs_ = { - "INPUT0": grpcclient.InferRequestedOutput('OUTPUT0'), - "INPUT1": grpcclient.InferRequestedOutput('OUTPUT1') + "INPUT0": grpcclient.InferRequestedOutput("OUTPUT0"), + "INPUT1": grpcclient.InferRequestedOutput("OUTPUT1"), } def add_deferred_exception(self, ex): @@ -93,9 +93,9 @@ def check_response(self, thresholds, provided_inputs=("INPUT0", "INPUT1")): outputs.append(self.outputs_[provided_input]) triton_client = grpcclient.InferenceServerClient("localhost:8001") - results = triton_client.infer(model_name=self.model_name_, - inputs=inputs, - outputs=outputs) + results = triton_client.infer( + model_name=self.model_name_, inputs=inputs, outputs=outputs + ) end_ms = int(round(time.time() * 1000)) @@ -106,20 +106,30 @@ def check_response(self, thresholds, provided_inputs=("INPUT0", "INPUT1")): self.assertTrue( np.array_equal(output_data, 
expected), "{}, {}, expected: {}, got {}".format( - self.model_name_, output_name, expected, output_data)) + self.model_name_, output_name, expected, output_data + ), + ) gt_ms = thresholds[0] lt_ms = thresholds[1] if lt_ms is not None: self.assertTrue( (end_ms - start_ms) < lt_ms, - "expected less than " + str(lt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected less than " + + str(lt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) if gt_ms is not None: self.assertTrue( (end_ms - start_ms) > gt_ms, - "expected greater than " + str(gt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected greater than " + + str(gt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) except Exception as ex: self.add_deferred_exception(ex) @@ -129,56 +139,75 @@ def check_status(self, model_name, batch_exec, request_cnt, infer_cnt): # inference statistics to be ready. num_tries = 10 for i in range(num_tries): - stats = self.triton_client_.get_inference_statistics( - model_name, "1") + stats = self.triton_client_.get_inference_statistics(model_name, "1") self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") actual_exec_cnt = stats.model_stats[0].execution_count if actual_exec_cnt == exec_cnt: break - print("WARNING: expect {} executions, got {} (attempt {})".format( - exec_cnt, actual_exec_cnt, i)) + print( + "WARNING: expect {} executions, got {} (attempt {})".format( + exec_cnt, actual_exec_cnt, i + ) + ) time.sleep(1) - self.assertEqual(stats.model_stats[0].name, model_name, - "expect model stats for model {}".format(model_name)) self.assertEqual( - stats.model_stats[0].version, "1", - "expect model stats for model {} version 1".format(model_name)) + stats.model_stats[0].name, + model_name, + "expect model stats for model {}".format(model_name), + ) + self.assertEqual( + stats.model_stats[0].version, + "1", + "expect model stats for model {} version 1".format(model_name), + ) batch_stats = stats.model_stats[0].batch_stats self.assertEqual( - len(batch_stats), len(batch_exec), + len(batch_stats), + len(batch_exec), "expected {} different batch-sizes, got {}".format( - len(batch_exec), len(batch_stats))) + len(batch_exec), len(batch_stats) + ), + ) for batch_stat in batch_stats: bs = batch_stat.batch_size bc = batch_stat.compute_infer.count - self.assertTrue(bs in batch_exec, - "unexpected batch-size {}".format(bs)) + self.assertTrue(bs in batch_exec, "unexpected batch-size {}".format(bs)) # Get count from one of the stats self.assertEqual( - bc, batch_exec[bs], - "expected model-execution-count {} for batch size {}, got {}". 
- format(batch_exec[bs], bs, bc)) + bc, + batch_exec[bs], + "expected model-execution-count {} for batch size {}, got {}".format( + batch_exec[bs], bs, bc + ), + ) actual_request_cnt = stats.model_stats[0].inference_stats.success.count self.assertEqual( - actual_request_cnt, request_cnt, + actual_request_cnt, + request_cnt, "expected model-request-count {}, got {}".format( - request_cnt, actual_request_cnt)) + request_cnt, actual_request_cnt + ), + ) actual_exec_cnt = stats.model_stats[0].execution_count self.assertEqual( - actual_request_cnt, request_cnt, - "expected model-exec-count {}, got {}".format( - request_cnt, actual_exec_cnt)) + actual_request_cnt, + request_cnt, + "expected model-exec-count {}, got {}".format(request_cnt, actual_exec_cnt), + ) actual_infer_cnt = stats.model_stats[0].inference_count self.assertEqual( - actual_infer_cnt, infer_cnt, + actual_infer_cnt, + infer_cnt, "expected model-inference-count {}, got {}".format( - infer_cnt, actual_infer_cnt)) + infer_cnt, actual_infer_cnt + ), + ) def test_all_inputs(self): # Provide all inputs, send requests that don't form preferred batch @@ -186,11 +215,11 @@ def test_all_inputs(self): try: threads = [] threads.append( - threading.Thread(target=self.check_response, - args=((4000, None),))) + threading.Thread(target=self.check_response, args=((4000, None),)) + ) threads.append( - threading.Thread(target=self.check_response, - args=((4000, None),))) + threading.Thread(target=self.check_response, args=((4000, None),)) + ) threads[0].start() threads[1].start() for t in threads: @@ -207,13 +236,19 @@ def test_optional_same_input(self): try: threads = [] threads.append( - threading.Thread(target=self.check_response, - args=((4000, None),), - kwargs={'provided_inputs': ("INPUT1",)})) + threading.Thread( + target=self.check_response, + args=((4000, None),), + kwargs={"provided_inputs": ("INPUT1",)}, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=((4000, None),), - kwargs={'provided_inputs': ("INPUT1",)})) + threading.Thread( + target=self.check_response, + args=((4000, None),), + kwargs={"provided_inputs": ("INPUT1",)}, + ) + ) threads[0].start() threads[1].start() for t in threads: @@ -231,22 +266,34 @@ def test_optional_mix_inputs(self): try: threads = [] threads.append( - threading.Thread(target=self.check_response, - args=((0, 4000),), - kwargs={'provided_inputs': ("INPUT0",)})) + threading.Thread( + target=self.check_response, + args=((0, 4000),), + kwargs={"provided_inputs": ("INPUT0",)}, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=((0, 4000),), - kwargs={'provided_inputs': ("INPUT1",)})) + threading.Thread( + target=self.check_response, + args=((0, 4000),), + kwargs={"provided_inputs": ("INPUT1",)}, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=((0, 4000),), - kwargs={'provided_inputs': ("INPUT0",)})) + threading.Thread( + target=self.check_response, + args=((0, 4000),), + kwargs={"provided_inputs": ("INPUT0",)}, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=((4000, None),), - kwargs={'provided_inputs': ("INPUT1",)})) + threading.Thread( + target=self.check_response, + args=((4000, None),), + kwargs={"provided_inputs": ("INPUT1",)}, + ) + ) for t in threads: t.start() time.sleep(0.5) @@ -266,19 +313,26 @@ def test_optional_mix_inputs_2(self): try: threads = [] threads.append( - threading.Thread(target=self.check_response, - args=((0, 4000),), - kwargs={'provided_inputs': ("INPUT0",)})) + 
threading.Thread( + target=self.check_response, + args=((0, 4000),), + kwargs={"provided_inputs": ("INPUT0",)}, + ) + ) threads.append( - threading.Thread(target=self.check_response, args=((0, 4000),))) + threading.Thread(target=self.check_response, args=((0, 4000),)) + ) threads.append( - threading.Thread(target=self.check_response, - args=((0, 4000),), - kwargs={'provided_inputs': ("INPUT0",)})) + threading.Thread( + target=self.check_response, + args=((0, 4000),), + kwargs={"provided_inputs": ("INPUT0",)}, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=((4000, None),))) + threading.Thread(target=self.check_response, args=((4000, None),)) + ) for t in threads: t.start() time.sleep(0.5) @@ -292,28 +346,28 @@ def test_optional_mix_inputs_2(self): def test_ensemble_all_inputs(self): # The ensemble is only a wrapper over 'identity_2_float32' - self.model_name_ = 'ensemble_identity_2_float32' + self.model_name_ = "ensemble_identity_2_float32" self.test_all_inputs() # From the ensemble's perspective, the requests are processed as it is self.check_status(self.model_name_, {1: 2}, 2, 2) def test_ensemble_optional_same_input(self): # The ensemble is only a wrapper over 'identity_2_float32' - self.model_name_ = 'ensemble_identity_2_float32' + self.model_name_ = "ensemble_identity_2_float32" self.test_optional_same_input() # From the ensemble's perspective, the requests are processed as it is self.check_status(self.model_name_, {1: 2}, 2, 2) def test_ensemble_optional_mix_inputs(self): # The ensemble is only a wrapper over 'identity_2_float32' - self.model_name_ = 'ensemble_identity_2_float32' + self.model_name_ = "ensemble_identity_2_float32" self.test_optional_mix_inputs() # From the ensemble's perspective, the requests are processed as it is self.check_status(self.model_name_, {1: 4}, 4, 4) def test_ensemble_optional_mix_inputs_2(self): # The ensemble is only a wrapper over 'identity_2_float32' - self.model_name_ = 'ensemble_identity_2_float32' + self.model_name_ = "ensemble_identity_2_float32" self.test_optional_mix_inputs_2() # From the ensemble's perspective, the requests are processed as it is self.check_status(self.model_name_, {1: 4}, 4, 4) @@ -323,7 +377,7 @@ def test_ensemble_optional_pipeline(self): # inputs, where the ensemble step only connects a subset of inputs # for the second model (which is valid because the disconnected inputs # are marked optional). See 'config.pbtxt' for detail. 
- self.model_name_ = 'pipeline_identity_2_float32' + self.model_name_ = "pipeline_identity_2_float32" # Provide all inputs, send requests that don't form preferred batch # so all requests should be returned after the queue delay @@ -334,28 +388,29 @@ def test_ensemble_optional_pipeline(self): inputs.append(self.inputs_[provided_input]) triton_client = grpcclient.InferenceServerClient("localhost:8001") - results = triton_client.infer(model_name=self.model_name_, - inputs=inputs) + results = triton_client.infer(model_name=self.model_name_, inputs=inputs) # OUTPU0 is always zero, OUTPUT1 = INPUT0 output_data = results.as_numpy("OUTPUT0") expected = np.zeros(shape=(1, 1), dtype=np.float32) self.assertTrue( np.array_equal(output_data, expected), - "{}, {}, expected: {}, got {}".format(self.model_name_, - "OUTPUT0", expected, - output_data)) + "{}, {}, expected: {}, got {}".format( + self.model_name_, "OUTPUT0", expected, output_data + ), + ) expected = self.input_data_["INPUT0"] output_data = results.as_numpy("OUTPUT1") self.assertTrue( np.array_equal(output_data, expected), - "{}, {}, expected: {}, got {}".format(self.model_name_, - "OUTPUT1", expected, - output_data)) + "{}, {}, expected: {}, got {}".format( + self.model_name_, "OUTPUT1", expected, output_data + ), + ) except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_output_name/output_name_test.py b/qa/L0_output_name/output_name_test.py old mode 100644 new mode 100755 index 46a464ada0..905174640c --- a/qa/L0_output_name/output_name_test.py +++ b/qa/L0_output_name/output_name_test.py @@ -30,17 +30,16 @@ sys.path.append("../common") import unittest + import test_util as tu +from tritongrpcclient import grpc_service_pb2, grpc_service_pb2_grpc import grpc -from tritongrpcclient import grpc_service_pb2 -from tritongrpcclient import grpc_service_pb2_grpc _trials = ("graphdef", "libtorch", "onnx", "plan", "savedmodel") class OutputNameValidationTest(tu.TestResultCollector): - def requestGenerator(self, model_name, output_name): request = grpc_service_pb2.ModelInferRequest() request.model_name = model_name @@ -53,12 +52,11 @@ def requestGenerator(self, model_name, output_name): request.inputs.extend([input]) - output = grpc_service_pb2.ModelInferRequest( - ).InferRequestedOutputTensor() + output = grpc_service_pb2.ModelInferRequest().InferRequestedOutputTensor() output.name = output_name request.outputs.extend([output]) - request.raw_input_contents.extend([bytes(4 * 'a', 'utf-8')]) + request.raw_input_contents.extend([bytes(4 * "a", "utf-8")]) return request @@ -73,14 +71,14 @@ def test_grpc(self): try: response = grpc_stub.ModelInfer(request) self.assertTrue( - False, - "unexpected success for unknown output " + model_name) + False, "unexpected success for unknown output " + model_name + ) except grpc.RpcError as rpc_error: msg = rpc_error.details() self.assertTrue( - msg.startswith( - "unexpected inference output 'DUMMY' for model")) + msg.startswith("unexpected inference output 'DUMMY' for model") + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_output_name/test.sh b/qa/L0_output_name/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_output_validation/lt_op_val_client.py b/qa/L0_output_validation/lt_op_val_client.py old mode 100644 new mode 100755 index 220ae42e64..77b5a16e3f --- a/qa/L0_output_validation/lt_op_val_client.py +++ 
b/qa/L0_output_validation/lt_op_val_client.py @@ -30,41 +30,44 @@ sys.path.append("../common") -import requests import unittest + +import requests import test_util as tu class OutputValidationTest(tu.TestResultCollector): # for datatype mismatch def test_datatype(self): - url = 'http://localhost:8000/v2/models/libtorch_datatype_1_float32/infer' + url = "http://localhost:8000/v2/models/libtorch_datatype_1_float32/infer" body = '{"inputs":[{"name":"INPUT__0","shape":[1,1],"datatype":"FP32","data":[1.0]}],"outputs":[{"name":"OUTPUT__0"}]}' response = requests.post(url, data=body) msg = response.json()["error"] self.assertTrue( msg.startswith( "configuration expects datatype TYPE_INT32 for output 'OUTPUT__0', model provides TYPE_FP32" - )) + ) + ) # for output mismatch def test_index(self): - url = 'http://localhost:8000/v2/models/libtorch_index_1_float32/infer' + url = "http://localhost:8000/v2/models/libtorch_index_1_float32/infer" body = '{"inputs":[{"name":"INPUT__0","shape":[1,1],"datatype":"FP32","data":[1.0]}],"outputs":[{"name":"OUTPUT__1"}]}' response = requests.post(url, data=body) msg = response.json()["error"] self.assertTrue( msg.startswith( "The output OUTPUT__1 in the model configuration refers to an output index which doesn't exist. This model has 1 outputs" - )) + ) + ) # successful run def test_success(self): - url = 'http://localhost:8000/v2/models/libtorch_zero_1_float32/infer' + url = "http://localhost:8000/v2/models/libtorch_zero_1_float32/infer" body = '{"inputs":[{"name":"INPUT__0","shape":[1,1],"datatype":"FP32","data":[1.0]}],"outputs":[{"name":"OUTPUT__0"}]}' response = requests.post(url, data=body) self.assertEqual(response.status_code, 200) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_output_validation/test.sh b/qa/L0_output_validation/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_parallel_copy/parallel_copy_test.py b/qa/L0_parallel_copy/parallel_copy_test.py old mode 100644 new mode 100755 index 4fdf406cc1..6748fee006 --- a/qa/L0_parallel_copy/parallel_copy_test.py +++ b/qa/L0_parallel_copy/parallel_copy_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,33 +30,36 @@ sys.path.append("../common") -from builtins import range +import functools import time import unittest +from builtins import range + import numpy as np import test_util as tu -import functools import tritonclient.grpc as grpcclient from tritonclient.utils import InferenceServerException class ParallelCopyTest(tu.TestResultCollector): - def setUp(self): self.client_ = grpcclient.InferenceServerClient("localhost:8001") self.dtype_ = np.float32 - self.model_name_ = tu.get_zero_model_name('plan', 1, self.dtype_) + self.model_name_ = tu.get_zero_model_name("plan", 1, self.dtype_) def _batch_input_duration(self, batch_size): stats = self.client_.get_inference_statistics(self.model_name_, "1") self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") self.assertEqual( - stats.model_stats[0].name, self.model_name_, - "expect model stats for model {}".format(self.model_name_)) + stats.model_stats[0].name, + self.model_name_, + "expect model stats for model {}".format(self.model_name_), + ) self.assertEqual( - stats.model_stats[0].version, "1", - "expect model stats for model {} version 1".format( - self.model_name_)) + stats.model_stats[0].version, + "1", + "expect model stats for model {} version 1".format(self.model_name_), + ) batch_stats = stats.model_stats[0].batch_stats @@ -70,10 +75,11 @@ def _run(self, batch_sizes): np.random.random([bs, 16 * 1024 * 1024]).astype(self.dtype_) for bs in batch_sizes ] - inputs = [[ - grpcclient.InferInput('INPUT0', [bs, 16 * 1024 * 1024], "FP32") - ] for bs in batch_sizes] - output = [grpcclient.InferRequestedOutput('OUTPUT0')] + inputs = [ + [grpcclient.InferInput("INPUT0", [bs, 16 * 1024 * 1024], "FP32")] + for bs in batch_sizes + ] + output = [grpcclient.InferRequestedOutput("OUTPUT0")] for idx in range(len(inputs)): inputs[idx][0].set_data_from_numpy(input_data[idx]) @@ -89,11 +95,12 @@ def callback(user_data, idx, result, error): before_compute_input_duration = self._batch_input_duration(batch_size) for idx in range(len(batch_sizes)): - self.client_.async_infer(model_name=self.model_name_, - inputs=inputs[idx], - callback=functools.partial( - callback, user_data, idx), - outputs=output) + self.client_.async_infer( + model_name=self.model_name_, + inputs=inputs[idx], + callback=functools.partial(callback, user_data, idx), + outputs=output, + ) # Wait until the results are available in user_data time_out = 20 @@ -108,19 +115,24 @@ def callback(user_data, idx, result, error): time_out = time_out - 1 time.sleep(1) done_cnt = functools.reduce( - lambda dc, x: dc + 1 if x is not None else dc, user_data, 0) + lambda dc, x: dc + 1 if x is not None else dc, user_data, 0 + ) self.assertEqual( - done_cnt, len(batch_sizes), - "expected {} responses, got {}".format(len(batch_sizes), done_cnt)) + done_cnt, + len(batch_sizes), + "expected {} responses, got {}".format(len(batch_sizes), done_cnt), + ) for idx in range(len(batch_sizes)): res = user_data[idx] self.assertFalse( type(res) == InferenceServerException, - "expected response for request {}, got exception {}".format( - idx, res)) - output_data = res.as_numpy('OUTPUT0') - self.assertTrue(np.array_equal(output_data, input_data[idx]), - "Mismatched output data for request {}".format(idx)) + "expected response for request {}, got exception {}".format(idx, res), + ) + output_data = res.as_numpy("OUTPUT0") + self.assertTrue( + 
np.array_equal(output_data, input_data[idx]), + "Mismatched output data for request {}".format(idx), + ) after_compute_input_duration = self._batch_input_duration(batch_size) return after_compute_input_duration - before_compute_input_duration @@ -135,13 +147,17 @@ def test_performance(self): # The following check is loose, local runs show that the speedup is not # significant (~15%), may be due to the dispatch overhead - # which cancels part of the improvment + # which cancels part of the improvement self.assertTrue( serialized_time > parallelized_time, - "Expected parallelized copy is faster than serialized copy") - print("serialized v.s. parallelized : {} v.s. {}".format( - serialized_time, parallelized_time)) + "Expected parallelized copy is faster than serialized copy", + ) + print( + "serialized v.s. parallelized : {} v.s. {}".format( + serialized_time, parallelized_time + ) + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_parameters/model_repository/parameter/1/model.py b/qa/L0_parameters/model_repository/parameter/1/model.py old mode 100644 new mode 100755 index 70388d6c40..458d5467c8 --- a/qa/L0_parameters/model_repository/parameter/1/model.py +++ b/qa/L0_parameters/model_repository/parameter/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,39 +26,34 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import triton_python_backend_utils as pb_utils -import numpy as np import json +import numpy as np +import triton_python_backend_utils as pb_utils -class TritonPythonModel: +class TritonPythonModel: @staticmethod def auto_complete_config(auto_complete_model_config): - inputs = [{'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [1]}] - outputs = [{ - 'name': 'key', - 'data_type': 'TYPE_STRING', - 'dims': [-1] - }, { - 'name': 'value', - 'data_type': 'TYPE_STRING', - 'dims': [-1] - }] + inputs = [{"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [1]}] + outputs = [ + {"name": "key", "data_type": "TYPE_STRING", "dims": [-1]}, + {"name": "value", "data_type": "TYPE_STRING", "dims": [-1]}, + ] config = auto_complete_model_config.as_dict() input_names = [] output_names = [] - for input in config['input']: - input_names.append(input['name']) - for output in config['output']: - output_names.append(output['name']) + for input in config["input"]: + input_names.append(input["name"]) + for output in config["output"]: + output_names.append(output["name"]) for input in inputs: - if input['name'] not in input_names: + if input["name"] not in input_names: auto_complete_model_config.add_input(input) for output in outputs: - if output['name'] not in output_names: + if output["name"] not in output_names: auto_complete_model_config.add_output(output) auto_complete_model_config.set_max_batch_size(0) @@ -73,10 +70,10 @@ def execute(self, requests): keys.append(key) values.append(value) key_output = pb_utils.Tensor("key", np.asarray(keys, dtype=object)) - value_output = pb_utils.Tensor("value", - np.asarray(values, dtype=object)) + value_output = pb_utils.Tensor("value", np.asarray(values, dtype=object)) inference_response = pb_utils.InferenceResponse( - 
output_tensors=[key_output, value_output]) + output_tensors=[key_output, value_output] + ) responses.append(inference_response) return responses diff --git a/qa/L0_parameters/parameters_test.py b/qa/L0_parameters/parameters_test.py old mode 100644 new mode 100755 index 5cbc2c7586..0a2f142e34 --- a/qa/L0_parameters/parameters_test.py +++ b/qa/L0_parameters/parameters_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,50 +30,49 @@ sys.path.append("../common") +import os +import queue +import unittest +from functools import partial +from unittest import IsolatedAsyncioTestCase + import numpy as np -import tritonclient.http as httpclient import tritonclient.grpc as grpcclient -import tritonclient.http.aio as asynchttpclient import tritonclient.grpc.aio as asyncgrpcclient +import tritonclient.http as httpclient +import tritonclient.http.aio as asynchttpclient from tritonclient.utils import InferenceServerException -from unittest import IsolatedAsyncioTestCase -import unittest -import queue -from functools import partial -import os -TEST_HEADER = os.environ.get('TEST_HEADER') +TEST_HEADER = os.environ.get("TEST_HEADER") -class InferenceParametersTest(IsolatedAsyncioTestCase): +class InferenceParametersTest(IsolatedAsyncioTestCase): async def asyncSetUp(self): - self.http = httpclient.InferenceServerClient(url='localhost:8000') - self.async_http = asynchttpclient.InferenceServerClient( - url='localhost:8000') - self.grpc = grpcclient.InferenceServerClient(url='localhost:8001') - self.async_grpc = asyncgrpcclient.InferenceServerClient( - url='localhost:8001') + self.http = httpclient.InferenceServerClient(url="localhost:8000") + self.async_http = asynchttpclient.InferenceServerClient(url="localhost:8000") + self.grpc = grpcclient.InferenceServerClient(url="localhost:8001") + self.async_grpc = asyncgrpcclient.InferenceServerClient(url="localhost:8001") self.parameter_list = [] - self.parameter_list.append({'key1': 'value1', 'key2': 'value2'}) - self.parameter_list.append({'key1': 1, 'key2': 2}) - self.parameter_list.append({'key1': True, 'key2': 'value2'}) - self.parameter_list.append({'triton_': True, 'key2': 'value2'}) + self.parameter_list.append({"key1": "value1", "key2": "value2"}) + self.parameter_list.append({"key1": 1, "key2": 2}) + self.parameter_list.append({"key1": True, "key2": "value2"}) + self.parameter_list.append({"triton_": True, "key2": "value2"}) if TEST_HEADER == "1": self.headers = { - 'header_1': 'value_1', - 'header_2': 'value_2', - 'my_header_1': 'my_value_1', - 'my_header_2': 'my_value_2', - 'my_header_3': 'This is a "quoted" string with a backslash\ ' + "header_1": "value_1", + "header_2": "value_2", + "my_header_1": "my_value_1", + "my_header_2": "my_value_2", + "my_header_3": 'This is a "quoted" string with a backslash\ ', } # only these headers should be forwarded to the model. 
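For orientation, a minimal sketch of the client-side call the tests below exercise: an inference request carrying custom parameters and headers. The server address and sample parameter values are assumptions; the model name "parameter" is the test fixture used here.

import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")
inputs = [httpclient.InferInput("INPUT0", [1], "FP32")]
inputs[0].set_data_from_numpy(np.asarray([1], dtype=np.float32))

# Keys prefixed with "triton_" are reserved by the server and rejected,
# which is what the should_error path in the tests checks.
result = client.infer(
    model_name="parameter",
    inputs=inputs,
    parameters={"key1": "value1", "key2": 2},
    headers={"my_header_1": "my_value_1"},
)
print(result.as_numpy("key"), result.as_numpy("value"))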
self.expected_headers = { - 'my_header_1': 'my_value_1', - 'my_header_2': 'my_value_2', - 'my_header_3': 'This is a "quoted" string with a backslash\ ' + "my_header_1": "my_value_1", + "my_header_2": "my_value_2", + "my_header_3": 'This is a "quoted" string with a backslash\ ', } else: self.headers = {} @@ -87,60 +88,63 @@ def callback(user_data, result, error): def create_inputs(self, client_type): inputs = [] - inputs.append(client_type.InferInput('INPUT0', [1], "FP32")) + inputs.append(client_type.InferInput("INPUT0", [1], "FP32")) # Initialize the data inputs[0].set_data_from_numpy(np.asarray([1], dtype=np.float32)) return inputs - async def send_request_and_verify(self, - client_type, - client, - is_async=False): - + async def send_request_and_verify(self, client_type, client, is_async=False): inputs = self.create_inputs(client_type) for parameters in self.parameter_list: # The `triton_` prefix is reserved for Triton usage should_error = False - if 'triton_' in parameters.keys(): + if "triton_" in parameters.keys(): should_error = True if is_async: if should_error: with self.assertRaises(InferenceServerException): - result = await client.infer(model_name='parameter', - inputs=inputs, - parameters=parameters, - headers=self.headers) + result = await client.infer( + model_name="parameter", + inputs=inputs, + parameters=parameters, + headers=self.headers, + ) return else: - result = await client.infer(model_name='parameter', - inputs=inputs, - parameters=parameters, - headers=self.headers) + result = await client.infer( + model_name="parameter", + inputs=inputs, + parameters=parameters, + headers=self.headers, + ) else: if should_error: with self.assertRaises(InferenceServerException): - result = client.infer(model_name='parameter', - inputs=inputs, - parameters=parameters, - headers=self.headers) + result = client.infer( + model_name="parameter", + inputs=inputs, + parameters=parameters, + headers=self.headers, + ) return else: - result = client.infer(model_name='parameter', - inputs=inputs, - parameters=parameters, - headers=self.headers) + result = client.infer( + model_name="parameter", + inputs=inputs, + parameters=parameters, + headers=self.headers, + ) self.verify_outputs(result, parameters) def verify_outputs(self, result, parameters): - keys = result.as_numpy('key') - values = result.as_numpy('value') + keys = result.as_numpy("key") + values = result.as_numpy("value") keys = keys.astype(str).tolist() - expected_keys = list(parameters.keys()) + list( - self.expected_headers.keys()) + expected_keys = list(parameters.keys()) + list(self.expected_headers.keys()) self.assertEqual(set(keys), set(expected_keys)) # We have to convert the parameter values to string @@ -158,24 +162,26 @@ async def test_http_parameter(self): await self.send_request_and_verify(httpclient, self.http) async def test_async_http_parameter(self): - await self.send_request_and_verify(asynchttpclient, - self.async_http, - is_async=True) + await self.send_request_and_verify( + asynchttpclient, self.async_http, is_async=True + ) async def test_async_grpc_parameter(self): - await self.send_request_and_verify(asyncgrpcclient, - self.async_grpc, - is_async=True) + await self.send_request_and_verify( + asyncgrpcclient, self.async_grpc, is_async=True + ) def test_http_async_parameter(self): inputs = self.create_inputs(httpclient) # Skip the parameter that returns an error parameter_list = self.parameter_list[:-1] for parameters in parameter_list: - result = self.http.async_infer(model_name='parameter', - inputs=inputs, - 
parameters=parameters, - headers=self.headers).get_result() + result = self.http.async_infer( + model_name="parameter", + inputs=inputs, + parameters=parameters, + headers=self.headers, + ).get_result() self.verify_outputs(result, parameters) def test_grpc_async_parameter(self): @@ -184,28 +190,30 @@ def test_grpc_async_parameter(self): # Skip the parameter that returns an error parameter_list = self.parameter_list[:-1] for parameters in parameter_list: - self.grpc.async_infer(model_name='parameter', - inputs=inputs, - parameters=parameters, - headers=self.headers, - callback=partial(self.grpc_callback, - user_data)) + self.grpc.async_infer( + model_name="parameter", + inputs=inputs, + parameters=parameters, + headers=self.headers, + callback=partial(self.grpc_callback, user_data), + ) result = user_data.get() self.assertFalse(result is InferenceServerException) self.verify_outputs(result, parameters) def test_grpc_stream_parameter(self): user_data = queue.Queue() - self.grpc.start_stream(callback=partial(self.grpc_callback, user_data), - headers=self.headers) + self.grpc.start_stream( + callback=partial(self.grpc_callback, user_data), headers=self.headers + ) inputs = self.create_inputs(grpcclient) # Skip the parameter that returns an error parameter_list = self.parameter_list[:-1] for parameters in parameter_list: # async stream infer - self.grpc.async_stream_infer(model_name='parameter', - inputs=inputs, - parameters=parameters) + self.grpc.async_stream_infer( + model_name="parameter", inputs=inputs, parameters=parameters + ) result = user_data.get() self.assertFalse(result is InferenceServerException) self.verify_outputs(result, parameters) @@ -218,5 +226,5 @@ async def asyncTearDown(self): await self.async_http.close() -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_parameters/test.sh b/qa/L0_parameters/test.sh old mode 100644 new mode 100755 index 48513cb61a..4c8ac00931 --- a/qa/L0_parameters/test.sh +++ b/qa/L0_parameters/test.sh @@ -50,7 +50,7 @@ source ../common/util.sh RET=0 for i in {0..1}; do - + # TEST_HEADER is a parameter used by `parameters_test.py` that controls # whether the script will test for inclusion of headers in parameters or not. if [ $i == 1 ]; then @@ -64,7 +64,7 @@ for i in {0..1}; do cat $SERVER_LOG exit 1 fi - + set +e TEST_HEADER=$i python3 $TEST_SCRIPT_PY >$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then @@ -72,9 +72,9 @@ for i in {0..1}; do echo -e "\n***\n*** Test Failed\n***" RET=1 fi - + set -e - + kill $SERVER_PID wait $SERVER_PID done diff --git a/qa/L0_passive_instance/models/distributed_int32_int32_int32/config.pbtxt b/qa/L0_passive_instance/models/distributed_int32_int32_int32/config.pbtxt old mode 100755 new mode 100644 diff --git a/qa/L0_passive_instance/passive_instance_test.py b/qa/L0_passive_instance/passive_instance_test.py old mode 100644 new mode 100755 index b96055b0b3..d7cdfffa7b --- a/qa/L0_passive_instance/passive_instance_test.py +++ b/qa/L0_passive_instance/passive_instance_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
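A compact sketch of the gRPC streaming pattern used in test_grpc_stream_parameter above, where a callback pushes each response (or error) into a queue and the caller blocks on queue.get(). The address and model name come from the tests; everything else is illustrative:

import queue
from functools import partial

import numpy as np
import tritonclient.grpc as grpcclient


def callback(user_data, result, error):
    # Store either the result or the error; the consumer sorts it out.
    user_data.put(error if error is not None else result)


client = grpcclient.InferenceServerClient(url="localhost:8001")
user_data = queue.Queue()
client.start_stream(callback=partial(callback, user_data))

inputs = [grpcclient.InferInput("INPUT0", [1], "FP32")]
inputs[0].set_data_from_numpy(np.asarray([1], dtype=np.float32))
client.async_stream_infer(model_name="parameter", inputs=inputs)

result = user_data.get()  # blocks until the streamed response arrives
client.stop_stream()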
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,20 +31,21 @@ sys.path.append("../common") import unittest -import numpy as np + import infer_util as iu +import numpy as np import test_util as tu class PassiveInstanceTest(tu.TestResultCollector): - def test_inference(self): try: - iu.infer_exact(self, "distributed", (1, 16), 1, np.int32, np.int32, - np.int32) + iu.infer_exact( + self, "distributed", (1, 16), 1, np.int32, np.int32, np.int32 + ) except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_passive_instance/test.sh b/qa/L0_passive_instance/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_perf_analyzer/test.sh b/qa/L0_perf_analyzer/test.sh index 8a6d0cfff4..42b80f009f 100755 --- a/qa/L0_perf_analyzer/test.sh +++ b/qa/L0_perf_analyzer/test.sh @@ -409,7 +409,7 @@ for PROTOCOL in grpc http; do RET=1 fi set -e - + # Binary search for concurrency range mode and make sure it doesn't hang $PERF_ANALYZER -v -a --request-distribution "poisson" --shared-memory none \ --percentile 99 --binary-search --concurrency-range 1:8:2 -l 5 \ @@ -809,8 +809,8 @@ set -e # Test with optional inputs missing and invalid set +e -OPTIONAL_INPUT_ERROR_STRING="For batch sizes larger than 1, the same set of -inputs must be specified for each batch. You cannot use different set of +OPTIONAL_INPUT_ERROR_STRING="For batch sizes larger than 1, the same set of +inputs must be specified for each batch. You cannot use different set of optional inputs for each individual batch." $PERF_ANALYZER -v -m optional -b 2 --measurement-mode "count_windows" \ --input-data=${INT_OPTIONAL_JSONDATAFILE} -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 @@ -854,7 +854,7 @@ if [ $(cat $CLIENT_LOG | grep "Request Rate: 40" | wc -l) -eq 0 ]; then fi set -e -# Test --serial-sequences mode +# Test --serial-sequences mode set +e $PERF_ANALYZER -v -i $PROTOCOL -m simple_savedmodel_sequence_object -p 1000 --request-rate-range 100:200:50 --serial-sequences \ --input-data=$SEQ_JSONDATAFILE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 @@ -880,7 +880,7 @@ if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" RET=1 -fi +fi set -e ## Test perf_analyzer with MPI / multiple models @@ -984,23 +984,23 @@ wait $SERVER_PID # Generate valid CA openssl genrsa -passout pass:1234 -des3 -out ca.key 4096 -openssl req -passin pass:1234 -new -x509 -days 365 -key ca.key -out ca.crt -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Test/CN=Root CA" +openssl req -passin pass:1234 -new -x509 -days 365 -key ca.key -out ca.crt -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Test/CN=Root CA" # Generate valid Server Key/Cert openssl genrsa -passout pass:1234 -des3 -out server.key 4096 -openssl req -passin pass:1234 -new -key server.key -out server.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Server/CN=localhost" -openssl x509 -req -passin pass:1234 -days 365 -in server.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out server.crt +openssl req -passin pass:1234 -new -key server.key -out server.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Server/CN=localhost" +openssl x509 -req -passin pass:1234 -days 365 -in server.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out server.crt # Remove passphrase from the Server Key -openssl rsa -passin pass:1234 -in server.key -out server.key +openssl 
rsa -passin pass:1234 -in server.key -out server.key # Generate valid Client Key/Cert openssl genrsa -passout pass:1234 -des3 -out client.key 4096 -openssl req -passin pass:1234 -new -key client.key -out client.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Client/CN=localhost" -openssl x509 -passin pass:1234 -req -days 365 -in client.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out client.crt +openssl req -passin pass:1234 -new -key client.key -out client.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Client/CN=localhost" +openssl x509 -passin pass:1234 -req -days 365 -in client.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out client.crt # Remove passphrase from Client Key -openssl rsa -passin pass:1234 -in client.key -out client.key +openssl rsa -passin pass:1234 -in client.key -out client.key # Create mutated client key (Make first char of each like capital) cp client.key client2.key && sed -i "s/\b\(.\)/\u\1/g" client2.key diff --git a/qa/L0_perf_analyzer_doc_links/test.sh b/qa/L0_perf_analyzer_doc_links/test.sh old mode 100644 new mode 100755 index 52e3e76e12..ec6eeef057 --- a/qa/L0_perf_analyzer_doc_links/test.sh +++ b/qa/L0_perf_analyzer_doc_links/test.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -33,14 +34,14 @@ python3 -m pip install mkdocs python3 -m pip install mkdocs-htmlproofer-plugin==0.10.3 #Download perf_analyzer docs -TRITON_CLIENT_REPO_TAG="${TRITON_CLIENT_REPO_TAG:=main}" +TRITON_CLIENT_REPO_TAG="${TRITON_CLIENT_REPO_TAG:=main}" git clone -b ${TRITON_CLIENT_REPO_TAG} https://github.com/triton-inference-server/client.git cp `pwd`/client/src/c++/perf_analyzer/README.md . cp -rf `pwd`/client/src/c++/perf_analyzer/docs . -# Need to remove all links that start with -- or -. Mkdocs converts all -- to - for anchor links. -# This breaks all links to cli commands throughout the docs. This will iterate over all -# files in the docs directory and remove -- and - at the start of options, which allows the +# Need to remove all links that start with -- or -. Mkdocs converts all -- to - for anchor links. +# This breaks all links to cli commands throughout the docs. This will iterate over all +# files in the docs directory and remove -- and - at the start of options, which allows the # tool to check links for correctness. 
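Purely as an illustration of the cleanup those comments describe; the exact substitution the script performs is not shown here, so the path and the regex below are assumptions:

import re
from pathlib import Path

for path in Path("docs").glob("**/*.md"):
    text = path.read_text()
    # e.g. an anchor like "#--concurrency-range" becomes "#concurrency-range"
    cleaned = re.sub(r"#-{1,2}(?=\w)", "#", text)
    path.write_text(cleaned)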
for file in `pwd`/docs/* do diff --git a/qa/L0_perf_analyzer_ground_truth/test.sh b/qa/L0_perf_analyzer_ground_truth/test.sh index f01d1a0ec2..d5d78e63f4 100755 --- a/qa/L0_perf_analyzer_ground_truth/test.sh +++ b/qa/L0_perf_analyzer_ground_truth/test.sh @@ -92,7 +92,7 @@ function check_grpc_time { done } -# Create input_data.json to communicate the requested model delay +# Create input_data.json to communicate the requested model delay # $1: desired model delay function create_input_data { echo "{\"data\":[{\"INPUT0\" : [${1}]}]}" > input_data.json @@ -134,7 +134,7 @@ TOLERANCE="0.05" for model_delay in ${MODEL_DELAYS[@]}; do create_input_data ${model_delay} - EXPECTED_RESULT=$(python3 -c "print(1 / ${model_delay})") + EXPECTED_RESULT=$(python3 -c "print(1 / ${model_delay})") for protocol in ${PROTOCOLS}; do for model in ${MODELS}; do echo "================================================================" diff --git a/qa/L0_perf_analyzer_report/test.sh b/qa/L0_perf_analyzer_report/test.sh index c6f3d210f1..7a04905842 100755 --- a/qa/L0_perf_analyzer_report/test.sh +++ b/qa/L0_perf_analyzer_report/test.sh @@ -125,7 +125,7 @@ done sed -i "s/${COMPOSING_MODEL}/${COMPOSING_MODEL_CACHE_ENABLED}/g" "${MODEL_DIR}/${ENSEMBLE_MODEL_CACHE_ENABLED}/config.pbtxt" sed -i "s/${COMPOSING_MODEL}/${COMPOSING_MODEL_CACHE_DISABLED}/g" "${MODEL_DIR}/${ENSEMBLE_MODEL_CACHE_DISABLED}/config.pbtxt" -## Append cache config to each model config +## Append cache config to each model config echo -e "response_cache { enable: True }" >> "${MODEL_DIR}/${ENSEMBLE_MODEL_CACHE_ENABLED}/config.pbtxt" echo -e "response_cache { enable: False }" >> "${MODEL_DIR}/${ENSEMBLE_MODEL_CACHE_DISABLED}/config.pbtxt" echo -e "response_cache { enable: True }" >> "${MODEL_DIR}/${COMPOSING_MODEL_CACHE_ENABLED}/config.pbtxt" diff --git a/qa/L0_perf_kaldi/create_data.sh b/qa/L0_perf_kaldi/create_data.sh old mode 100644 new mode 100755 index 68b32a4099..849b56d906 --- a/qa/L0_perf_kaldi/create_data.sh +++ b/qa/L0_perf_kaldi/create_data.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# Needs to be run in asr_kaldi main directory and must be copied to +# Needs to be run in asr_kaldi main directory and must be copied to # draco for benchmark test TRITON_VERSION="20.05" diff --git a/qa/L0_perf_kaldi/test.sh b/qa/L0_perf_kaldi/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_perf_nomodel/run_test.sh b/qa/L0_perf_nomodel/run_test.sh index 1efe38cc97..b1e2702ecb 100755 --- a/qa/L0_perf_nomodel/run_test.sh +++ b/qa/L0_perf_nomodel/run_test.sh @@ -83,7 +83,7 @@ PERF_CLIENT_PERCENTILE_ARGS="" && PERF_CLIENT_PERCENTILE_ARGS="--percentile=${PERF_CLIENT_PERCENTILE}" PERF_CLIENT_EXTRA_ARGS="$PERF_CLIENT_PERCENTILE_ARGS --shared-memory ${SHARED_MEMORY}" -# Overload use of PERF_CLIENT_PROTOCOL for convenience with existing test and +# Overload use of PERF_CLIENT_PROTOCOL for convenience with existing test and # reporting structure, though "triton_c_api" is not strictly a "protocol". if [[ "${PERF_CLIENT_PROTOCOL}" == "triton_c_api" ]]; then # Server will be run in-process with C API diff --git a/qa/L0_perf_pyclients/simple_perf_client.py b/qa/L0_perf_pyclients/simple_perf_client.py old mode 100644 new mode 100755 index f73f774c27..fd02f94887 --- a/qa/L0_perf_pyclients/simple_perf_client.py +++ b/qa/L0_perf_pyclients/simple_perf_client.py @@ -26,14 +26,13 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
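The ground-truth check above reduces to simple arithmetic: a model that delays each inference by model_delay seconds should sustain roughly 1/model_delay inferences per second, within the 5% TOLERANCE set in the script. A worked instance with made-up numbers:

model_delay = 0.05           # seconds per inference (assumed example value)
measured_throughput = 19.6   # infer/sec reported by perf_analyzer (assumed)

expected_result = 1.0 / model_delay   # 20 infer/sec, as the script computes
tolerance = 0.05                      # TOLERANCE="0.05" above
assert abs(measured_throughput - expected_result) <= expected_result * tolerance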
import argparse -import numpy as np import sys import time +import numpy as np import tritonclient.grpc as grpcclient import tritonclient.http as httpclient -from tritonclient.utils import triton_to_np_dtype -from tritonclient.utils import InferenceServerException +from tritonclient.utils import InferenceServerException, triton_to_np_dtype FLAGS = None @@ -44,47 +43,59 @@ def parse_model_grpc(model_metadata, model_config): by this client. """ if len(model_metadata.inputs) != 1: - raise Exception("expecting 1 input, got {}".format( - len(model_metadata.inputs))) + raise Exception("expecting 1 input, got {}".format(len(model_metadata.inputs))) if len(model_metadata.outputs) != 1: - raise Exception("expecting 1 output, got {}".format( - len(model_metadata.outputs))) + raise Exception( + "expecting 1 output, got {}".format(len(model_metadata.outputs)) + ) if len(model_config.input) != 1: raise Exception( "expecting 1 input in model configuration, got {}".format( - len(model_config.input))) + len(model_config.input) + ) + ) input_metadata = model_metadata.inputs[0] output_metadata = model_metadata.outputs[0] - batch_dim = (model_config.max_batch_size > 0) + batch_dim = model_config.max_batch_size > 0 expected_dims = 1 + (1 if batch_dim else 0) if len(input_metadata.shape) != expected_dims: raise Exception( - "expecting input to have {} dimensions, model '{}' input has {}". - format(expected_dims, model_metadata.name, - len(input_metadata.shape))) + "expecting input to have {} dimensions, model '{}' input has {}".format( + expected_dims, model_metadata.name, len(input_metadata.shape) + ) + ) if len(output_metadata.shape) != expected_dims: raise Exception( - "expecting output to have {} dimensions, model '{}' output has {}". - format(expected_dims, model_metadata.name, - len(output_metadata.shape))) + "expecting output to have {} dimensions, model '{}' output has {}".format( + expected_dims, model_metadata.name, len(output_metadata.shape) + ) + ) if input_metadata.shape[-1] != -1: raise Exception( - "expecting input to have variable shape [-1], model '{}' input has {}" - .format(model_metadata.name, input_metadata.shape)) + "expecting input to have variable shape [-1], model '{}' input has {}".format( + model_metadata.name, input_metadata.shape + ) + ) if output_metadata.shape[-1] != -1: raise Exception( - "expecting output to have variable shape [-1], model '{}' output has {}" - .format(model_metadata.name, output_metadata.shape)) + "expecting output to have variable shape [-1], model '{}' output has {}".format( + model_metadata.name, output_metadata.shape + ) + ) - return (model_config.max_batch_size, input_metadata.name, - output_metadata.name, input_metadata.datatype) + return ( + model_config.max_batch_size, + input_metadata.name, + output_metadata.name, + input_metadata.datatype, + ) def parse_model_http(model_metadata, model_config): @@ -92,151 +103,176 @@ def parse_model_http(model_metadata, model_config): Check the configuration of a model to make sure it is supported by this client. 
""" - if len(model_metadata['inputs']) != 1: - raise Exception("expecting 1 input, got {}".format( - len(model_metadata['inputs']))) - if len(model_metadata['outputs']) != 1: - raise Exception("expecting 1 output, got {}".format( - len(model_metadata['outputs']))) - - if len(model_config['input']) != 1: + if len(model_metadata["inputs"]) != 1: + raise Exception( + "expecting 1 input, got {}".format(len(model_metadata["inputs"])) + ) + if len(model_metadata["outputs"]) != 1: + raise Exception( + "expecting 1 output, got {}".format(len(model_metadata["outputs"])) + ) + + if len(model_config["input"]) != 1: raise Exception( "expecting 1 input in model configuration, got {}".format( - len(model_config['input']))) + len(model_config["input"]) + ) + ) - input_metadata = model_metadata['inputs'][0] - output_metadata = model_metadata['outputs'][0] + input_metadata = model_metadata["inputs"][0] + output_metadata = model_metadata["outputs"][0] max_batch_size = 0 - if 'max_batch_size' in model_config: - max_batch_size = model_config['max_batch_size'] + if "max_batch_size" in model_config: + max_batch_size = model_config["max_batch_size"] - batch_dim = (max_batch_size > 0) + batch_dim = max_batch_size > 0 expected_dims = 1 + (1 if batch_dim else 0) - if len(input_metadata['shape']) != expected_dims: + if len(input_metadata["shape"]) != expected_dims: raise Exception( - "expecting input to have {} dimensions, model '{}' input has {}". - format(expected_dims, model_metadata.name, - len(input_metadata['shape']))) + "expecting input to have {} dimensions, model '{}' input has {}".format( + expected_dims, model_metadata.name, len(input_metadata["shape"]) + ) + ) - if len(output_metadata['shape']) != expected_dims: + if len(output_metadata["shape"]) != expected_dims: raise Exception( - "expecting output to have {} dimensions, model '{}' output has {}". 
- format(expected_dims, model_metadata.name, - len(output_metadata['shape']))) + "expecting output to have {} dimensions, model '{}' output has {}".format( + expected_dims, model_metadata.name, len(output_metadata["shape"]) + ) + ) - if input_metadata['shape'][-1] != -1: + if input_metadata["shape"][-1] != -1: raise Exception( - "expecting input to have variable shape [-1], model '{}' input has {}" - .format(model_metadata.name, input_metadata['shape'])) + "expecting input to have variable shape [-1], model '{}' input has {}".format( + model_metadata.name, input_metadata["shape"] + ) + ) - if output_metadata['shape'][-1] != -1: + if output_metadata["shape"][-1] != -1: raise Exception( - "expecting output to have variable shape [-1], model '{}' output has {}" - .format(model_metadata.name, output_metadata['shape'])) + "expecting output to have variable shape [-1], model '{}' output has {}".format( + model_metadata.name, output_metadata["shape"] + ) + ) - return (max_batch_size, input_metadata['name'], output_metadata['name'], - input_metadata['datatype']) + return ( + max_batch_size, + input_metadata["name"], + output_metadata["name"], + input_metadata["datatype"], + ) def requestGenerator(input_name, input_data, output_name, dtype, protocol): - # Set the input data inputs = [] if protocol.lower() == "grpc": - inputs.append(grpcclient.InferInput(input_name, input_data.shape, - dtype)) + inputs.append(grpcclient.InferInput(input_name, input_data.shape, dtype)) inputs[0].set_data_from_numpy(input_data) else: - inputs.append(httpclient.InferInput(input_name, input_data.shape, - dtype)) + inputs.append(httpclient.InferInput(input_name, input_data.shape, dtype)) inputs[0].set_data_from_numpy(input_data, binary_data=True) outputs = [] if protocol.lower() == "grpc": outputs.append(grpcclient.InferRequestedOutput(output_name)) else: - outputs.append( - httpclient.InferRequestedOutput(output_name, binary_data=True)) + outputs.append(httpclient.InferRequestedOutput(output_name, binary_data=True)) return inputs, outputs -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-m', - '--model-name', - type=str, - required=True, - help='Name of model') parser.add_argument( - '-x', - '--model-version', + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-m", "--model-name", type=str, required=True, help="Name of model" + ) + parser.add_argument( + "-x", + "--model-version", type=str, required=False, default="", - help='Version of model. Default is to use latest version.') - parser.add_argument('-b', - '--batch-size', - type=int, - required=False, - default=1, - help='Batch size. Default is 1.') - parser.add_argument('-s', - '--shape', - type=int, - required=False, - default=1, - help='The shape of the tensor. Default is 1.') - parser.add_argument('-u', - '--url', - type=str, - required=False, - default='localhost:8000', - help='Inference server URL. Default is localhost:8000.') - parser.add_argument('-i', - '--protocol', - type=str, - required=False, - default='HTTP', - help='Protocol (HTTP/gRPC) used to communicate with ' + - 'the inference service. Default is HTTP.') - parser.add_argument('-c', - '--iteration_count', - type=int, - required=False, - default=1000, - help='The number of iterations. 
Default is 1000.') + help="Version of model. Default is to use latest version.", + ) parser.add_argument( - '-w', - '--warmup_count', + "-b", + "--batch-size", + type=int, + required=False, + default=1, + help="Batch size. Default is 1.", + ) + parser.add_argument( + "-s", + "--shape", + type=int, + required=False, + default=1, + help="The shape of the tensor. Default is 1.", + ) + parser.add_argument( + "-u", + "--url", + type=str, + required=False, + default="localhost:8000", + help="Inference server URL. Default is localhost:8000.", + ) + parser.add_argument( + "-i", + "--protocol", + type=str, + required=False, + default="HTTP", + help="Protocol (HTTP/gRPC) used to communicate with " + + "the inference service. Default is HTTP.", + ) + parser.add_argument( + "-c", + "--iteration_count", + type=int, + required=False, + default=1000, + help="The number of iterations. Default is 1000.", + ) + parser.add_argument( + "-w", + "--warmup_count", type=int, required=False, default=500, - help='The number of warm-up iterations. Default is 500.') + help="The number of warm-up iterations. Default is 500.", + ) parser.add_argument( - '--csv', + "--csv", type=str, required=False, default=None, - help='The name of the file to store the results in CSV format') + help="The name of the file to store the results in CSV format", + ) FLAGS = parser.parse_args() try: if FLAGS.protocol.lower() == "grpc": # Create gRPC client for communicating with the server triton_client = grpcclient.InferenceServerClient( - url=FLAGS.url, verbose=FLAGS.verbose) + url=FLAGS.url, verbose=FLAGS.verbose + ) else: triton_client = httpclient.InferenceServerClient( - url=FLAGS.url, verbose=FLAGS.verbose, concurrency=1) + url=FLAGS.url, verbose=FLAGS.verbose, concurrency=1 + ) except Exception as e: print("client creation failed: " + str(e)) sys.exit(1) @@ -245,7 +281,8 @@ def requestGenerator(input_name, input_data, output_name, dtype, protocol): # properties of the model that we need for preprocessing try: model_metadata = triton_client.get_model_metadata( - model_name=FLAGS.model_name, model_version=FLAGS.model_version) + model_name=FLAGS.model_name, model_version=FLAGS.model_version + ) except InferenceServerException as e: print("failed to retrieve the metadata: " + str(e)) sys.exit(1) @@ -254,36 +291,41 @@ def requestGenerator(input_name, input_data, output_name, dtype, protocol): # properties of the model that we need for preprocessing try: model_metadata = triton_client.get_model_metadata( - model_name=FLAGS.model_name, model_version=FLAGS.model_version) + model_name=FLAGS.model_name, model_version=FLAGS.model_version + ) except InferenceServerException as e: print("failed to retrieve the metadata: " + str(e)) sys.exit(1) try: model_config = triton_client.get_model_config( - model_name=FLAGS.model_name, model_version=FLAGS.model_version) + model_name=FLAGS.model_name, model_version=FLAGS.model_version + ) except InferenceServerException as e: print("failed to retrieve the config: " + str(e)) sys.exit(1) if FLAGS.protocol.lower() == "grpc": max_batch_size, input_name, output_name, dtype = parse_model_grpc( - model_metadata, model_config.config) + model_metadata, model_config.config + ) else: max_batch_size, input_name, output_name, dtype = parse_model_http( - model_metadata, model_config) + model_metadata, model_config + ) - input_data = np.zeros([FLAGS.batch_size, FLAGS.shape], - dtype=triton_to_np_dtype(dtype)) + input_data = np.zeros( + [FLAGS.batch_size, FLAGS.shape], dtype=triton_to_np_dtype(dtype) + ) # 
--------------------------- Warm-Up -------------------------------------------------------- for i in range(FLAGS.warmup_count): - inputs, outputs = requestGenerator(input_name, input_data, output_name, - dtype, FLAGS.protocol.lower()) - triton_client.infer(FLAGS.model_name, - inputs, - model_version=FLAGS.model_version, - outputs=outputs) + inputs, outputs = requestGenerator( + input_name, input_data, output_name, dtype, FLAGS.protocol.lower() + ) + triton_client.infer( + FLAGS.model_name, inputs, model_version=FLAGS.model_version, outputs=outputs + ) latencies = [] @@ -293,12 +335,12 @@ def requestGenerator(input_name, input_data, output_name, dtype, protocol): for i in range(FLAGS.iteration_count): t0 = time.time() - inputs, outputs = requestGenerator(input_name, input_data, output_name, - dtype, FLAGS.protocol.lower()) - triton_client.infer(FLAGS.model_name, - inputs, - model_version=FLAGS.model_version, - outputs=outputs) + inputs, outputs = requestGenerator( + input_name, input_data, output_name, dtype, FLAGS.protocol.lower() + ) + triton_client.infer( + FLAGS.model_name, inputs, model_version=FLAGS.model_version, outputs=outputs + ) latencies.append(time.time() - t0) end_time = time.time() @@ -321,12 +363,17 @@ def requestGenerator(input_name, input_data, output_name, dtype, protocol): # --------------------------- Write CSV -------------------------------------------------------- if FLAGS.csv != None: - file = open(FLAGS.csv, 'w') + file = open(FLAGS.csv, "w") file.write( "Concurrency,Inferences/Second,p50 latency,p90 latency,p95 latency,p99 latency\n" ) - file.write("1,{},{},{},{},{}".format(throughput, p50_latency * 1000, - p90_latency * 1000, - p95_latency * 1000, - p99_latency * 1000)) + file.write( + "1,{},{},{},{},{}".format( + throughput, + p50_latency * 1000, + p90_latency * 1000, + p95_latency * 1000, + p99_latency * 1000, + ) + ) file.close() diff --git a/qa/L0_perf_resnet/run_test.sh b/qa/L0_perf_resnet/run_test.sh index bbd9b33c42..579d00c0e5 100755 --- a/qa/L0_perf_resnet/run_test.sh +++ b/qa/L0_perf_resnet/run_test.sh @@ -63,7 +63,7 @@ if [ "$ARCH" == "aarch64" ]; then fi fi -# Overload use of PERF_CLIENT_PROTOCOL for convenience with existing test and +# Overload use of PERF_CLIENT_PROTOCOL for convenience with existing test and # reporting structure, though "triton_c_api" is not strictly a "protocol". 
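A condensed sketch of the measurement logic in simple_perf_client.py above: per-request wall-clock latencies are collected in a loop, then reduced to throughput plus latency percentiles. The script's own percentile selection is not reproduced here, so np.percentile stands in for it:

import time

import numpy as np


def measure(run_once, iteration_count=1000):
    latencies = []
    start = time.time()
    for _ in range(iteration_count):
        t0 = time.time()
        run_once()                      # one blocking infer call
        latencies.append(time.time() - t0)
    elapsed = time.time() - start

    throughput = iteration_count / elapsed  # infer/sec
    p50, p90, p95, p99 = np.percentile(latencies, [50, 90, 95, 99])
    # Report latencies in milliseconds, as the CSV above does.
    return throughput, p50 * 1000, p90 * 1000, p95 * 1000, p99 * 1000


# Example with a dummy workload standing in for triton_client.infer(...):
print(measure(lambda: time.sleep(0.001), iteration_count=100))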
if [[ "${PERF_CLIENT_PROTOCOL}" == "triton_c_api" ]]; then # Server will be run in-process with C API diff --git a/qa/L0_query/query_e2e.py b/qa/L0_query/query_e2e.py old mode 100644 new mode 100755 index 9e301002a1..048a4a8d41 --- a/qa/L0_query/query_e2e.py +++ b/qa/L0_query/query_e2e.py @@ -27,23 +27,23 @@ import sys -sys.path.append('../common') +sys.path.append("../common") + +import unittest import numpy as np -import tritonclient.http as tritonhttpclient +import test_util as tu import tritonclient.grpc as tritongrpcclient +import tritonclient.http as tritonhttpclient from tritonclient.utils import InferenceServerException from tritonclient.utils import cuda_shared_memory as cudashm -import unittest -import test_util as tu class QueryTest(tu.TestResultCollector): - def test_http(self): triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") inputs = [] - inputs.append(tritonhttpclient.InferInput('INPUT', [1], "UINT8")) + inputs.append(tritonhttpclient.InferInput("INPUT", [1], "UINT8")) inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) try: @@ -56,33 +56,33 @@ def test_http(self): def test_http_shared_memory(self): triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") inputs = [] - inputs.append(tritonhttpclient.InferInput('INPUT', [1], "UINT8")) + inputs.append(tritonhttpclient.InferInput("INPUT", [1], "UINT8")) inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) # Set up CUDA shared memory for outputs triton_client.unregister_system_shared_memory() triton_client.unregister_cuda_shared_memory() - shm_op0_handle = cudashm.create_shared_memory_region( - "output0_data", 4, 0) - shm_op1_handle = cudashm.create_shared_memory_region( - "output1_data", 4, 0) + shm_op0_handle = cudashm.create_shared_memory_region("output0_data", 4, 0) + shm_op1_handle = cudashm.create_shared_memory_region("output1_data", 4, 0) triton_client.register_cuda_shared_memory( - "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 4) + "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 4 + ) triton_client.register_cuda_shared_memory( - "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 4) + "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 4 + ) outputs = [] outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) outputs[-1].set_shared_memory("output0_data", 4) outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) outputs[-1].set_shared_memory("output1_data", 4) try: - triton_client.infer(model_name="query", - inputs=inputs, - outputs=outputs) + triton_client.infer(model_name="query", inputs=inputs, outputs=outputs) self.assertTrue(False, "expect error with query information") except InferenceServerException as ex: self.assertTrue("OUTPUT0 GPU 0" in ex.message()) @@ -96,34 +96,34 @@ def test_http_shared_memory(self): def test_http_out_of_shared_memory(self): triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") inputs = [] - inputs.append(tritonhttpclient.InferInput('INPUT', [1], "UINT8")) + inputs.append(tritonhttpclient.InferInput("INPUT", [1], "UINT8")) inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) # Set up too small CUDA shared memory for outputs, expect query # returns default value triton_client.unregister_system_shared_memory() triton_client.unregister_cuda_shared_memory() - shm_op0_handle 
= cudashm.create_shared_memory_region( - "output0_data", 1, 0) - shm_op1_handle = cudashm.create_shared_memory_region( - "output1_data", 1, 0) + shm_op0_handle = cudashm.create_shared_memory_region("output0_data", 1, 0) + shm_op1_handle = cudashm.create_shared_memory_region("output1_data", 1, 0) triton_client.register_cuda_shared_memory( - "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1) + "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1 + ) triton_client.register_cuda_shared_memory( - "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1) + "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1 + ) outputs = [] outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) outputs[-1].set_shared_memory("output0_data", 1) outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) outputs[-1].set_shared_memory("output1_data", 1) try: - triton_client.infer(model_name="query", - inputs=inputs, - outputs=outputs) + triton_client.infer(model_name="query", inputs=inputs, outputs=outputs) self.assertTrue(False, "expect error with query information") except InferenceServerException as ex: self.assertTrue("OUTPUT0 CPU 0" in ex.message()) @@ -137,7 +137,7 @@ def test_http_out_of_shared_memory(self): def test_grpc(self): triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") inputs = [] - inputs.append(tritongrpcclient.InferInput('INPUT', [1], "UINT8")) + inputs.append(tritongrpcclient.InferInput("INPUT", [1], "UINT8")) inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) try: @@ -150,31 +150,29 @@ def test_grpc(self): def test_grpc_shared_memory(self): triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") inputs = [] - inputs.append(tritongrpcclient.InferInput('INPUT', [1], "UINT8")) + inputs.append(tritongrpcclient.InferInput("INPUT", [1], "UINT8")) inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) # Set up CUDA shared memory for outputs triton_client.unregister_system_shared_memory() triton_client.unregister_cuda_shared_memory() - shm_op0_handle = cudashm.create_shared_memory_region( - "output0_data", 4, 0) - shm_op1_handle = cudashm.create_shared_memory_region( - "output1_data", 4, 0) + shm_op0_handle = cudashm.create_shared_memory_region("output0_data", 4, 0) + shm_op1_handle = cudashm.create_shared_memory_region("output1_data", 4, 0) triton_client.register_cuda_shared_memory( - "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 4) + "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 4 + ) triton_client.register_cuda_shared_memory( - "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 4) + "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 4 + ) outputs = [] - outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0')) + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0")) outputs[-1].set_shared_memory("output0_data", 4) - outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1')) + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1")) outputs[-1].set_shared_memory("output1_data", 4) try: - triton_client.infer(model_name="query", - inputs=inputs, - outputs=outputs) + triton_client.infer(model_name="query", inputs=inputs, outputs=outputs) self.assertTrue(False, "expect error with query information") except InferenceServerException as ex: 
self.assertTrue("OUTPUT0 GPU 0" in ex.message()) @@ -188,32 +186,30 @@ def test_grpc_shared_memory(self): def test_grpc_out_of_shared_memory(self): triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") inputs = [] - inputs.append(tritongrpcclient.InferInput('INPUT', [1], "UINT8")) + inputs.append(tritongrpcclient.InferInput("INPUT", [1], "UINT8")) inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) # Set up too small CUDA shared memory for outputs, expect query # returns default value triton_client.unregister_system_shared_memory() triton_client.unregister_cuda_shared_memory() - shm_op0_handle = cudashm.create_shared_memory_region( - "output0_data", 1, 0) - shm_op1_handle = cudashm.create_shared_memory_region( - "output1_data", 1, 0) + shm_op0_handle = cudashm.create_shared_memory_region("output0_data", 1, 0) + shm_op1_handle = cudashm.create_shared_memory_region("output1_data", 1, 0) triton_client.register_cuda_shared_memory( - "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1) + "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1 + ) triton_client.register_cuda_shared_memory( - "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1) + "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1 + ) outputs = [] - outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0')) + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0")) outputs[-1].set_shared_memory("output0_data", 1) - outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1')) + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1")) outputs[-1].set_shared_memory("output1_data", 1) try: - triton_client.infer(model_name="query", - inputs=inputs, - outputs=outputs) + triton_client.infer(model_name="query", inputs=inputs, outputs=outputs) self.assertTrue(False, "expect error with query information") except InferenceServerException as ex: self.assertTrue("OUTPUT0 CPU 0" in ex.message()) @@ -225,5 +221,5 @@ def test_grpc_out_of_shared_memory(self): triton_client.unregister_cuda_shared_memory() -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_query/test.sh b/qa/L0_query/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_rate_limiter/rate_limiter_test.py b/qa/L0_rate_limiter/rate_limiter_test.py old mode 100644 new mode 100755 index c02c50b61e..4bc7b82e70 --- a/qa/L0_rate_limiter/rate_limiter_test.py +++ b/qa/L0_rate_limiter/rate_limiter_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,11 +31,12 @@ sys.path.append("../common") import functools -import numpy as np import os -import unittest import threading import time +import unittest + +import numpy as np import sequence_util as su import tritongrpcclient as grpcclient from tritonclientutils import * @@ -46,7 +49,6 @@ class AsyncGrpcRunner: - def __init__(self, tester, server_url, model_name, delay_ms): self._tester = tester self._server_url = server_url @@ -79,18 +81,17 @@ def req_loop(self): client = grpcclient.InferenceServerClient(self._server_url) inputs = [ - grpcclient.InferInput("INPUT0", self._shape, - np_to_triton_dtype(self._dtype)) + grpcclient.InferInput( + "INPUT0", self._shape, np_to_triton_dtype(self._dtype) + ) ] self._inflight_requests = 0 - start_stat = client.get_inference_statistics( - model_name=self._model_name) + start_stat = client.get_inference_statistics(model_name=self._model_name) global _exit_signal while not _exit_signal: - input_numpy = np.random.random_sample(self._shape).astype( - self._dtype) + input_numpy = np.random.random_sample(self._shape).astype(self._dtype) inputs[0].set_data_from_numpy(input_numpy) self._input_data.append(input_numpy) @@ -99,12 +100,15 @@ def req_loop(self): def _check_can_send(): return self._inflight_requests < _inference_concurrency - can_send = self._sync.wait_for(_check_can_send, - timeout=_response_wait_time_s) + can_send = self._sync.wait_for( + _check_can_send, timeout=_response_wait_time_s + ) self._tester.assertTrue( can_send, "client didn't receive a response within {}s".format( - _response_wait_time_s)) + _response_wait_time_s + ), + ) callback = functools.partial(AsyncGrpcRunner._on_result, self) client.async_infer( @@ -115,7 +119,7 @@ def _check_can_send(): ) self._inflight_requests += 1 self._num_sent_request += 1 - if (self._num_sent_request == _inference_count): + if self._num_sent_request == _inference_count: _exit_signal = True time.sleep(self._delay_ms / 1000.0) @@ -125,17 +129,21 @@ def _check_can_send(): def _all_processed(): return self._inflight_requests == 0 - self._processed_all = self._sync.wait_for(_all_processed, - _finish_wait_time_s) + self._processed_all = self._sync.wait_for( + _all_processed, _finish_wait_time_s + ) self._tester.assertTrue( self._processed_all, - "the processing didn't complete even after waiting for {}s". 
- format(_finish_wait_time_s)) + "the processing didn't complete even after waiting for {}s".format( + _finish_wait_time_s + ), + ) end_stat = client.get_inference_statistics(model_name=self._model_name) - self._processed_request_count = end_stat.model_stats[ - 0].inference_stats.success.count - start_stat.model_stats[ - 0].inference_stats.success.count + self._processed_request_count = ( + end_stat.model_stats[0].inference_stats.success.count + - start_stat.model_stats[0].inference_stats.success.count + ) def start(self): self._req_thread.start() @@ -144,13 +152,15 @@ def _validate_run(self): if len(self._errors) != 0: raise self._errors[0] self._tester.assertEqual( - len(self._input_data), len(self._results.keys()), - "the number of inputs and output should match") + len(self._input_data), + len(self._results.keys()), + "the number of inputs and output should match", + ) for i in range(len(self._input_data)): self._tester.assertFalse( - (self._input_data[i] != - self._results[i].as_numpy('OUTPUT0')).any(), - "the output data should match with the input data") + (self._input_data[i] != self._results[i].as_numpy("OUTPUT0")).any(), + "the output data should match with the input data", + ) def join(self): self._req_thread.join() @@ -158,17 +168,16 @@ def join(self): class RateLimiterTest(su.SequenceBatcherTestUtil): - def stress_models(self, model_names, delay_ms=0): infer_counts = {} try: runners = [] for model_name in model_names: runners.append( - AsyncGrpcRunner(self, - "localhost:8001", - model_name, - delay_ms=delay_ms)) + AsyncGrpcRunner( + self, "localhost:8001", model_name, delay_ms=delay_ms + ) + ) for r in runners: r.start() for r in runners: @@ -191,7 +200,7 @@ def test_single_model(self): def test_cross_model_prioritization_limited_resource(self): # Sends requests to two models, one operating at # priority of 1 and other at 2 respectively. - # The availabe resource counts doesn't allow models + # The available resource counts doesn't allow models # to execute simultaneously. model_names = ["custom_zero_1_float32", "custom_zero_1_float32_v2"] @@ -199,32 +208,36 @@ def test_cross_model_prioritization_limited_resource(self): # TODO: Validate the priority and resource counts are set correctly infer_counts = self.stress_models(model_names) - infer_ratio = infer_counts[model_names[0]] / float( - infer_counts[model_names[1]]) + infer_ratio = infer_counts[model_names[0]] / float(infer_counts[model_names[1]]) self.assertGreater( - infer_ratio, 1.80, + infer_ratio, + 1.80, "Got infer ratio across models {}, expected closer to 2".format( - infer_ratio)) + infer_ratio + ), + ) def test_cross_model_prioritization_plenty_resource(self): # Sends requests to two models, one operating at # priority of 1 and other at 2 respectively. - # The availabe resource counts wll allow both models - # to run simulataneously. + # The available resource counts wll allow both models + # to run simultaneously. model_names = ["custom_zero_1_float32", "custom_zero_1_float32_v2"] # TODO: Validate the priority and resource counts are set correctly infer_counts = self.stress_models(model_names) - infer_diff = abs(infer_counts[model_names[0]] - - infer_counts[model_names[1]]) + infer_diff = abs(infer_counts[model_names[0]] - infer_counts[model_names[1]]) self.assertGreater( - 10, infer_diff, - "Got infer difference between models {}, expected closer to 0". 
- format(infer_diff)) + 10, + infer_diff, + "Got infer difference between models {}, expected closer to 0".format( + infer_diff + ), + ) def test_single_model_dynamic_batching(self): # Send all the inference requests with a delay to a model @@ -242,18 +255,25 @@ def test_single_model_dynamic_batching(self): batch_stats = stats.model_stats[0].batch_stats self.assertEqual( - len(batch_stats), 1, - "expected single batch-size, got {}".format(len(batch_stats))) + len(batch_stats), + 1, + "expected single batch-size, got {}".format(len(batch_stats)), + ) for batch_stat in batch_stats: self.assertEqual( - batch_stat.batch_size, 4, - "unexpected batch-size {}".format(batch_stat.batch_size)) + batch_stat.batch_size, + 4, + "unexpected batch-size {}".format(batch_stat.batch_size), + ) # Get count from one of the stats self.assertEqual( - batch_stat.compute_infer.count, _inference_count / 4, - "expected model-execution-count {} for batch size {}, got {}". - format(_inference_count / 4, 4, batch_stat.compute_infer.count)) + batch_stat.compute_infer.count, + _inference_count / 4, + "expected model-execution-count {} for batch size {}, got {}".format( + _inference_count / 4, 4, batch_stat.compute_infer.count + ), + ) def test_single_model_sequence_batching(self): # Send one sequence and check for correct accumulator @@ -265,19 +285,26 @@ def test_single_model_sequence_batching(self): model_name = "custom_sequence_int32" self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) self.check_sequence( - 'custom', + "custom", model_name, np.int32, 5, (4000, None), # (flag_str, value, (ls_ms, gt_ms), (pre_delay, post_delay)) - (("start", 1, None, None), (None, 2, None, None), - (None, 3, None, None), (None, 4, None, None), - (None, 5, None, None), (None, 6, None, None), - (None, 7, None, None), (None, 8, None, None), - ("end", 9, None, None)), + ( + ("start", 1, None, None), + (None, 2, None, None), + (None, 3, None, None), + (None, 4, None, None), + (None, 5, None, None), + (None, 6, None, None), + (None, 7, None, None), + (None, 8, None, None), + ("end", 9, None, None), + ), 45, - 'grpc') + "grpc", + ) self.check_deferred_exception() self.check_status(model_name, {1: 9}, 9, 9) @@ -285,5 +312,5 @@ def test_single_model_sequence_batching(self): self.assertTrue(False, "unexpected error {}".format(ex)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_rate_limiter/test.sh b/qa/L0_rate_limiter/test.sh old mode 100644 new mode 100755 index 0de1553908..334af99e4c --- a/qa/L0_rate_limiter/test.sh +++ b/qa/L0_rate_limiter/test.sh @@ -278,7 +278,7 @@ kill $SERVER_PID wait $SERVER_PID ## -## Tests with mulitple instances of the same model +## Tests with multiple instances of the same model ## # Replace the second model with a second instance with same resource requirements and priority. # TODO: Currently there is no way to check which instance got to run inferences hence we only diff --git a/qa/L0_register/test.sh b/qa/L0_register/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_repoagent_checksum/identity_test.py b/qa/L0_repoagent_checksum/identity_test.py old mode 100644 new mode 100755 index ad9f268967..4db55e0d45 --- a/qa/L0_repoagent_checksum/identity_test.py +++ b/qa/L0_repoagent_checksum/identity_test.py @@ -27,40 +27,43 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
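The AsyncGrpcRunner above throttles itself with a condition variable: the request loop blocks once the configured number of requests is in flight, and each gRPC callback wakes it up. A stripped-down sketch of that pattern, with a placeholder limit and no actual client calls:

import threading


class InflightLimiter:
    def __init__(self, max_inflight):
        self._max = max_inflight
        self._inflight = 0
        self._cv = threading.Condition()

    def acquire(self, timeout=10):
        # Block until there is room for another outstanding request.
        with self._cv:
            ok = self._cv.wait_for(lambda: self._inflight < self._max, timeout)
            if not ok:
                raise TimeoutError("no response within {}s".format(timeout))
            self._inflight += 1

    def release(self):
        # Called from the async callback when a response (or error) arrives.
        with self._cv:
            self._inflight -= 1
            self._cv.notify()


limiter = InflightLimiter(max_inflight=4)
limiter.acquire()   # before client.async_infer(...)
limiter.release()   # inside the completion callback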
import argparse -import numpy as np import sys + +import numpy as np import tritongrpcclient as grpcclient import tritonhttpclient as httpclient from tritonclientutils import np_to_triton_dtype FLAGS = None -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') parser.add_argument( - '-i', - '--protocol', + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-u", "--url", type=str, required=False, help="Inference server URL." + ) + parser.add_argument( + "-i", + "--protocol", type=str, required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') + default="http", + help='Protocol ("http"/"grpc") used to ' + + 'communicate with inference service. Default is "http".', + ) FLAGS = parser.parse_args() if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) + print( + 'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol) + ) exit(1) client_util = httpclient if FLAGS.protocol == "http" else grpcclient @@ -69,23 +72,23 @@ FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" # Reuse a single client for all sync tests - with client_util.InferenceServerClient(FLAGS.url, - verbose=FLAGS.verbose) as client: + with client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose) as client: for model_name, np_dtype, shape in ( - # yapf: disable + # yapf: disable ("identity_int32", np.int32, [0]), - ("identity_int32", np.int32, [7])): + ("identity_int32", np.int32, [7]) + ): # yapf: enable if np_dtype != object: input_data = (16384 * np.random.randn(*shape)).astype(np_dtype) else: - in0 = (16384 * np.ones(shape, dtype='int')) - in0n = np.array([str(x) for x in in0.reshape(in0.size)], - dtype=object) + in0 = 16384 * np.ones(shape, dtype="int") + in0n = np.array([str(x) for x in in0.reshape(in0.size)], dtype=object) input_data = in0n.reshape(in0.shape) inputs = [ - client_util.InferInput("INPUT0", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + client_util.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) @@ -102,6 +105,9 @@ output_data = np.char.decode(output_data) if not np.array_equal(output_data, input_data): - print("error: expected output {} to match input {}".format( - output_data, input_data)) + print( + "error: expected output {} to match input {}".format( + output_data, input_data + ) + ) sys.exit(1) diff --git a/qa/L0_response_cache/test.sh b/qa/L0_response_cache/test.sh index 8ec610b065..434195b693 100755 --- a/qa/L0_response_cache/test.sh +++ b/qa/L0_response_cache/test.sh @@ -71,8 +71,8 @@ function stop_redis() { } function set_redis_auth() { - # NOTE: Per-user auth [Access Control List (ACL)] is only supported in - # Redis >= 6.0 and is more comprehensive in what can be configured. + # NOTE: Per-user auth [Access Control List (ACL)] is only supported in + # Redis >= 6.0 and is more comprehensive in what can be configured. 
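A rough Python equivalent of the auth helpers above, shelling out to redis-cli to toggle the server-wide password the comment mentions. The environment variable names mirror the script; the unset step and the fallback values are assumptions:

import os
import subprocess

host = os.environ.get("TRITON_REDIS_HOST", "localhost")
port = os.environ.get("TRITON_REDIS_PORT", "6379")
password = os.environ.get("REDIS_PW", "redis-pass")  # placeholder default


def set_redis_auth():
    # Mirrors: redis-cli -h HOST -p PORT config set requirepass PW
    subprocess.run(
        ["redis-cli", "-h", host, "-p", port,
         "config", "set", "requirepass", password],
        check=True,
    )


def unset_redis_auth():
    # Once auth is enabled, later commands must authenticate first.
    subprocess.run(
        ["redis-cli", "-h", host, "-p", port, "-a", password,
         "config", "set", "requirepass", ""],
        check=True,
    )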
# For simplicity and wider range of Redis version support, use # server-wide password via "requirepass" for now. redis-cli -h "${TRITON_REDIS_HOST}" -p "${TRITON_REDIS_PORT}" config set requirepass "${REDIS_PW}" @@ -88,7 +88,7 @@ function unset_redis_auth() { # UNIT TESTS set +e -## Unit tests currently run for both Local and Redis cache implementaitons +## Unit tests currently run for both Local and Redis cache implementations ## by default. However, we could break out the unit tests for each ## into separate runs gtest filters if needed in the future: ## - `${UNIT_TEST} --gtest_filter=*Local*` @@ -130,7 +130,7 @@ function check_server_expected_failure { else # Check that server fails with the correct error message set +e - grep -i "${EXPECTED_MESSAGE}" ${SERVER_LOG} + grep -i "${EXPECTED_MESSAGE}" ${SERVER_LOG} if [ $? -ne 0 ]; then echo -e "\n***\n*** Failed: Expected [${EXPECTED_MESSAGE}] error message in output\n***" cat $SERVER_LOG diff --git a/qa/L0_sagemaker/sagemaker_multi_model_test.py b/qa/L0_sagemaker/sagemaker_multi_model_test.py old mode 100644 new mode 100755 index 06cd48edd7..b2052f6751 --- a/qa/L0_sagemaker/sagemaker_multi_model_test.py +++ b/qa/L0_sagemaker/sagemaker_multi_model_test.py @@ -29,58 +29,80 @@ sys.path.append("../common") +import json import os +import sys import time import unittest + import numpy as np +import requests import test_util as tu import tritonclient.http as httpclient -import json -import os -import requests -import sys - class SageMakerMultiModelTest(tu.TestResultCollector): - def setUp(self): - SAGEMAKER_BIND_TO_PORT = os.getenv("SAGEMAKER_BIND_TO_PORT", "8080") - self.url_mme_ = "http://localhost:{}/models".format( - SAGEMAKER_BIND_TO_PORT) + self.url_mme_ = "http://localhost:{}/models".format(SAGEMAKER_BIND_TO_PORT) # model_1 setup self.model1_name = "sm_mme_model_1" self.model1_url = "/opt/ml/models/123456789abcdefghi/model" - self.model1_input_data_ = [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - ] + self.model1_input_data_ = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] self.model1_expected_output0_data_ = [ - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + 0, + 2, + 4, + 6, + 8, + 10, + 12, + 14, + 16, + 18, + 20, + 22, + 24, + 26, + 28, + 30, ] self.model1_expected_output1_data_ = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, ] self.model1_expected_result_ = { - "model_name": - "sm_mme_model_1", - "model_version": - "1", + "model_name": "sm_mme_model_1", + "model_version": "1", "outputs": [ { "name": "OUTPUT0", "datatype": "INT32", "shape": [1, 16], - "data": self.model1_expected_output0_data_ + "data": self.model1_expected_output0_data_, }, { "name": "OUTPUT1", "datatype": "INT32", "shape": [1, 16], - "data": self.model1_expected_output1_data_ + "data": self.model1_expected_output1_data_, }, ], } @@ -97,42 +119,45 @@ def setUp(self): self.model3_url = "/opt/ml/models/123456789ensemble/model" def test_sm_0_environment_variables_set(self): - self.assertEqual(os.getenv("SAGEMAKER_MULTI_MODEL"), "true", - "Variable SAGEMAKER_MULTI_MODEL must be set to true") + self.assertEqual( + os.getenv("SAGEMAKER_MULTI_MODEL"), + "true", + "Variable SAGEMAKER_MULTI_MODEL must be set to true", + ) def test_sm_1_model_load(self): # Load model_1 request_body = {"model_name": self.model1_name, "url": self.model1_url} headers = {"Content-Type": "application/json"} - r = requests.post(self.url_mme_, - data=json.dumps(request_body), - 
headers=headers) + r = requests.post(self.url_mme_, data=json.dumps(request_body), headers=headers) time.sleep(5) # wait for model to load self.assertEqual( - r.status_code, 200, - "Expected status code 200, received {}".format(r.status_code)) + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) # Load the same model again, expect a 409 request_body = {"model_name": self.model1_name, "url": self.model1_url} headers = {"Content-Type": "application/json"} - r = requests.post(self.url_mme_, - data=json.dumps(request_body), - headers=headers) + r = requests.post(self.url_mme_, data=json.dumps(request_body), headers=headers) time.sleep(5) # wait for model to load self.assertEqual( - r.status_code, 409, - "Expected status code 409, received {}".format(r.status_code)) + r.status_code, + 409, + "Expected status code 409, received {}".format(r.status_code), + ) # Load model_2 request_body = {"model_name": self.model2_name, "url": self.model2_url} headers = {"Content-Type": "application/json"} - r = requests.post(self.url_mme_, - data=json.dumps(request_body), - headers=headers) + r = requests.post(self.url_mme_, data=json.dumps(request_body), headers=headers) time.sleep(5) # wait for model to load self.assertEqual( - r.status_code, 200, - "Expected status code 200, received {}".format(r.status_code)) + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) def test_sm_2_model_list(self): r = requests.get(self.url_mme_) @@ -141,11 +166,11 @@ def test_sm_2_model_list(self): "models": [ { "modelName": self.model1_name, - "modelUrl": self.model1_url.rstrip("/model") + "modelUrl": self.model1_url.rstrip("/model"), }, { "modelName": self.model2_name, - "modelUrl": self.model2_url.rstrip("/model") + "modelUrl": self.model2_url.rstrip("/model"), }, ] } @@ -153,11 +178,11 @@ def test_sm_2_model_list(self): "models": [ { "modelName": self.model2_name, - "modelUrl": self.model2_url.rstrip("/model") + "modelUrl": self.model2_url.rstrip("/model"), }, { "modelName": self.model1_name, - "modelUrl": self.model1_url.rstrip("/model") + "modelUrl": self.model1_url.rstrip("/model"), }, ] } @@ -167,7 +192,8 @@ def test_sm_2_model_list(self): r.json(), [expected_response_1, expected_response_2], "Expected one of {}, received: {}".format( - [expected_response_1, expected_response_2], r.json()), + [expected_response_1, expected_response_2], r.json() + ), ) def test_sm_3_model_get(self): @@ -176,12 +202,13 @@ def test_sm_3_model_get(self): time.sleep(3) expected_response = { "modelName": self.model1_name, - "modelUrl": self.model1_url.rstrip("/model") + "modelUrl": self.model1_url.rstrip("/model"), } self.assertEqual( - r.json(), expected_response, - "Expected response: {}, received: {}".format( - expected_response, r.json())) + r.json(), + expected_response, + "Expected response: {}, received: {}".format(expected_response, r.json()), + ) def test_sm_4_model_invoke(self): # Invoke model_1 @@ -196,12 +223,11 @@ def test_sm_4_model_invoke(self): inputs[0].set_data_from_numpy(input_data, binary_data=False) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) request_body, _ = 
httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + inputs, outputs=outputs + ) headers = {"Content-Type": "application/json"} invoke_url = "{}/{}/invoke".format(self.url_mme_, self.model1_name) @@ -212,47 +238,56 @@ def test_sm_4_model_invoke(self): self.model1_expected_result_, r.json(), "Expected response : {}, received: {}".format( - self.model1_expected_result_, r.json()), + self.model1_expected_result_, r.json() + ), ) # Invoke model_2 inputs = [] outputs = [] - inputs.append(httpclient.InferInput( - "INPUT0", - [1, 8], - "FP32", - )) + inputs.append( + httpclient.InferInput( + "INPUT0", + [1, 8], + "FP32", + ) + ) input_data = np.array(self.model2_input_data_, dtype=np.float32) input_data = np.expand_dims(input_data, axis=0) inputs[0].set_data_from_numpy(input_data, binary_data=True) - outputs.append( - httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) invoke_url = "{}/{}/invoke".format(self.url_mme_, self.model2_name) headers = { - "Content-Type": - "application/vnd.sagemaker-triton.binary+json;json-header-size={}" - .format(header_length) + "Content-Type": "application/vnd.sagemaker-triton.binary+json;json-header-size={}".format( + header_length + ) } r = requests.post(invoke_url, data=request_body, headers=headers) - header_length_prefix = "application/vnd.sagemaker-triton.binary+json;json-header-size=" - header_length_str = r.headers["Content-Type"][len(header_length_prefix - ):] + header_length_prefix = ( + "application/vnd.sagemaker-triton.binary+json;json-header-size=" + ) + header_length_str = r.headers["Content-Type"][len(header_length_prefix) :] result = httpclient.InferenceServerClient.parse_response_body( - r._content, header_length=int(header_length_str)) + r._content, header_length=int(header_length_str) + ) # Get the inference header size so we can locate the output binary data output_data = result.as_numpy("OUTPUT0") for i in range(8): - self.assertEqual(output_data[0][i], input_data[0][i], - "Tensor Value Mismatch") + self.assertEqual( + output_data[0][i], input_data[0][i], "Tensor Value Mismatch" + ) def test_sm_5_model_unload(self): # Unload model_1 @@ -260,39 +295,45 @@ def test_sm_5_model_unload(self): r = requests.delete(unload_url) time.sleep(3) self.assertEqual( - r.status_code, 200, - "Expected status code 200, received {}".format(r.status_code)) + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) # Unload model_2 unload_url = "{}/{}".format(self.url_mme_, self.model2_name) r = requests.delete(unload_url) time.sleep(3) self.assertEqual( - r.status_code, 200, - "Expected status code 200, received {}".format(r.status_code)) + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) # Unload a non-loaded model, expect a 404 unload_url = "{}/sm_non_loaded_model".format(self.url_mme_) r = requests.delete(unload_url) time.sleep(3) self.assertEqual( - r.status_code, 404, - "Expected status code 404, received {}".format(r.status_code)) + r.status_code, + 404, + "Expected status code 404, received {}".format(r.status_code), + ) def test_sm_6_ensemble_model(self): # Load ensemble model request_body = {"model_name": self.model3_name, 
"url": self.model3_url} headers = { "Content-Type": "application/json", - "X-Amzn-SageMaker-Target-Model": f"{self.model3_name}" + "X-Amzn-SageMaker-Target-Model": f"{self.model3_name}", } - r = requests.post(self.url_mme_, - data=json.dumps(request_body), - headers=headers) + r = requests.post(self.url_mme_, data=json.dumps(request_body), headers=headers) time.sleep(5) # wait for model to load self.assertEqual( - r.status_code, 200, - "Expected status code 200, received {}".format(r.status_code)) + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) # Invoke ensemble model inputs = [] @@ -306,12 +347,11 @@ def test_sm_6_ensemble_model(self): inputs[0].set_data_from_numpy(input_data, binary_data=False) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) request_body, _ = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + inputs, outputs=outputs + ) headers = {"Content-Type": "application/json"} invoke_url = "{}/{}/invoke".format(self.url_mme_, self.model3_name) @@ -319,16 +359,20 @@ def test_sm_6_ensemble_model(self): print(f"response: {r.text}") r.raise_for_status() self.assertEqual( - r.status_code, 200, - "Expected status code 200, received {}".format(r.status_code)) + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) # Unload ensemble model unload_url = "{}/{}".format(self.url_mme_, self.model3_name) r = requests.delete(unload_url, headers=headers) time.sleep(5) self.assertEqual( - r.status_code, 200, - "Expected status code 200, received {}".format(r.status_code)) + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) if __name__ == "__main__": diff --git a/qa/L0_sagemaker/sagemaker_test.py b/qa/L0_sagemaker/sagemaker_test.py old mode 100644 new mode 100755 index 3989e4aa93..6e76a9f0fd --- a/qa/L0_sagemaker/sagemaker_test.py +++ b/qa/L0_sagemaker/sagemaker_test.py @@ -29,80 +29,95 @@ sys.path.append("../common") +import json import os +import sys import unittest + import numpy as np +import requests import test_util as tu import tritonclient.http as httpclient -import json -import os -import requests -import sys - class SageMakerTest(tu.TestResultCollector): - def setUp(self): - SAGEMAKER_BIND_TO_PORT = os.getenv('SAGEMAKER_BIND_TO_PORT', '8080') - self.url_ = "http://localhost:{}/invocations".format( - SAGEMAKER_BIND_TO_PORT) - self.input_data_ = [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - ] + SAGEMAKER_BIND_TO_PORT = os.getenv("SAGEMAKER_BIND_TO_PORT", "8080") + self.url_ = "http://localhost:{}/invocations".format(SAGEMAKER_BIND_TO_PORT) + self.input_data_ = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] self.expected_output0_data_ = [ - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 - ] - self.expected_output1_data_ = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + 0, + 2, + 4, + 6, + 8, + 10, + 12, + 14, + 16, + 18, + 20, + 22, + 24, + 26, + 28, + 30, ] + self.expected_output1_data_ = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] self.expected_result_ = { - "model_name": - "sm_model", - "model_version": - "1", - "outputs": [{ - "name": "OUTPUT0", - "datatype": 
"INT32", - "shape": [1, 16], - "data": self.expected_output0_data_ - }, { - "name": "OUTPUT1", - "datatype": "INT32", - "shape": [1, 16], - "data": self.expected_output1_data_ - }] + "model_name": "sm_model", + "model_version": "1", + "outputs": [ + { + "name": "OUTPUT0", + "datatype": "INT32", + "shape": [1, 16], + "data": self.expected_output0_data_, + }, + { + "name": "OUTPUT1", + "datatype": "INT32", + "shape": [1, 16], + "data": self.expected_output1_data_, + }, + ], } def test_direct_inference(self): request = { - "inputs": [{ - "name": "INPUT0", - "datatype": "INT32", - "shape": [1, 16], - "data": self.input_data_ - }, { - "name": "INPUT1", - "datatype": "INT32", - "shape": [1, 16], - "data": self.input_data_ - }] + "inputs": [ + { + "name": "INPUT0", + "datatype": "INT32", + "shape": [1, 16], + "data": self.input_data_, + }, + { + "name": "INPUT1", + "datatype": "INT32", + "shape": [1, 16], + "data": self.input_data_, + }, + ] } - headers = {'Content-Type': 'application/json'} + headers = {"Content-Type": "application/json"} r = requests.post(self.url_, data=json.dumps(request), headers=headers) r.raise_for_status() self.assertEqual( - self.expected_result_, r.json(), + self.expected_result_, + r.json(), "Expected response body: {}; got: {}".format( - self.expected_result_, r.json())) + self.expected_result_, r.json() + ), + ) def test_inference_client_generated_request(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -110,27 +125,29 @@ def test_inference_client_generated_request(self): inputs[0].set_data_from_numpy(input_data, binary_data=False) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) request_body, _ = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + inputs, outputs=outputs + ) - headers = {'Content-Type': 'application/json'} + headers = {"Content-Type": "application/json"} r = requests.post(self.url_, data=request_body, headers=headers) r.raise_for_status() self.assertEqual( - self.expected_result_, r.json(), + self.expected_result_, + r.json(), "Expected response body: {}; got: {}".format( - self.expected_result_, r.json())) + self.expected_result_, r.json() + ), + ) def test_inference_client_generated_request_binary(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -138,31 +155,36 @@ def test_inference_client_generated_request_binary(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - 
outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/vnd.sagemaker-triton.binary+json;json-header-size={}' - .format(header_length) + "Content-Type": "application/vnd.sagemaker-triton.binary+json;json-header-size={}".format( + header_length + ) } r = requests.post(self.url_, data=request_body, headers=headers) r.raise_for_status() self.assertEqual( - self.expected_result_, r.json(), + self.expected_result_, + r.json(), "Expected response body: {}; got: {}".format( - self.expected_result_, r.json())) + self.expected_result_, r.json() + ), + ) def test_inference_client_generated_response(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -170,22 +192,20 @@ def test_inference_client_generated_response(self): inputs[0].set_data_from_numpy(input_data, binary_data=False) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) request_body, _ = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + inputs, outputs=outputs + ) - headers = {'Content-Type': 'application/json'} + headers = {"Content-Type": "application/json"} r = requests.post(self.url_, data=request_body, headers=headers) r.raise_for_status() - result = httpclient.InferenceServerClient.parse_response_body( - r._content) + result = httpclient.InferenceServerClient.parse_response_body(r._content) - output0_data = result.as_numpy('OUTPUT0') - output1_data = result.as_numpy('OUTPUT1') + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") for i in range(16): self.assertEqual(output0_data[0][i], self.expected_output0_data_[i]) self.assertEqual(output1_data[0][i], self.expected_output1_data_[i]) @@ -193,8 +213,8 @@ def test_inference_client_generated_response(self): def test_inference_client_generated_response_binary(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -202,25 +222,26 @@ def test_inference_client_generated_response_binary(self): inputs[0].set_data_from_numpy(input_data, binary_data=False) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', 
binary_data=True)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) request_body, _ = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + inputs, outputs=outputs + ) - headers = {'Content-Type': 'application/json'} + headers = {"Content-Type": "application/json"} r = requests.post(self.url_, data=request_body, headers=headers) r.raise_for_status() - header_length_prefix = "application/vnd.sagemaker-triton.binary+json;json-header-size=" - header_length_str = r.headers['Content-Type'][len(header_length_prefix - ):] + header_length_prefix = ( + "application/vnd.sagemaker-triton.binary+json;json-header-size=" + ) + header_length_str = r.headers["Content-Type"][len(header_length_prefix) :] result = httpclient.InferenceServerClient.parse_response_body( - r._content, header_length=int(header_length_str)) + r._content, header_length=int(header_length_str) + ) - output0_data = result.as_numpy('OUTPUT0') - output1_data = result.as_numpy('OUTPUT1') + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") for i in range(16): self.assertEqual(output0_data[0][i], self.expected_output0_data_[i]) self.assertEqual(output1_data[0][i], self.expected_output1_data_[i]) @@ -228,8 +249,8 @@ def test_inference_client_generated_response_binary(self): def test_malformed_binary_header(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -237,29 +258,34 @@ def test_malformed_binary_header(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'additional-string/application/vnd.sagemaker-triton.binary+json;json-header-size={}' - .format(header_length) + "Content-Type": "additional-string/application/vnd.sagemaker-triton.binary+json;json-header-size={}".format( + header_length + ) } r = requests.post(self.url_, data=request_body, headers=headers) self.assertEqual( - 400, r.status_code, + 400, + r.status_code, "Expected error code {} returned for the request; got: {}".format( - 400, r.status_code)) + 400, r.status_code + ), + ) def test_malformed_binary_header_not_number(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the 
data input_data = np.array(self.input_data_, dtype=np.int32) @@ -267,29 +293,34 @@ def test_malformed_binary_header_not_number(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/vnd.sagemaker-triton.binary+json;json-header-size=additional-string{}' - .format(header_length) + "Content-Type": "application/vnd.sagemaker-triton.binary+json;json-header-size=additional-string{}".format( + header_length + ) } r = requests.post(self.url_, data=request_body, headers=headers) self.assertEqual( - 400, r.status_code, + 400, + r.status_code, "Expected error code {} returned for the request; got: {}".format( - 400, r.status_code)) + 400, r.status_code + ), + ) def test_malformed_binary_header_negative_number(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -297,28 +328,32 @@ def test_malformed_binary_header_negative_number(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/vnd.sagemaker-triton.binary+json;json-header-size=-123' + "Content-Type": "application/vnd.sagemaker-triton.binary+json;json-header-size=-123" } r = requests.post(self.url_, data=request_body, headers=headers) self.assertEqual( - 400, r.status_code, + 400, + r.status_code, "Expected error code {} returned for the request; got: {}".format( - 400, r.status_code)) + 400, r.status_code + ), + ) def test_malformed_binary_header_large_number(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -326,23 +361,27 @@ def test_malformed_binary_header_large_number(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) 
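            # Sending INPUT0 as binary data forces the binary+json request format,
            # where the Content-Type is expected to advertise the JSON header length, e.g.
            #   application/vnd.sagemaker-triton.binary+json;json-header-size=<actual length>
            # This test deliberately reports a size (12345) larger than the real JSON
            # header, so the server should reject the request with HTTP 400.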
inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/vnd.sagemaker-triton.binary+json;json-header-size=12345' + "Content-Type": "application/vnd.sagemaker-triton.binary+json;json-header-size=12345" } r = requests.post(self.url_, data=request_body, headers=headers) self.assertEqual( - 400, r.status_code, + 400, + r.status_code, "Expected error code {} returned for the request; got: {}".format( - 400, r.status_code)) + 400, r.status_code + ), + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_savedmodel_shape/saved_model_shape_test.py b/qa/L0_savedmodel_shape/saved_model_shape_test.py old mode 100644 new mode 100755 index 5c754ad600..b5ae13a680 --- a/qa/L0_savedmodel_shape/saved_model_shape_test.py +++ b/qa/L0_savedmodel_shape/saved_model_shape_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,192 +31,198 @@ sys.path.append("../common") import unittest -import numpy as np + import infer_util as iu +import numpy as np import test_util as tu np_dtype_string = np.dtype(object) class SavedModelShapeTest(tu.TestResultCollector): - - def _full_exact(self, input_dtype, output0_dtype, output1_dtype, - output0_raw, output1_raw, swap): - - def _infer_exact_helper(tester, - pf, - tensor_shape, - batch_size, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=True, - output1_raw=True, - model_version=None, - swap=False, - outputs=("OUTPUT0", "OUTPUT1"), - use_http=True, - use_grpc=True, - skip_request_id_check=False, - use_streaming=True, - correlation_id=0): + def _full_exact( + self, input_dtype, output0_dtype, output1_dtype, output0_raw, output1_raw, swap + ): + def _infer_exact_helper( + tester, + pf, + tensor_shape, + batch_size, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=True, + output1_raw=True, + model_version=None, + swap=False, + outputs=("OUTPUT0", "OUTPUT1"), + use_http=True, + use_grpc=True, + skip_request_id_check=False, + use_streaming=True, + correlation_id=0, + ): for bs in (1, batch_size): # model that does not support batching if bs == 1: - iu.infer_exact(tester, - "savedmodel_nobatch", - tensor_shape, - bs, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - model_version=model_version, - swap=swap, - outputs=outputs, - use_http=use_http, - use_grpc=use_grpc, - skip_request_id_check=skip_request_id_check, - use_streaming=use_streaming, - correlation_id=correlation_id) + iu.infer_exact( + tester, + "savedmodel_nobatch", + tensor_shape, + bs, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + 
model_version=model_version, + swap=swap, + outputs=outputs, + use_http=use_http, + use_grpc=use_grpc, + skip_request_id_check=skip_request_id_check, + use_streaming=use_streaming, + correlation_id=correlation_id, + ) # model that supports batching - iu.infer_exact(tester, - "savedmodel", (bs,) + tensor_shape, - bs, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - model_version=model_version, - swap=swap, - outputs=outputs, - use_http=use_http, - use_grpc=use_grpc, - skip_request_id_check=skip_request_id_check, - use_streaming=use_streaming, - correlation_id=correlation_id) + iu.infer_exact( + tester, + "savedmodel", + (bs,) + tensor_shape, + bs, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + model_version=model_version, + swap=swap, + outputs=outputs, + use_http=use_http, + use_grpc=use_grpc, + skip_request_id_check=skip_request_id_check, + use_streaming=use_streaming, + correlation_id=correlation_id, + ) input_size = 16 - if tu.validate_for_tf_model(input_dtype, output0_dtype, output1_dtype, - (input_size,), (input_size,), - (input_size,)): - _infer_exact_helper(self, - "savedmodel", (input_size,), - 8, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - swap=swap) + if tu.validate_for_tf_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size,), + (input_size,), + (input_size,), + ): + _infer_exact_helper( + self, + "savedmodel", + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) def test_raw_bbb(self): - self._full_exact(np.int8, - np.int8, - np.int8, - output0_raw=True, - output1_raw=True, - swap=True) + self._full_exact( + np.int8, np.int8, np.int8, output0_raw=True, output1_raw=True, swap=True + ) def test_raw_sss(self): - self._full_exact(np.int16, - np.int16, - np.int16, - output0_raw=True, - output1_raw=True, - swap=True) + self._full_exact( + np.int16, np.int16, np.int16, output0_raw=True, output1_raw=True, swap=True + ) def test_raw_iii(self): - self._full_exact(np.int32, - np.int32, - np.int32, - output0_raw=True, - output1_raw=True, - swap=True) + self._full_exact( + np.int32, np.int32, np.int32, output0_raw=True, output1_raw=True, swap=True + ) def test_raw_lll(self): - self._full_exact(np.int64, - np.int64, - np.int64, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.int64, np.int64, np.int64, output0_raw=True, output1_raw=True, swap=False + ) def test_raw_hhh(self): - self._full_exact(np.float16, - np.float16, - np.float16, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.float16, + np.float16, + np.float16, + output0_raw=True, + output1_raw=True, + swap=False, + ) def test_raw_fff(self): - self._full_exact(np.float32, - np.float32, - np.float32, - output0_raw=True, - output1_raw=True, - swap=True) + self._full_exact( + np.float32, + np.float32, + np.float32, + output0_raw=True, + output1_raw=True, + swap=True, + ) def test_raw_hff(self): - self._full_exact(np.float16, - np.float32, - np.float32, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.float16, + np.float32, + np.float32, + output0_raw=True, + output1_raw=True, + swap=False, + ) def test_raw_bii(self): - self._full_exact(np.int8, - np.int32, - np.int32, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.int8, np.int32, np.int32, 
output0_raw=True, output1_raw=True, swap=False + ) def test_raw_ibb(self): - self._full_exact(np.int32, - np.int8, - np.int8, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.int32, np.int8, np.int8, output0_raw=True, output1_raw=True, swap=False + ) def test_raw_ibs(self): - self._full_exact(np.int32, - np.int8, - np.int16, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.int32, np.int8, np.int16, output0_raw=True, output1_raw=True, swap=False + ) def test_raw_iff(self): - self._full_exact(np.int32, - np.float32, - np.float32, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.int32, + np.float32, + np.float32, + output0_raw=True, + output1_raw=True, + swap=False, + ) def test_raw_fii(self): - self._full_exact(np.float32, - np.int32, - np.int32, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.float32, + np.int32, + np.int32, + output0_raw=True, + output1_raw=True, + swap=False, + ) def test_raw_ihs(self): - self._full_exact(np.int32, - np.float16, - np.int16, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.int32, + np.float16, + np.int16, + output0_raw=True, + output1_raw=True, + swap=False, + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_savedmodel_shape/test.sh b/qa/L0_savedmodel_shape/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_secure_grpc/test.sh b/qa/L0_secure_grpc/test.sh old mode 100644 new mode 100755 index e1f9c8dd0b..63c9b104a6 --- a/qa/L0_secure_grpc/test.sh +++ b/qa/L0_secure_grpc/test.sh @@ -56,23 +56,23 @@ rm -fr *.log *.log.* # Generate valid CA openssl genrsa -passout pass:1234 -des3 -out ca.key 4096 -openssl req -passin pass:1234 -new -x509 -days 365 -key ca.key -out ca.crt -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Test/CN=Root CA" +openssl req -passin pass:1234 -new -x509 -days 365 -key ca.key -out ca.crt -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Test/CN=Root CA" # Generate valid Server Key/Cert openssl genrsa -passout pass:1234 -des3 -out server.key 4096 -openssl req -passin pass:1234 -new -key server.key -out server.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Server/CN=localhost" -openssl x509 -req -passin pass:1234 -days 365 -in server.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out server.crt +openssl req -passin pass:1234 -new -key server.key -out server.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Server/CN=localhost" +openssl x509 -req -passin pass:1234 -days 365 -in server.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out server.crt # Remove passphrase from the Server Key -openssl rsa -passin pass:1234 -in server.key -out server.key +openssl rsa -passin pass:1234 -in server.key -out server.key # Generate valid Client Key/Cert openssl genrsa -passout pass:1234 -des3 -out client.key 4096 -openssl req -passin pass:1234 -new -key client.key -out client.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Client/CN=localhost" -openssl x509 -passin pass:1234 -req -days 365 -in client.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out client.crt +openssl req -passin pass:1234 -new -key client.key -out client.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Client/CN=localhost" +openssl x509 -passin pass:1234 -req -days 365 -in client.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out client.crt # Remove passphrase from Client Key -openssl rsa -passin pass:1234 -in client.key -out client.key +openssl rsa -passin pass:1234 -in client.key -out 

client.key # Create mutated client key (Make first char of each like capital) cp client.key client2.key && sed -i "s/\b\(.\)/\u\1/g" client2.key diff --git a/qa/L0_sequence_batcher/sequence_batcher_test.py b/qa/L0_sequence_batcher/sequence_batcher_test.py old mode 100644 new mode 100755 index c2ccd0111e..11b659b05a --- a/qa/L0_sequence_batcher/sequence_batcher_test.py +++ b/qa/L0_sequence_batcher/sequence_batcher_test.py @@ -2997,4 +2997,4 @@ def test_send_request_after_timeout(self): if __name__ == '__main__': - unittest.main() + unittest.main() \ No newline at end of file diff --git a/qa/L0_sequence_batcher/test.sh b/qa/L0_sequence_batcher/test.sh index 208221f8ef..3dabfaba7a 100755 --- a/qa/L0_sequence_batcher/test.sh +++ b/qa/L0_sequence_batcher/test.sh @@ -241,7 +241,7 @@ for BACKEND in $BACKENDS; do MODELS="$MODELS ../custom_models/custom_sequence_int32" else DTYPES=$(get_datatype $BACKEND) - + for DTYPE in $DTYPES; do MODELS="$MODELS $DATADIR/$FIXED_MODEL_REPOSITORY/${BACKEND}_sequence_${DTYPE}" done @@ -256,7 +256,7 @@ for BACKEND in $BACKENDS; do MODELS="$MODELS ${TMP//onnx/python}" else MODELS="$MODELS $DATADIR/qa_ensemble_model_repository/$FIXED_MODEL_REPOSITORY/*_${BACKEND}_sequence_${DTYPE}" - fi + fi fi done fi @@ -743,7 +743,7 @@ done # Test request timeout with sequence batcher # only run the test outside shared memory setting as -# shared memory feature is irrelevant +# shared memory feature is irrelevant if [ "$TEST_SYSTEM_SHARED_MEMORY" -ne 1 ] && [ "$TEST_CUDA_SHARED_MEMORY" -ne 1 ]; then export NO_BATCHING=0 export MODEL_INSTANCES=1 diff --git a/qa/L0_sequence_corrid_batcher/sequence_corrid_batcher_test.py b/qa/L0_sequence_corrid_batcher/sequence_corrid_batcher_test.py old mode 100644 new mode 100755 index dee5502c78..15f16da352 --- a/qa/L0_sequence_corrid_batcher/sequence_corrid_batcher_test.py +++ b/qa/L0_sequence_corrid_batcher/sequence_corrid_batcher_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,24 +31,22 @@ sys.path.append("../common") import os -import time import threading +import time import unittest + import numpy as np -import test_util as tu import sequence_util as su +import test_util as tu -_test_system_shared_memory = bool( - int(os.environ.get('TEST_SYSTEM_SHARED_MEMORY', 0))) -_test_cuda_shared_memory = bool( - int(os.environ.get('TEST_CUDA_SHARED_MEMORY', 0))) +_test_system_shared_memory = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0))) +_test_cuda_shared_memory = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0))) -_no_batching = (int(os.environ['NO_BATCHING']) == 1) -_model_instances = int(os.environ['MODEL_INSTANCES']) +_no_batching = int(os.environ["NO_BATCHING"]) == 1 +_model_instances = int(os.environ["MODEL_INSTANCES"]) if _no_batching: - _trials = ("savedmodel_nobatch", "graphdef_nobatch", "plan_nobatch", - "onnx_nobatch") + _trials = ("savedmodel_nobatch", "graphdef_nobatch", "plan_nobatch", "onnx_nobatch") else: _trials = ("savedmodel", "graphdef", "plan", "onnx") @@ -55,23 +55,20 @@ class SequenceCorrIDBatcherTest(su.SequenceBatcherTestUtil): - def get_datatype(self, trial): return np.int32 - def get_expected_result(self, - expected_result, - corrid, - value, - trial, - flag_str=None): + def get_expected_result(self, expected_result, corrid, value, trial, flag_str=None): # Adjust the expected_result for models that - # couldn't implement the full accumulator. See + # could not implement the full accumulator. See # qa/common/gen_qa_dyna_sequence_models.py for more # information. - if ((("nobatch" not in trial) and ("custom" not in trial)) or \ - ("graphdef" in trial) or ("plan" in trial) or \ - ("onnx" in trial)) or ("libtorch" in trial): + if ( + (("nobatch" not in trial) and ("custom" not in trial)) + or ("graphdef" in trial) + or ("plan" in trial) + or ("onnx" in trial) + ) or ("libtorch" in trial): expected_result = value if flag_str is not None: if "start" in flag_str: @@ -88,14 +85,16 @@ def test_skip_batch(self): for trial in _trials: self.clear_deferred_exceptions() dtype = self.get_datatype(trial) - precreated_shm0_handles = self.precreate_register_regions((1, 3), - dtype, 0) + precreated_shm0_handles = self.precreate_register_regions((1, 3), dtype, 0) precreated_shm1_handles = self.precreate_register_regions( - (11, 12, 13, 14), dtype, 1) + (11, 12, 13, 14), dtype, 1 + ) precreated_shm2_handles = self.precreate_register_regions( - (111, 113), dtype, 2) + (111, 113), dtype, 2 + ) precreated_shm3_handles = self.precreate_register_regions( - (1111, 1112, 1113, 1114), dtype, 3) + (1111, 1112, 1113, 1114), dtype, 3 + ) try: model_name = tu.get_dyna_sequence_model_name(trial, dtype) @@ -104,12 +103,11 @@ def test_skip_batch(self): # Need scheduler to wait for queue to contain all # inferences for both sequences. 
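                # The test harness is expected to start the server with
                # TRITONSERVER_DELAY_SCHEDULER=12 (the total number of requests
                # across the four sequences below) so the sequence batcher holds
                # scheduling until everything is queued, and with
                # TRITONSERVER_BACKLOG_DELAY_SCHEDULER=0 so no backlog delay is
                # applied; the assertions below verify that setup.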
self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 12) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 12) - self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", - os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0) + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0 + ) corrids = [1001, 1002, 1003, 1004] threads = [] @@ -124,12 +122,14 @@ def test_skip_batch(self): (None, None), # (flag_str, value, pre_delay_ms) (("start", 1, None), ("end", 3, None)), - self.get_expected_result(4 + corrids[0], corrids[0], - 3, trial, "end"), - precreated_shm0_handles), - kwargs={ - 'sequence_name': "{}".format(self._testMethodName) - })) + self.get_expected_result( + 4 + corrids[0], corrids[0], 3, trial, "end" + ), + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads.append( threading.Thread( target=self.check_sequence_async, @@ -140,15 +140,20 @@ def test_skip_batch(self): corrids[1], (None, None), # (flag_str, value, pre_delay_ms) - (("start", 11, None), (None, 12, None), - (None, 13, None), ("end", 14, None)), - self.get_expected_result(50 + corrids[1], - corrids[1], 14, trial, - "end"), - precreated_shm1_handles), - kwargs={ - 'sequence_name': "{}".format(self._testMethodName) - })) + ( + ("start", 11, None), + (None, 12, None), + (None, 13, None), + ("end", 14, None), + ), + self.get_expected_result( + 50 + corrids[1], corrids[1], 14, trial, "end" + ), + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads.append( threading.Thread( target=self.check_sequence_async, @@ -160,13 +165,14 @@ def test_skip_batch(self): (None, None), # (flag_str, value, pre_delay_ms) (("start", 111, None), ("end", 113, None)), - self.get_expected_result(224 + corrids[2], - corrids[2], 113, trial, - "end"), - precreated_shm2_handles), - kwargs={ - 'sequence_name': "{}".format(self._testMethodName) - })) + self.get_expected_result( + 224 + corrids[2], corrids[2], 113, trial, "end" + ), + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads.append( threading.Thread( target=self.check_sequence_async, @@ -177,15 +183,20 @@ def test_skip_batch(self): corrids[3], (None, None), # (flag_str, value, pre_delay_ms) - (("start", 1111, None), (None, 1112, None), - (None, 1113, None), ("end", 1114, None)), - self.get_expected_result(4450 + corrids[3], - corrids[3], 1114, trial, - "end"), - precreated_shm3_handles), - kwargs={ - 'sequence_name': "{}".format(self._testMethodName) - })) + ( + ("start", 1111, None), + (None, 1112, None), + (None, 1113, None), + ("end", 1114, None), + ), + self.get_expected_result( + 4450 + corrids[3], corrids[3], 1114, trial, "end" + ), + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads[1].start() threads[3].start() @@ -211,5 +222,5 @@ def test_skip_batch(self): self.cleanup_shm_regions(precreated_shm3_handles) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_sequence_stress/sequence_stress.py b/qa/L0_sequence_stress/sequence_stress.py old mode 100644 new mode 100755 index 26d7f4bbfa..039cf793a2 --- a/qa/L0_sequence_stress/sequence_stress.py +++ b/qa/L0_sequence_stress/sequence_stress.py @@ -1,4 +1,6 @@ -# Copyright (c) 2019-2020, 
NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,14 +31,14 @@ sys.path.append("../common") import argparse -from builtins import range -from builtins import str -import time import threading +import time import traceback +from builtins import range, str +from functools import partial + import numpy as np import test_util as tu -from functools import partial import tritongrpcclient as grpcclient from tritonclientutils import np_to_triton_dtype @@ -56,7 +58,6 @@ class UserData: - def __init__(self): self._completed_requests = queue.Queue() @@ -71,21 +72,27 @@ class TimeoutException(Exception): pass -def check_sequence_async(client_metadata, - trial, - model_name, - input_dtype, - steps, - timeout_ms=DEFAULT_TIMEOUT_MS, - sequence_name=""): +def check_sequence_async( + client_metadata, + trial, + model_name, + input_dtype, + steps, + timeout_ms=DEFAULT_TIMEOUT_MS, + sequence_name="", +): """Perform sequence of inferences using async run. The 'steps' holds a list of tuples, one for each inference with format: (flag_str, value, expected_result, delay_ms) """ - if (("savedmodel" in trial) or ("graphdef" in trial) or - ("custom" in trial) or ("plan" in trial)): + if ( + ("savedmodel" in trial) + or ("graphdef" in trial) + or ("custom" in trial) + or ("plan" in trial) + ): tensor_shape = ( 1, 1, @@ -108,27 +115,29 @@ def check_sequence_async(client_metadata, seq_start = False seq_end = False if flag_str is not None: - seq_start = ("start" in flag_str) - seq_end = ("end" in flag_str) + seq_start = "start" in flag_str + seq_end = "end" in flag_str if input_dtype == np.object_: in0 = np.full(tensor_shape, value, dtype=np.int32) - in0n = np.array([str(x) for x in in0.reshape(in0.size)], - dtype=object) + in0n = np.array([str(x) for x in in0.reshape(in0.size)], dtype=object) in0 = in0n.reshape(tensor_shape) else: in0 = np.full(tensor_shape, value, dtype=input_dtype) inputs = [ - grpcclient.InferInput("INPUT", tensor_shape, - np_to_triton_dtype(input_dtype)), + grpcclient.InferInput( + "INPUT", tensor_shape, np_to_triton_dtype(input_dtype) + ), ] inputs[0].set_data_from_numpy(in0) - triton_client.async_stream_infer(model_name, - inputs, - sequence_id=sequence_id, - sequence_start=seq_start, - sequence_end=seq_end) + triton_client.async_stream_infer( + model_name, + inputs, + sequence_id=sequence_id, + sequence_start=seq_start, + sequence_end=seq_end, + ) sent_count += 1 if delay_ms is not None: @@ -147,23 +156,21 @@ def check_sequence_async(client_metadata, if timeout_ms != None: now_ms = int(round(time.time() * 1000)) if (now_ms - seq_start_ms) > timeout_ms: - raise TimeoutException( - "Timeout expired for {}".format(sequence_name)) + raise TimeoutException("Timeout expired for {}".format(sequence_name)) result = results.as_numpy("OUTPUT")[0][0] if FLAGS.verbose: - print("{} {}: + {} = {}".format(sequence_name, sequence_id, value, - result)) + print("{} {}: + {} = {}".format(sequence_name, sequence_id, value, result)) if expected is not None: if input_dtype == np.object_: - assert int( - result - ) == expected, "{}: expected result {}, got {}".format( - sequence_name, expected, int(result)) + assert int(result) == expected, "{}: expected result {}, got {}".format( + sequence_name, expected, int(result) + ) else: assert result == expected, "{}: expected result {}, 
got {}".format( - sequence_name, expected, result) + sequence_name, expected, result + ) triton_client.stop_stream() @@ -176,12 +183,12 @@ def get_datatype(trial): return np.int32 -def sequence_valid(client_metadata, rng, trial, model_name, dtype, len_mean, - len_stddev, sequence_name): +def sequence_valid( + client_metadata, rng, trial, model_name, dtype, len_mean, len_stddev, sequence_name +): # Create a variable length sequence with "start" and "end" flags. seqlen = max(1, int(rng.normal(len_mean, len_stddev))) - print("{} {}: valid seqlen = {}".format(sequence_name, client_metadata[1], - seqlen)) + print("{} {}: valid seqlen = {}".format(sequence_name, client_metadata[1], seqlen)) values = rng.randint(0, 1024 * 1024, size=seqlen, dtype=dtype) @@ -200,31 +207,34 @@ def sequence_valid(client_metadata, rng, trial, model_name, dtype, len_mean, expected_result += val # (flag_str, value, expected_result, delay_ms) - steps.append((flags, val, expected_result, delay_ms),) + steps.append( + (flags, val, expected_result, delay_ms), + ) - check_sequence_async(client_metadata, - trial, - model_name, - dtype, - steps, - sequence_name=sequence_name) + check_sequence_async( + client_metadata, trial, model_name, dtype, steps, sequence_name=sequence_name + ) -def sequence_valid_valid(client_metadata, rng, trial, model_name, dtype, - len_mean, len_stddev, sequence_name): +def sequence_valid_valid( + client_metadata, rng, trial, model_name, dtype, len_mean, len_stddev, sequence_name +): # Create two variable length sequences with "start" and "end" # flags, where both sequences use the same correlation ID and are # sent back-to-back. seqlen = [ max(1, int(rng.normal(len_mean, len_stddev))), - max(1, int(rng.normal(len_mean, len_stddev))) + max(1, int(rng.normal(len_mean, len_stddev))), ] - print("{} {}: valid-valid seqlen[0] = {}, seqlen[1] = {}".format( - sequence_name, client_metadata[1], seqlen[0], seqlen[1])) + print( + "{} {}: valid-valid seqlen[0] = {}, seqlen[1] = {}".format( + sequence_name, client_metadata[1], seqlen[0], seqlen[1] + ) + ) values = [ rng.randint(0, 1024 * 1024, size=seqlen[0], dtype=dtype), - rng.randint(0, 1024 * 1024, size=seqlen[1], dtype=dtype) + rng.randint(0, 1024 * 1024, size=seqlen[1], dtype=dtype), ] for p in [0, 1]: @@ -243,31 +253,34 @@ def sequence_valid_valid(client_metadata, rng, trial, model_name, dtype, expected_result += val # (flag_str, value, expected_result, delay_ms) - steps.append((flags, val, expected_result, delay_ms),) + steps.append( + (flags, val, expected_result, delay_ms), + ) - check_sequence_async(client_metadata, - trial, - model_name, - dtype, - steps, - sequence_name=sequence_name) + check_sequence_async( + client_metadata, trial, model_name, dtype, steps, sequence_name=sequence_name + ) -def sequence_valid_no_end(client_metadata, rng, trial, model_name, dtype, - len_mean, len_stddev, sequence_name): +def sequence_valid_no_end( + client_metadata, rng, trial, model_name, dtype, len_mean, len_stddev, sequence_name +): # Create two variable length sequences, the first with "start" and # "end" flags and the second with no "end" flag, where both # sequences use the same correlation ID and are sent back-to-back. 
seqlen = [ max(1, int(rng.normal(len_mean, len_stddev))), - max(1, int(rng.normal(len_mean, len_stddev))) + max(1, int(rng.normal(len_mean, len_stddev))), ] - print("{} {}: valid-no-end seqlen[0] = {}, seqlen[1] = {}".format( - sequence_name, client_metadata[1], seqlen[0], seqlen[1])) + print( + "{} {}: valid-no-end seqlen[0] = {}, seqlen[1] = {}".format( + sequence_name, client_metadata[1], seqlen[0], seqlen[1] + ) + ) values = [ rng.randint(0, 1024 * 1024, size=seqlen[0], dtype=dtype), - rng.randint(0, 1024 * 1024, size=seqlen[1], dtype=dtype) + rng.randint(0, 1024 * 1024, size=seqlen[1], dtype=dtype), ] for p in [0, 1]: @@ -286,23 +299,22 @@ def sequence_valid_no_end(client_metadata, rng, trial, model_name, dtype, expected_result += val # (flag_str, value, expected_result, delay_ms) - steps.append((flags, val, expected_result, delay_ms),) + steps.append( + (flags, val, expected_result, delay_ms), + ) - check_sequence_async(client_metadata, - trial, - model_name, - dtype, - steps, - sequence_name=sequence_name) + check_sequence_async( + client_metadata, trial, model_name, dtype, steps, sequence_name=sequence_name + ) -def sequence_no_start(client_metadata, rng, trial, model_name, dtype, - sequence_name): +def sequence_no_start(client_metadata, rng, trial, model_name, dtype, sequence_name): # Create a sequence without a "start" flag. Sequence should get an # error from the server. seqlen = 1 - print("{} {}: no-start seqlen = {}".format(sequence_name, - client_metadata[1], seqlen)) + print( + "{} {}: no-start seqlen = {}".format(sequence_name, client_metadata[1], seqlen) + ) values = rng.randint(0, 1024 * 1024, size=seqlen, dtype=dtype) @@ -314,29 +326,33 @@ def sequence_no_start(client_metadata, rng, trial, model_name, dtype, delay_ms = None # (flag_str, value, expected_result, delay_ms) - steps.append((flags, val, None, delay_ms),) + steps.append( + (flags, val, None, delay_ms), + ) try: - check_sequence_async(client_metadata, - trial, - model_name, - dtype, - steps, - sequence_name=sequence_name) + check_sequence_async( + client_metadata, + trial, + model_name, + dtype, + steps, + sequence_name=sequence_name, + ) assert False, "expected inference failure from missing START flag" except Exception as ex: if "must specify the START flag" not in ex.message(): raise -def sequence_no_end(client_metadata, rng, trial, model_name, dtype, len_mean, - len_stddev, sequence_name): +def sequence_no_end( + client_metadata, rng, trial, model_name, dtype, len_mean, len_stddev, sequence_name +): # Create a variable length sequence with "start" flag but that # never ends. The sequence should be aborted by the server and its # slot reused for another sequence. 
seqlen = max(1, int(rng.normal(len_mean, len_stddev))) - print("{} {}: no-end seqlen = {}".format(sequence_name, client_metadata[1], - seqlen)) + print("{} {}: no-end seqlen = {}".format(sequence_name, client_metadata[1], seqlen)) values = rng.randint(0, 1024 * 1024, size=seqlen, dtype=dtype) @@ -353,18 +369,16 @@ def sequence_no_end(client_metadata, rng, trial, model_name, dtype, len_mean, expected_result += val # (flag_str, value, expected_result, delay_ms) - steps.append((flags, val, expected_result, delay_ms),) + steps.append( + (flags, val, expected_result, delay_ms), + ) - check_sequence_async(client_metadata, - trial, - model_name, - dtype, - steps, - sequence_name=sequence_name) + check_sequence_async( + client_metadata, trial, model_name, dtype, steps, sequence_name=sequence_name + ) -def stress_thread(name, seed, pass_cnt, correlation_id_base, trial, model_name, - dtype): +def stress_thread(name, seed, pass_cnt, correlation_id_base, trial, model_name, dtype): # Thread responsible for generating sequences of inference # requests. global _thread_exceptions @@ -390,9 +404,13 @@ def stress_thread(name, seed, pass_cnt, correlation_id_base, trial, model_name, for c in range(common_cnt + rare_cnt): client_metadata_list.append( - (grpcclient.InferenceServerClient("localhost:8001", - verbose=FLAGS.verbose), - correlation_id_base + c)) + ( + grpcclient.InferenceServerClient( + "localhost:8001", verbose=FLAGS.verbose + ), + correlation_id_base + c, + ) + ) last_choices.append(None) rare_idx = 0 @@ -408,34 +426,40 @@ def stress_thread(name, seed, pass_cnt, correlation_id_base, trial, model_name, # exercise the idle sequence path of the sequence # scheduler if choice < 0.33: - sequence_no_end(client_metadata_list[client_idx], - rng, - trial, - model_name, - dtype, - SEQUENCE_LENGTH_MEAN, - SEQUENCE_LENGTH_STDEV, - sequence_name=name) + sequence_no_end( + client_metadata_list[client_idx], + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) last_choices[client_idx] = "no-end" elif choice < 0.66: - sequence_valid_no_end(client_metadata_list[client_idx], - rng, - trial, - model_name, - dtype, - SEQUENCE_LENGTH_MEAN, - SEQUENCE_LENGTH_STDEV, - sequence_name=name) + sequence_valid_no_end( + client_metadata_list[client_idx], + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) last_choices[client_idx] = "valid-no-end" else: - sequence_valid_valid(client_metadata_list[client_idx], - rng, - trial, - model_name, - dtype, - SEQUENCE_LENGTH_MEAN, - SEQUENCE_LENGTH_STDEV, - sequence_name=name) + sequence_valid_valid( + client_metadata_list[client_idx], + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) last_choices[client_idx] = "valid-valid" rare_idx = (rare_idx + 1) % rare_cnt @@ -451,54 +475,67 @@ def stress_thread(name, seed, pass_cnt, correlation_id_base, trial, model_name, # just assume that the no-start is a continuation of # the no-end sequence instead of being a sequence # missing start flag. 
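Each stress thread owns a disjoint block of correlation IDs so sequences launched from different threads can never collide on the same server-side slot. A minimal sketch of that partitioning, with a placeholder block size and client count standing in for CORRELATION_ID_BLOCK_SIZE and common_cnt + rare_cnt:

    CORRELATION_ID_BLOCK_SIZE = 1024 * 1024  # placeholder for the test's constant

    def correlation_ids_for_thread(thread_idx, clients_per_thread):
        # Mirrors correlation_id_base = 1 + (idx * CORRELATION_ID_BLOCK_SIZE)
        # followed by correlation_id_base + c for each client the thread owns.
        base = 1 + thread_idx * CORRELATION_ID_BLOCK_SIZE
        return [base + c for c in range(clients_per_thread)]

    # thread 0 uses IDs starting at 1; thread 1 starts one full block later
    print(correlation_ids_for_thread(1, 4))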
- if ((last_choice != "no-end") and - (last_choice != "valid-no-end") and (choice < 0.01)): - sequence_no_start(client_metadata, - rng, - trial, - model_name, - dtype, - sequence_name=name) + if ( + (last_choice != "no-end") + and (last_choice != "valid-no-end") + and (choice < 0.01) + ): + sequence_no_start( + client_metadata, + rng, + trial, + model_name, + dtype, + sequence_name=name, + ) last_choices[client_idx] = "no-start" elif choice < 0.05: - sequence_no_end(client_metadata, - rng, - trial, - model_name, - dtype, - SEQUENCE_LENGTH_MEAN, - SEQUENCE_LENGTH_STDEV, - sequence_name=name) + sequence_no_end( + client_metadata, + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) last_choices[client_idx] = "no-end" elif choice < 0.10: - sequence_valid_no_end(client_metadata, - rng, - trial, - model_name, - dtype, - SEQUENCE_LENGTH_MEAN, - SEQUENCE_LENGTH_STDEV, - sequence_name=name) + sequence_valid_no_end( + client_metadata, + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) last_choices[client_idx] = "valid-no-end" elif choice < 0.15: - sequence_valid_valid(client_metadata, - rng, - trial, - model_name, - dtype, - SEQUENCE_LENGTH_MEAN, - SEQUENCE_LENGTH_STDEV, - sequence_name=name) + sequence_valid_valid( + client_metadata, + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) last_choices[client_idx] = "valid-valid" else: - sequence_valid(client_metadata, - rng, - trial, - model_name, - dtype, - SEQUENCE_LENGTH_MEAN, - SEQUENCE_LENGTH_STDEV, - sequence_name=name) + sequence_valid( + client_metadata, + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) last_choices[client_idx] = "valid" except Exception as ex: @@ -519,38 +556,40 @@ def stress_thread(name, seed, pass_cnt, correlation_id_base, trial, model_name, def check_status(model_name): - client = grpcclient.InferenceServerClient("localhost:8001", - verbose=FLAGS.verbose) + client = grpcclient.InferenceServerClient("localhost:8001", verbose=FLAGS.verbose) stats = client.get_inference_statistics(model_name) print(stats) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-r', - '--random-seed', - type=int, - required=False, - help='Random seed.') - parser.add_argument('-t', - '--concurrency', - type=int, - required=False, - default=8, - help='Request concurrency. Default is 8.') parser.add_argument( - '-i', - '--iterations', + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-r", "--random-seed", type=int, required=False, help="Random seed." + ) + parser.add_argument( + "-t", + "--concurrency", + type=int, + required=False, + default=8, + help="Request concurrency. Default is 8.", + ) + parser.add_argument( + "-i", + "--iterations", type=int, required=False, default=200, - help='Number of iterations of stress test to run. Default is 200.') + help="Number of iterations of stress test to run. Default is 200.", + ) FLAGS = parser.parse_args() # Initialize the random seed. 
For reproducibility each thread @@ -584,10 +623,19 @@ def check_status(model_name): correlation_id_base = 1 + (idx * CORRELATION_ID_BLOCK_SIZE) threads.append( - threading.Thread(target=stress_thread, - args=(thread_name, seed, FLAGS.iterations, - correlation_id_base, trial, model_name, - dtype))) + threading.Thread( + target=stress_thread, + args=( + thread_name, + seed, + FLAGS.iterations, + correlation_id_base, + trial, + model_name, + dtype, + ), + ) + ) for t in threads: t.start() diff --git a/qa/L0_server_status/server_status_test.py b/qa/L0_server_status/server_status_test.py old mode 100644 new mode 100755 index 93c94588df..7ab04708f0 --- a/qa/L0_server_status/server_status_test.py +++ b/qa/L0_server_status/server_status_test.py @@ -1,4 +1,6 @@ -# Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,10 +30,11 @@ sys.path.append("../common") -import numpy as np import os import unittest + import infer_util as iu +import numpy as np import test_util as tu import tritongrpcclient as grpcclient import tritonhttpclient as httpclient @@ -39,24 +42,29 @@ class ServerMetadataTest(tu.TestResultCollector): - def test_basic(self): try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: model_name = "graphdef_int32_int8_int8" extensions = [ - 'classification', 'sequence', 'model_repository', - 'schedule_policy', 'model_configuration', - 'system_shared_memory', 'cuda_shared_memory', - 'binary_tensor_data', 'statistics' + "classification", + "sequence", + "model_repository", + "schedule_policy", + "model_configuration", + "system_shared_memory", + "cuda_shared_memory", + "binary_tensor_data", + "statistics", ] if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) @@ -64,16 +72,18 @@ def test_basic(self): model_metadata = triton_client.get_model_metadata(model_name) if pair[1] == "http": - self.assertEqual(os.environ["TRITON_SERVER_VERSION"], - server_metadata['version']) - self.assertEqual("triton", server_metadata['name']) + self.assertEqual( + os.environ["TRITON_SERVER_VERSION"], server_metadata["version"] + ) + self.assertEqual("triton", server_metadata["name"]) for ext in extensions: - self.assertIn(ext, server_metadata['extensions']) + self.assertIn(ext, server_metadata["extensions"]) - self.assertEqual(model_name, model_metadata['name']) + self.assertEqual(model_name, model_metadata["name"]) else: - self.assertEqual(os.environ["TRITON_SERVER_VERSION"], - server_metadata.version) + self.assertEqual( + os.environ["TRITON_SERVER_VERSION"], server_metadata.version + ) self.assertEqual("triton", server_metadata.name) for ext in extensions: self.assertIn(ext, server_metadata.extensions) @@ -84,91 +94,96 @@ def test_basic(self): def test_unknown_model(self): try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: model_name = "foo" if pair[1] == "http": 
triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) server_metadata = triton_client.get_server_metadata() if pair[1] == "http": - self.assertEqual(os.environ["TRITON_SERVER_VERSION"], - server_metadata['version']) - self.assertEqual("triton", server_metadata['name']) + self.assertEqual( + os.environ["TRITON_SERVER_VERSION"], server_metadata["version"] + ) + self.assertEqual("triton", server_metadata["name"]) else: - self.assertEqual(os.environ["TRITON_SERVER_VERSION"], - server_metadata.version) + self.assertEqual( + os.environ["TRITON_SERVER_VERSION"], server_metadata.version + ) self.assertEqual("triton", server_metadata.name) model_metadata = triton_client.get_model_metadata(model_name) self.assertTrue(False, "expected unknown model failure") except InferenceServerException as ex: - self.assertTrue(ex.message().startswith( - "Request for unknown model: 'foo' is not found")) + self.assertTrue( + ex.message().startswith("Request for unknown model: 'foo' is not found") + ) def test_unknown_model_version(self): try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: model_name = "graphdef_int32_int8_int8" if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) model_metadata = triton_client.get_model_metadata( - model_name, model_version="99") + model_name, model_version="99" + ) self.assertTrue(False, "expected unknown model version failure") except InferenceServerException as ex: - self.assertTrue(ex.message().startswith( - "Request for unknown model: 'graphdef_int32_int8_int8' version 99 is not found" - )) + self.assertTrue( + ex.message().startswith( + "Request for unknown model: 'graphdef_int32_int8_int8' version 99 is not found" + ) + ) def test_model_latest_infer(self): input_size = 16 tensor_shape = (1, input_size) - platform_name = { - 'graphdef': 'tensorflow_graphdef', - 'onnx': 'onnxruntime_onnx' - } + platform_name = {"graphdef": "tensorflow_graphdef", "onnx": "onnxruntime_onnx"} # There are 3 versions of *_int32_int32_int32 and all # should be available. - for platform in ('graphdef', 'onnx'): + for platform in ("graphdef", "onnx"): model_name = platform + "_int32_int32_int32" # Initially there should be no version stats.. 
try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) - model_metadata = triton_client.get_model_metadata( - model_name) + model_metadata = triton_client.get_model_metadata(model_name) # verify all versions are reported when no model version is specified if pair[1] == "http": - self.assertEqual(model_name, model_metadata['name']) - self.assertEqual(len(model_metadata['versions']), 3) + self.assertEqual(model_name, model_metadata["name"]) + self.assertEqual(len(model_metadata["versions"]), 3) for v in (1, 2, 3): - self.assertIn(str(v), model_metadata['versions']) + self.assertIn(str(v), model_metadata["versions"]) else: self.assertEqual(model_name, model_metadata.name) self.assertEqual(len(model_metadata.versions), 3) @@ -177,9 +192,9 @@ def test_model_latest_infer(self): # verify contents of model metadata if pair[1] == "http": - model_platform = model_metadata['platform'] - model_inputs = model_metadata['inputs'] - model_outputs = model_metadata['outputs'] + model_platform = model_metadata["platform"] + model_inputs = model_metadata["inputs"] + model_outputs = model_metadata["outputs"] else: model_platform = model_metadata.platform model_inputs = model_metadata.inputs @@ -191,9 +206,9 @@ def test_model_latest_infer(self): for model_input in model_inputs: if pair[1] == "http": - input_dtype = model_input['datatype'] - input_shape = model_input['shape'] - input_name = model_input['name'] + input_dtype = model_input["datatype"] + input_shape = model_input["shape"] + input_name = model_input["name"] else: input_dtype = model_input.datatype input_shape = model_input.shape @@ -204,9 +219,9 @@ def test_model_latest_infer(self): for model_output in model_outputs: if pair[1] == "http": - output_dtype = model_output['datatype'] - output_shape = model_output['shape'] - output_name = model_output['name'] + output_dtype = model_output["datatype"] + output_shape = model_output["shape"] + output_name = model_output["name"] else: output_dtype = model_output.datatype output_shape = model_output.shape @@ -219,67 +234,79 @@ def test_model_latest_infer(self): self.assertTrue(False, "unexpected error {}".format(ex)) # Infer using latest version (which is 3)... 
- iu.infer_exact(self, - platform, - tensor_shape, - 1, - np.int32, - np.int32, - np.int32, - model_version=None, - swap=True) + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.int32, + np.int32, + np.int32, + model_version=None, + swap=True, + ) try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) for v in (1, 2, 3): self.assertTrue( - triton_client.is_model_ready(model_name, - model_version=str(v))) + triton_client.is_model_ready( + model_name, model_version=str(v) + ) + ) # Only version 3 should have infer stats - infer_stats = triton_client.get_inference_statistics( - model_name) + infer_stats = triton_client.get_inference_statistics(model_name) if pair[1] == "http": - stats = infer_stats['model_stats'] + stats = infer_stats["model_stats"] else: stats = infer_stats.model_stats self.assertEqual( - len(stats), 3, - "expected 3 infer stats for model " + model_name) + len(stats), 3, "expected 3 infer stats for model " + model_name + ) for s in stats: if pair[1] == "http": - v = s['version'] - stat = s['inference_stats'] + v = s["version"] + stat = s["inference_stats"] else: v = s.version stat = s.inference_stats if v == "3": if pair[1] == "http": - self.assertTrue(stat['success']['count'], 3) + self.assertTrue(stat["success"]["count"], 3) else: self.assertTrue(stat.success.count, 3) else: if pair[1] == "http": self.assertEqual( - stat['success']['count'], 0, + stat["success"]["count"], + 0, "unexpected infer success counts for version " - + str(v) + " of model " + model_name) + + str(v) + + " of model " + + model_name, + ) else: self.assertEqual( - stat.success.count, 0, + stat.success.count, + 0, "unexpected infer success counts for version " - + str(v) + " of model " + model_name) + + str(v) + + " of model " + + model_name, + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) @@ -289,136 +316,150 @@ def test_model_specific_infer(self): # There are 3 versions of *_float32_float32_float32 but only # versions 1 and 3 should be available. - for platform in ('graphdef', 'onnx', 'plan'): + for platform in ("graphdef", "onnx", "plan"): tensor_shape = (1, input_size) model_name = platform + "_float32_float32_float32" # Initially there should be no version status... 
try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) self.assertTrue( - triton_client.is_model_ready(model_name, - model_version="1")) + triton_client.is_model_ready(model_name, model_version="1") + ) self.assertFalse( - triton_client.is_model_ready(model_name, - model_version="2")) + triton_client.is_model_ready(model_name, model_version="2") + ) self.assertTrue( - triton_client.is_model_ready(model_name, - model_version="3")) + triton_client.is_model_ready(model_name, model_version="3") + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Infer using version 1... - iu.infer_exact(self, - platform, - tensor_shape, - 1, - np.float32, - np.float32, - np.float32, - model_version=1, - swap=False) + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + model_version=1, + swap=False, + ) try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) self.assertTrue( - triton_client.is_model_ready(model_name, - model_version="1")) + triton_client.is_model_ready(model_name, model_version="1") + ) self.assertFalse( - triton_client.is_model_ready(model_name, - model_version="2")) + triton_client.is_model_ready(model_name, model_version="2") + ) self.assertTrue( - triton_client.is_model_ready(model_name, - model_version="3")) + triton_client.is_model_ready(model_name, model_version="3") + ) # Only version 1 should have infer stats infer_stats = triton_client.get_inference_statistics( - model_name, model_version='1') + model_name, model_version="1" + ) if pair[1] == "http": self.assertEqual( - len(infer_stats['model_stats']), 1, + len(infer_stats["model_stats"]), + 1, "expected 1 infer stats for version 1" - " of model " + model_name) - stats = infer_stats['model_stats'][0]['inference_stats'] - self.assertTrue(stats['success']['count'], 3) + " of model " + model_name, + ) + stats = infer_stats["model_stats"][0]["inference_stats"] + self.assertTrue(stats["success"]["count"], 3) else: self.assertEqual( - len(infer_stats.model_stats), 1, + len(infer_stats.model_stats), + 1, "expected 1 infer stats for version 1" - " of model " + model_name) + " of model " + model_name, + ) stats = infer_stats.model_stats[0].inference_stats self.assertTrue(stats.success.count, 3) infer_stats = triton_client.get_inference_statistics( - model_name, model_version='3') + model_name, model_version="3" + ) if pair[1] == "http": - stats = infer_stats['model_stats'][0]['inference_stats'] + stats = infer_stats["model_stats"][0]["inference_stats"] self.assertEqual( - stats['success']['count'], 0, + stats["success"]["count"], + 0, "unexpected infer stats for version 3" - " 
of model " + model_name) + " of model " + model_name, + ) else: stats = infer_stats.model_stats[0].inference_stats self.assertEqual( - stats.success.count, 0, + stats.success.count, + 0, "unexpected infer stats for version 3" - " of model " + model_name) + " of model " + model_name, + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) class ModelMetadataTest(tu.TestResultCollector): - ''' + """ These tests must be run after the ServerMetadataTest. See test.sh file for correct test running. - ''' + """ def test_model_versions_deleted(self): # Originally There were 3 versions of *_int32_int32_int32 and # version 3 was executed once. Version 2 and 3 models were # deleted from the model repository so now only expect version 1 to # be ready and show stats. - for platform in ('graphdef', 'onnx'): + for platform in ("graphdef", "onnx"): model_name = platform + "_int32_int32_int32" try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) - model_metadata = triton_client.get_model_metadata( - model_name) + model_metadata = triton_client.get_model_metadata(model_name) if pair[1] == "http": - self.assertEqual(model_name, model_metadata['name']) - self.assertEqual(len(model_metadata['versions']), 1) - self.assertEqual("1", model_metadata['versions'][0]) + self.assertEqual(model_name, model_metadata["name"]) + self.assertEqual(len(model_metadata["versions"]), 1) + self.assertEqual("1", model_metadata["versions"][0]) else: self.assertEqual(model_name, model_metadata.name) self.assertEqual(len(model_metadata.versions), 1) @@ -429,30 +470,41 @@ def test_model_versions_deleted(self): if v == 1: self.assertTrue( triton_client.is_model_ready( - model_name, model_version=str(v))) + model_name, model_version=str(v) + ) + ) infer_stats = triton_client.get_inference_statistics( - model_name, model_version=str(v)) + model_name, model_version=str(v) + ) if pair[1] == "http": self.assertEqual( - len(infer_stats['model_stats']), 1, - "expected 1 infer stats for version " + - str(v) + " of model " + model_name) - stats = infer_stats['model_stats'][0][ - 'inference_stats'] - self.assertEqual(stats['success']['count'], 0) + len(infer_stats["model_stats"]), + 1, + "expected 1 infer stats for version " + + str(v) + + " of model " + + model_name, + ) + stats = infer_stats["model_stats"][0]["inference_stats"] + self.assertEqual(stats["success"]["count"], 0) else: self.assertEqual( - len(infer_stats.model_stats), 1, - "expected 1 infer stats for version " + - str(v) + " of model " + model_name) - stats = infer_stats.model_stats[ - 0].inference_stats + len(infer_stats.model_stats), + 1, + "expected 1 infer stats for version " + + str(v) + + " of model " + + model_name, + ) + stats = infer_stats.model_stats[0].inference_stats self.assertEqual(stats.success.count, 0) else: self.assertFalse( triton_client.is_model_ready( - model_name, model_version=str(v))) + model_name, model_version=str(v) + ) + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) @@ -461,40 +513,46 @@ def 
test_model_versions_added(self): # Originally There was version 1 of *_float16_float32_float32. # Version 7 was added so now expect just version 7 to be ready # and provide infer stats. - for platform in ('graphdef',): + for platform in ("graphdef",): model_name = platform + "_float16_float32_float32" try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) - model_metadata = triton_client.get_model_metadata( - model_name) + model_metadata = triton_client.get_model_metadata(model_name) if pair[1] == "http": self.assertEqual( - model_name, model_metadata['name'], - "expected status for model " + model_name) + model_name, + model_metadata["name"], + "expected status for model " + model_name, + ) self.assertEqual( - len(model_metadata['versions']), 1, - "expected status for 1 versions for model " + - model_name) - self.assertEqual("7", model_metadata['versions'][0]) + len(model_metadata["versions"]), + 1, + "expected status for 1 versions for model " + model_name, + ) + self.assertEqual("7", model_metadata["versions"][0]) else: self.assertEqual( - model_name, model_metadata.name, - "expected status for model " + model_name) + model_name, + model_metadata.name, + "expected status for model " + model_name, + ) self.assertEqual( - len(model_metadata.versions), 1, - "expected status for 1 versions for model " + - model_name) + len(model_metadata.versions), + 1, + "expected status for 1 versions for model " + model_name, + ) self.assertEqual("7", model_metadata.versions[0]) # Only version 7 should be ready and show infer stat. 
@@ -502,39 +560,52 @@ def test_model_versions_added(self): if v == 7: self.assertTrue( triton_client.is_model_ready( - model_name, model_version=str(v))) + model_name, model_version=str(v) + ) + ) infer_stats = triton_client.get_inference_statistics( - model_name, model_version=str(v)) + model_name, model_version=str(v) + ) if pair[1] == "http": - stats = infer_stats['model_stats'][0][ - 'inference_stats'] + stats = infer_stats["model_stats"][0]["inference_stats"] self.assertEqual( - stats['success']['count'], 0, - "unexpected infer stats for version " + - str(v) + " of model " + model_name) + stats["success"]["count"], + 0, + "unexpected infer stats for version " + + str(v) + + " of model " + + model_name, + ) else: - stats = infer_stats.model_stats[ - 0].inference_stats + stats = infer_stats.model_stats[0].inference_stats self.assertEqual( - stats.success.count, 0, - "unexpected infer stats for version " + - str(v) + " of model " + model_name) + stats.success.count, + 0, + "unexpected infer stats for version " + + str(v) + + " of model " + + model_name, + ) else: self.assertFalse( triton_client.is_model_ready( - model_name, model_version=str(v))) + model_name, model_version=str(v) + ) + ) try: infer_stats = triton_client.get_inference_statistics( - model_name, model_version=str(v)) + model_name, model_version=str(v) + ) self.assertTrue( False, - "unexpected infer stats for the model that is not ready" + "unexpected infer stats for the model that is not ready", ) except InferenceServerException as ex: self.assertIn( "requested model version is not available for model", - str(ex)) + str(ex), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) @@ -544,27 +615,27 @@ def test_infer_stats_no_model_version(self): # version 3 was executed once. Version 2 and 3 models were # deleted from the model repository so now only expect version 1 to # be ready and show infer stats. 
- for platform in ('graphdef', 'onnx'): + for platform in ("graphdef", "onnx"): model_name = platform + "_int32_int32_int32" try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) - model_metadata = triton_client.get_model_metadata( - model_name) + model_metadata = triton_client.get_model_metadata(model_name) if pair[1] == "http": - self.assertEqual(model_name, model_metadata['name']) - self.assertEqual(len(model_metadata['versions']), 1) - self.assertEqual("1", model_metadata['versions'][0]) + self.assertEqual(model_name, model_metadata["name"]) + self.assertEqual(len(model_metadata["versions"]), 1) + self.assertEqual("1", model_metadata["versions"][0]) else: self.assertEqual(model_name, model_metadata.name) self.assertEqual(len(model_metadata.versions), 1) @@ -575,44 +646,55 @@ def test_infer_stats_no_model_version(self): if v == 1: self.assertTrue( triton_client.is_model_ready( - model_name, model_version=str(v))) + model_name, model_version=str(v) + ) + ) else: self.assertFalse( triton_client.is_model_ready( - model_name, model_version=str(v))) + model_name, model_version=str(v) + ) + ) - infer_stats = triton_client.get_inference_statistics( - model_name) + infer_stats = triton_client.get_inference_statistics(model_name) if pair[1] == "http": - stats = infer_stats['model_stats'] + stats = infer_stats["model_stats"] else: stats = infer_stats.model_stats self.assertEqual( - len(stats), 1, - "expected 1 infer stats for model " + model_name) + len(stats), 1, "expected 1 infer stats for model " + model_name + ) if pair[1] == "http": - version = stats[0]['version'] - stat = stats[0]['inference_stats'] + version = stats[0]["version"] + stat = stats[0]["inference_stats"] else: version = stats[0].version stat = stats[0].inference_stats if version != "1": self.assertTrue( - False, - "expected version 1 for infer stat, got " + version) + False, "expected version 1 for infer stat, got " + version + ) else: if pair[1] == "http": self.assertEqual( - stat['success']['count'], 0, - "unexpected infer stats for version " + - str(version) + " of model " + model_name) + stat["success"]["count"], + 0, + "unexpected infer stats for version " + + str(version) + + " of model " + + model_name, + ) else: self.assertEqual( - stat.success.count, 0, - "unexpected infer stats for version " + - str(version) + " of model " + model_name) + stat.success.count, + 0, + "unexpected infer stats for version " + + str(version) + + " of model " + + model_name, + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) @@ -620,14 +702,15 @@ def test_infer_stats_no_model_version(self): def test_infer_stats_no_model(self): # Test get_inference_statistics when no model/model_version is passed. 
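Throughout these metadata and statistics tests the same value is read two ways: as a dict key when the client is HTTP (JSON responses) and as an attribute when it is gRPC (protobuf responses). A hedged helper, not part of the patch, that illustrates the duality the assertions keep branching on:

    def field(obj, name, protocol):
        # HTTP clients return JSON-decoded dicts; gRPC clients return protobuf
        # messages, so the same logical field is reached two different ways.
        return obj[name] if protocol == "http" else getattr(obj, name)

    # e.g. with stats = triton_client.get_inference_statistics(model_name):
    #   field(field(stats, "model_stats", protocol)[0], "version", protocol)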
try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) @@ -635,17 +718,18 @@ def test_infer_stats_no_model(self): # Returns infer stats for ALL models + ready versions infer_stats = triton_client.get_inference_statistics() if pair[1] == "http": - stats = infer_stats['model_stats'] + stats = infer_stats["model_stats"] else: stats = infer_stats.model_stats self.assertEqual( - len(stats), 219, - "expected 219 infer stats for all ready versions of all model" + len(stats), + 219, + "expected 219 infer stats for all ready versions of all model", ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_shared_memory/shared_memory_test.py b/qa/L0_shared_memory/shared_memory_test.py old mode 100644 new mode 100755 index d4207064bf..6350dc2abe --- a/qa/L0_shared_memory/shared_memory_test.py +++ b/qa/L0_shared_memory/shared_memory_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,9 +30,10 @@ sys.path.append("../common") -import numpy as np -import unittest import os +import unittest + +import numpy as np import test_util as tu import tritonclient.grpc as grpcclient import tritonclient.http as httpclient @@ -39,12 +42,12 @@ class SharedMemoryTest(tu.TestResultCollector): - def test_invalid_create_shm(self): # Raises error since tried to create invalid system shared memory region try: shm_op0_handle = shm.create_shared_memory_region( - "dummy_data", "/dummy_data", -1) + "dummy_data", "/dummy_data", -1 + ) shm.destroy_shared_memory_region(shm_op0_handle) except Exception as ex: self.assertTrue(str(ex) == "unable to initialize the size") @@ -55,12 +58,11 @@ def test_valid_create_set_register(self): triton_client = httpclient.InferenceServerClient(_url, verbose=True) else: triton_client = grpcclient.InferenceServerClient(_url, verbose=True) - shm_op0_handle = shm.create_shared_memory_region( - "dummy_data", "/dummy_data", 8) - shm.set_shared_memory_region(shm_op0_handle, - [np.array([1, 2], dtype=np.float32)]) - triton_client.register_system_shared_memory("dummy_data", "/dummy_data", - 8) + shm_op0_handle = shm.create_shared_memory_region("dummy_data", "/dummy_data", 8) + shm.set_shared_memory_region( + shm_op0_handle, [np.array([1, 2], dtype=np.float32)] + ) + triton_client.register_system_shared_memory("dummy_data", "/dummy_data", 8) shm_status = triton_client.get_system_shared_memory_status() if _protocol == "http": self.assertTrue(len(shm_status) == 1) @@ -74,8 +76,7 @@ def test_unregister_before_register(self): triton_client = httpclient.InferenceServerClient(_url, verbose=True) else: triton_client = grpcclient.InferenceServerClient(_url, verbose=True) - shm_op0_handle = shm.create_shared_memory_region( - "dummy_data", "/dummy_data", 8) + 
shm_op0_handle = shm.create_shared_memory_region("dummy_data", "/dummy_data", 8) triton_client.unregister_system_shared_memory("dummy_data") shm_status = triton_client.get_system_shared_memory_status() if _protocol == "http": @@ -90,10 +91,8 @@ def test_unregister_after_register(self): triton_client = httpclient.InferenceServerClient(_url, verbose=True) else: triton_client = grpcclient.InferenceServerClient(_url, verbose=True) - shm_op0_handle = shm.create_shared_memory_region( - "dummy_data", "/dummy_data", 8) - triton_client.register_system_shared_memory("dummy_data", "/dummy_data", - 8) + shm_op0_handle = shm.create_shared_memory_region("dummy_data", "/dummy_data", 8) + triton_client.register_system_shared_memory("dummy_data", "/dummy_data", 8) triton_client.unregister_system_shared_memory("dummy_data") shm_status = triton_client.get_system_shared_memory_status() if _protocol == "http": @@ -108,17 +107,14 @@ def test_reregister_after_register(self): triton_client = httpclient.InferenceServerClient(_url, verbose=True) else: triton_client = grpcclient.InferenceServerClient(_url, verbose=True) - shm_op0_handle = shm.create_shared_memory_region( - "dummy_data", "/dummy_data", 8) - triton_client.register_system_shared_memory("dummy_data", "/dummy_data", - 8) + shm_op0_handle = shm.create_shared_memory_region("dummy_data", "/dummy_data", 8) + triton_client.register_system_shared_memory("dummy_data", "/dummy_data", 8) try: - triton_client.register_system_shared_memory("dummy_data", - "/dummy_data", 8) + triton_client.register_system_shared_memory("dummy_data", "/dummy_data", 8) except Exception as ex: self.assertTrue( - "shared memory region 'dummy_data' already in manager" in str( - ex)) + "shared memory region 'dummy_data' already in manager" in str(ex) + ) shm_status = triton_client.get_system_shared_memory_status() if _protocol == "http": self.assertTrue(len(shm_status) == 1) @@ -128,13 +124,17 @@ def test_reregister_after_register(self): def _configure_sever(self): shm_ip0_handle = shm.create_shared_memory_region( - "input0_data", "/input0_data", 64) + "input0_data", "/input0_data", 64 + ) shm_ip1_handle = shm.create_shared_memory_region( - "input1_data", "/input1_data", 64) + "input1_data", "/input1_data", 64 + ) shm_op0_handle = shm.create_shared_memory_region( - "output0_data", "/output0_data", 64) + "output0_data", "/output0_data", 64 + ) shm_op1_handle = shm.create_shared_memory_region( - "output1_data", "/output1_data", 64) + "output1_data", "/output1_data", 64 + ) input0_data = np.arange(start=0, stop=16, dtype=np.int32) input1_data = np.ones(shape=16, dtype=np.int32) shm.set_shared_memory_region(shm_ip0_handle, [input0_data]) @@ -143,28 +143,26 @@ def _configure_sever(self): triton_client = httpclient.InferenceServerClient(_url, verbose=True) else: triton_client = grpcclient.InferenceServerClient(_url, verbose=True) - triton_client.register_system_shared_memory("input0_data", - "/input0_data", 64) - triton_client.register_system_shared_memory("input1_data", - "/input1_data", 64) - triton_client.register_system_shared_memory("output0_data", - "/output0_data", 64) - triton_client.register_system_shared_memory("output1_data", - "/output1_data", 64) + triton_client.register_system_shared_memory("input0_data", "/input0_data", 64) + triton_client.register_system_shared_memory("input1_data", "/input1_data", 64) + triton_client.register_system_shared_memory("output0_data", "/output0_data", 64) + triton_client.register_system_shared_memory("output1_data", "/output1_data", 64) return 
[shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle] def _cleanup_server(self, shm_handles): for shm_handle in shm_handles: shm.destroy_shared_memory_region(shm_handle) - def _basic_inference(self, - shm_ip0_handle, - shm_ip1_handle, - shm_op0_handle, - shm_op1_handle, - error_msg, - big_shm_name="", - big_shm_size=64): + def _basic_inference( + self, + shm_ip0_handle, + shm_ip1_handle, + shm_op0_handle, + shm_op1_handle, + error_msg, + big_shm_name="", + big_shm_size=64, + ): input0_data = np.arange(start=0, stop=16, dtype=np.int32) input1_data = np.ones(shape=16, dtype=np.int32) inputs = [] @@ -173,16 +171,16 @@ def _basic_inference(self, triton_client = httpclient.InferenceServerClient(_url, verbose=True) inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=True)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) + httpclient.InferRequestedOutput("OUTPUT1", binary_data=False) + ) else: triton_client = grpcclient.InferenceServerClient(_url, verbose=True) inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32")) inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32")) - outputs.append(grpcclient.InferRequestedOutput('OUTPUT0')) - outputs.append(grpcclient.InferRequestedOutput('OUTPUT1')) + outputs.append(grpcclient.InferRequestedOutput("OUTPUT0")) + outputs.append(grpcclient.InferRequestedOutput("OUTPUT1")) inputs[0].set_shared_memory("input0_data", 64) @@ -197,23 +195,24 @@ def _basic_inference(self, outputs[1].set_shared_memory("output1_data", 64) try: - results = triton_client.infer("simple", - inputs, - model_version="", - outputs=outputs) - output = results.get_output('OUTPUT0') + results = triton_client.infer( + "simple", inputs, model_version="", outputs=outputs + ) + output = results.get_output("OUTPUT0") if _protocol == "http": - output_datatype = output['datatype'] - output_shape = output['shape'] + output_datatype = output["datatype"] + output_shape = output["shape"] else: output_datatype = output.datatype output_shape = output.shape output_dtype = utils.triton_to_np_dtype(output_datatype) - output_data = shm.get_contents_as_numpy(shm_op0_handle, - output_dtype, output_shape) + output_data = shm.get_contents_as_numpy( + shm_op0_handle, output_dtype, output_shape + ) self.assertTrue( (output_data[0] == (input0_data + input1_data)).all(), - "Model output does not match expected output") + "Model output does not match expected output", + ) except Exception as ex: error_msg.append(str(ex)) @@ -221,8 +220,9 @@ def test_unregister_after_inference(self): # Unregister after inference error_msg = [] shm_handles = self._configure_sever() - self._basic_inference(shm_handles[0], shm_handles[1], shm_handles[2], - shm_handles[3], error_msg) + self._basic_inference( + shm_handles[0], shm_handles[1], shm_handles[2], shm_handles[3], error_msg + ) if len(error_msg) > 0: raise Exception(str(error_msg)) if _protocol == "http": @@ -245,14 +245,15 @@ def test_register_after_inference(self): triton_client = httpclient.InferenceServerClient(_url, verbose=True) else: triton_client = grpcclient.InferenceServerClient(_url, verbose=True) - self._basic_inference(shm_handles[0], shm_handles[1], shm_handles[2], - shm_handles[3], error_msg) + self._basic_inference( + shm_handles[0], shm_handles[1], shm_handles[2], 
shm_handles[3], error_msg + ) if len(error_msg) > 0: raise Exception(str(error_msg)) shm_ip2_handle = shm.create_shared_memory_region( - "input2_data", "/input2_data", 64) - triton_client.register_system_shared_memory("input2_data", - "/input2_data", 64) + "input2_data", "/input2_data", 64 + ) + triton_client.register_system_shared_memory("input2_data", "/input2_data", 64) shm_status = triton_client.get_system_shared_memory_status() if _protocol == "http": self.assertTrue(len(shm_status) == 5) @@ -266,19 +267,27 @@ def test_too_big_shm(self): error_msg = [] shm_handles = self._configure_sever() shm_ip2_handle = shm.create_shared_memory_region( - "input2_data", "/input2_data", 128) + "input2_data", "/input2_data", 128 + ) if _protocol == "http": triton_client = httpclient.InferenceServerClient(_url, verbose=True) else: triton_client = grpcclient.InferenceServerClient(_url, verbose=True) - triton_client.register_system_shared_memory("input2_data", - "/input2_data", 128) - self._basic_inference(shm_handles[0], shm_ip2_handle, shm_handles[2], - shm_handles[3], error_msg, "input2_data", 128) + triton_client.register_system_shared_memory("input2_data", "/input2_data", 128) + self._basic_inference( + shm_handles[0], + shm_ip2_handle, + shm_handles[2], + shm_handles[3], + error_msg, + "input2_data", + 128, + ) if len(error_msg) > 0: self.assertTrue( "unexpected total byte size 128 for input 'INPUT1', expecting 64" - in error_msg[-1]) + in error_msg[-1] + ) shm_handles.append(shm_ip2_handle) self._cleanup_server(shm_handles) @@ -287,8 +296,9 @@ def test_mixed_raw_shm(self): error_msg = [] shm_handles = self._configure_sever() input1_data = np.ones(shape=16, dtype=np.int32) - self._basic_inference(shm_handles[0], [input1_data], shm_handles[2], - shm_handles[3], error_msg) + self._basic_inference( + shm_handles[0], [input1_data], shm_handles[2], shm_handles[3], error_msg + ) if len(error_msg) > 0: raise Exception(error_msg[-1]) self._cleanup_server(shm_handles) @@ -314,8 +324,8 @@ def test_unregisterall(self): self._cleanup_server(shm_handles) -if __name__ == '__main__': - _protocol = os.environ.get('CLIENT_TYPE', "http") +if __name__ == "__main__": + _protocol = os.environ.get("CLIENT_TYPE", "http") if _protocol == "http": _url = "localhost:8000" else: diff --git a/qa/L0_shared_memory/test.sh b/qa/L0_shared_memory/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_simple_ensemble/ensemble_test.py b/qa/L0_simple_ensemble/ensemble_test.py old mode 100644 new mode 100755 index 514cef59b6..0b064c13e8 --- a/qa/L0_simple_ensemble/ensemble_test.py +++ b/qa/L0_simple_ensemble/ensemble_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -30,72 +32,77 @@ sys.path.append("../clients") import logging - import unittest -import numpy as np + import infer_util as iu +import numpy as np import test_util as tu import tritonhttpclient class EnsembleTest(tu.TestResultCollector): - def _get_infer_count_per_version(self, model_name): - triton_client = tritonhttpclient.InferenceServerClient("localhost:8000", - verbose=True) + triton_client = tritonhttpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) stats = triton_client.get_inference_statistics(model_name) self.assertEqual(len(stats["model_stats"]), 2) infer_count = [0, 0] for model_stat in stats["model_stats"]: - self.assertEqual(model_stat["name"], model_name, - "expected stats for model " + model_name) - model_version = model_stat['version'] + self.assertEqual( + model_stat["name"], model_name, "expected stats for model " + model_name + ) + model_version = model_stat["version"] if model_version == "1": - infer_count[0] = model_stat["inference_stats"]["success"][ - "count"] + infer_count[0] = model_stat["inference_stats"]["success"]["count"] elif model_version == "2": - infer_count[1] = model_stat["inference_stats"]["success"][ - "count"] + infer_count[1] = model_stat["inference_stats"]["success"]["count"] else: self.assertTrue( - False, "unexpected version {} for model {}".format( - model_version, model_name)) + False, + "unexpected version {} for model {}".format( + model_version, model_name + ), + ) return infer_count def test_ensemble_add_sub(self): for bs in (1, 8): - iu.infer_exact(self, "ensemble_add_sub", (bs, 16), bs, np.int32, - np.int32, np.int32) + iu.infer_exact( + self, "ensemble_add_sub", (bs, 16), bs, np.int32, np.int32, np.int32 + ) infer_count = self._get_infer_count_per_version("simple") # The two 'simple' versions should have the same infer count - if (infer_count[0] != infer_count[1]): + if infer_count[0] != infer_count[1]: self.assertTrue( - False, - "unexpeced different infer count for different 'simple' versions" + False, "unexpeced different infer count for different 'simple' versions" ) def test_ensemble_add_sub_one_output(self): for bs in (1, 8): - iu.infer_exact(self, - "ensemble_add_sub", (bs, 16), - bs, - np.int32, - np.int32, - np.int32, - outputs=("OUTPUT0",)) + iu.infer_exact( + self, + "ensemble_add_sub", + (bs, 16), + bs, + np.int32, + np.int32, + np.int32, + outputs=("OUTPUT0",), + ) infer_count = self._get_infer_count_per_version("simple") # Only 'simple' version 2 should have non-zero infer count # as it is in charge of producing OUTPUT0 - if (infer_count[0] != 0): + if infer_count[0] != 0: self.assertTrue( - False, "unexpeced non-zero infer count for 'simple' version 1") - elif (infer_count[1] == 0): - self.assertTrue( - False, "unexpeced zero infer count for 'simple' version 2") + False, "unexpeced non-zero infer count for 'simple' version 1" + ) + elif infer_count[1] == 0: + self.assertTrue(False, "unexpeced zero infer count for 'simple' version 2") -if __name__ == '__main__': +if __name__ == "__main__": logging.basicConfig(stream=sys.stderr) unittest.main() diff --git a/qa/L0_simple_nodejs_client/test.sh b/qa/L0_simple_nodejs_client/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_socket/test.sh b/qa/L0_socket/test.sh old mode 100644 new mode 100755 index 08852624d7..228eec3f2a --- a/qa/L0_socket/test.sh +++ b/qa/L0_socket/test.sh @@ -138,7 +138,7 @@ for address in default 
explicit; do kill $SERVER_PID wait $SERVER_PID - # error if http/grpc port overlaps with grpc/http explicit port + # error if http/grpc port overlaps with grpc/http explicit port if [ "$p" == "http" ]; then SERVER_ARGS="--model-repository=$DATADIR $SAME_EXPLICIT_ADDRESS --http-port 8003 --grpc-port 8003" run_server_nowait diff --git a/qa/L0_storage_S3_local/mock_s3_service.py b/qa/L0_storage_S3_local/mock_s3_service.py old mode 100644 new mode 100755 index b146cd8f3f..956aac0e66 --- a/qa/L0_storage_S3_local/mock_s3_service.py +++ b/qa/L0_storage_S3_local/mock_s3_service.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -24,13 +26,12 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import time import threading -from http.server import HTTPServer, BaseHTTPRequestHandler - +import time +from http.server import BaseHTTPRequestHandler, HTTPServer -class MockS3Service(): +class MockS3Service: __address = "localhost" __port = 8080 @@ -49,8 +50,10 @@ def __CheckHttp2Ads(self): v = self.headers["connection"].lower() if "upgrade" in v or "http2" in v: test_results["http2_ads"] = True - if "upgrade" in self.headers and "h2c" in self.headers[ - "upgrade"].lower(): + if ( + "upgrade" in self.headers + and "h2c" in self.headers["upgrade"].lower() + ): test_results["http2_ads"] = True if "http2-settings" in self.headers: test_results["http2_ads"] = True @@ -64,14 +67,15 @@ def do_HEAD(self): def do_GET(self): self.__CheckHttp2Ads() test_results["get_count"] += 1 - self.send_error(404, "Thank you for using the mock s3 service!", - "Your bucket is not found here!") + self.send_error( + 404, + "Thank you for using the mock s3 service!", + "Your bucket is not found here!", + ) self.__test_results = test_results - self.__server = HTTPServer((self.__address, self.__port), - RequestValidator) - self.__service_thread = threading.Thread( - target=self.__server.serve_forever) + self.__server = HTTPServer((self.__address, self.__port), RequestValidator) + self.__service_thread = threading.Thread(target=self.__server.serve_forever) def __enter__(self): self.__service_thread.start() @@ -82,12 +86,14 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.__service_thread.join() def TestPassed(self): - return self.__test_results["head_count"] > 0 and self.__test_results[ - "get_count"] > 0 and not self.__test_results["http2_ads"] + return ( + self.__test_results["head_count"] > 0 + and self.__test_results["get_count"] > 0 + and not self.__test_results["http2_ads"] + ) if __name__ == "__main__": - # Initialize mock service mock_s3_service = MockS3Service() diff --git a/qa/L0_storage_azure/test.sh b/qa/L0_storage_azure/test.sh index 9f67b1f272..9345671c84 100755 --- a/qa/L0_storage_azure/test.sh +++ b/qa/L0_storage_azure/test.sh @@ -218,7 +218,7 @@ for FW in ${AUTOCOMPLETE_BACKENDS}; do for model in ${FW}_float32_float32_float32 ${FW}_object_object_object; do cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/${model} models/ # Config files specify things expected by unit test like label_filename - # and max_batch_size for comparing results, so remove some key fields + # and max_batch_size for comparing results, so remove some key fields # for autocomplete to fill that won't break the unit test. 
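The MockS3Service above wires an HTTPServer into __enter__/__exit__ so a test can scope the fake endpoint (localhost:8080) to a with block and then check what traffic it saw. A hedged usage sketch; the import path and the elided body, where tritonserver would be pointed at the mock bucket, are assumptions:

    from mock_s3_service import MockS3Service  # assuming the module above is importable

    mock_s3 = MockS3Service()
    with mock_s3:
        # launch tritonserver here with an s3:// model repository resolving to
        # localhost:8080 and let it fail to find the (intentionally missing) bucket
        ...
    # both HEAD and GET must have been observed, with no HTTP/2 advertisement
    assert mock_s3.TestPassed()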
sed -i '/platform:/d' models/${model}/config.pbtxt sed -i '/data_type:/d' models/${model}/config.pbtxt diff --git a/qa/L0_storage_swiftstack/infer_test.py b/qa/L0_storage_swiftstack/infer_test.py old mode 100644 new mode 100755 index 5e1b3704ff..f8a65a01a4 --- a/qa/L0_storage_swiftstack/infer_test.py +++ b/qa/L0_storage_swiftstack/infer_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,133 +31,177 @@ sys.path.append("../common") import unittest -import numpy as np + import infer_util as iu +import numpy as np import test_util as tu class InferTest(tu.TestResultCollector): - - def _full_exact(self, input_dtype, output0_dtype, output1_dtype, - output0_raw, output1_raw, swap): - - def _infer_exact_helper(tester, - pf, - tensor_shape, - batch_size, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=True, - output1_raw=True, - model_version=None, - swap=False, - outputs=("OUTPUT0", "OUTPUT1"), - use_http=True, - use_grpc=True, - skip_request_id_check=False, - use_streaming=True, - correlation_id=0): + def _full_exact( + self, input_dtype, output0_dtype, output1_dtype, output0_raw, output1_raw, swap + ): + def _infer_exact_helper( + tester, + pf, + tensor_shape, + batch_size, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=True, + output1_raw=True, + model_version=None, + swap=False, + outputs=("OUTPUT0", "OUTPUT1"), + use_http=True, + use_grpc=True, + skip_request_id_check=False, + use_streaming=True, + correlation_id=0, + ): for bs in (1, batch_size): - iu.infer_exact(tester, - pf, (bs,) + tensor_shape, - bs, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - model_version=model_version, - swap=swap, - outputs=outputs, - use_http=use_http, - use_grpc=use_grpc, - skip_request_id_check=skip_request_id_check, - use_streaming=use_streaming, - correlation_id=correlation_id) + iu.infer_exact( + tester, + pf, + (bs,) + tensor_shape, + bs, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + model_version=model_version, + swap=swap, + outputs=outputs, + use_http=use_http, + use_grpc=use_grpc, + skip_request_id_check=skip_request_id_check, + use_streaming=use_streaming, + correlation_id=correlation_id, + ) input_size = 16 - if tu.validate_for_tf_model(input_dtype, output0_dtype, output1_dtype, - (input_size,), (input_size,), - (input_size,)): + if tu.validate_for_tf_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size,), + (input_size,), + (input_size,), + ): for pf in ["graphdef", "savedmodel"]: - _infer_exact_helper(self, - pf, (input_size,), - 8, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - swap=swap) - - if tu.validate_for_trt_model(input_dtype, output0_dtype, output1_dtype, - (input_size, 1, 1), (input_size, 1, 1), - (input_size, 1, 1)): + _infer_exact_helper( + self, + pf, + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + if tu.validate_for_trt_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size, 1, 1), + (input_size, 1, 1), + (input_size, 1, 1), + ): if input_dtype == np.int8: 
- _infer_exact_helper(self, - 'plan', (input_size, 1, 1), - 8, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - swap=swap) + _infer_exact_helper( + self, + "plan", + (input_size, 1, 1), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) else: - _infer_exact_helper(self, - 'plan', (input_size,), - 8, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - swap=swap) - - if tu.validate_for_onnx_model(input_dtype, output0_dtype, output1_dtype, - (input_size,), (input_size,), - (input_size,)): - _infer_exact_helper(self, - 'onnx', (input_size,), - 8, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - swap=swap) - - if tu.validate_for_libtorch_model(input_dtype, output0_dtype, - output1_dtype, (input_size,), - (input_size,), (input_size,)): - _infer_exact_helper(self, - 'libtorch', (input_size,), - 8, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - swap=swap) + _infer_exact_helper( + self, + "plan", + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + if tu.validate_for_onnx_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size,), + (input_size,), + (input_size,), + ): + _infer_exact_helper( + self, + "onnx", + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + if tu.validate_for_libtorch_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size,), + (input_size,), + (input_size,), + ): + _infer_exact_helper( + self, + "libtorch", + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) def test_raw_fff(self): - self._full_exact(np.float32, - np.float32, - np.float32, - output0_raw=True, - output1_raw=True, - swap=True) + self._full_exact( + np.float32, + np.float32, + np.float32, + output0_raw=True, + output1_raw=True, + swap=True, + ) def test_class_fff(self): - self._full_exact(np.float32, - np.float32, - np.float32, - output0_raw=False, - output1_raw=False, - swap=True) + self._full_exact( + np.float32, + np.float32, + np.float32, + output0_raw=False, + output1_raw=False, + swap=True, + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_string_io/string_client_test.py b/qa/L0_string_io/string_client_test.py old mode 100644 new mode 100755 index aabcd7f111..16112ac70c --- a/qa/L0_string_io/string_client_test.py +++ b/qa/L0_string_io/string_client_test.py @@ -27,25 +27,25 @@ import sys -sys.path.append('../common') +sys.path.append("../common") -import numpy as np +import unittest from builtins import range -import tritonclient.http as tritonhttpclient + +import numpy as np +import test_util as tu import tritonclient.grpc as tritongrpcclient +import tritonclient.http as tritonhttpclient import tritonclient.utils as tritonutils -import unittest -import test_util as tu class ClientStringTest(tu.TestResultCollector): - def _test_infer_unicode(self, model_name, client, input_): # Send inference request to the inference server. Get results for # both output tensors. 
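The string I/O tests below exercise BYTES tensors through both client libraries. As a hedged, HTTP-only sketch of how such an input is built and sent (the model name "simple_string" is a placeholder; the tensor names INPUT0/OUTPUT0 follow the test):

    import numpy as np
    import tritonclient.http as tritonhttpclient

    client = tritonhttpclient.InferenceServerClient("localhost:8000")
    # BYTES tensors travel as numpy object arrays whose elements are bytes/str
    in0 = np.array([[b"hello"], [b"world"]], dtype=np.object_)
    inp = tritonhttpclient.InferInput("INPUT0", list(in0.shape), "BYTES")
    inp.set_data_from_numpy(in0, binary_data=True)
    out = tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True)
    result = client.infer(model_name="simple_string", inputs=[inp], outputs=[out])
    print(result.as_numpy("OUTPUT0"))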
inputs = [] outputs = [] - inputs.append(client[1].InferInput('INPUT0', input_.shape, "BYTES")) + inputs.append(client[1].InferInput("INPUT0", input_.shape, "BYTES")) if client[1] == tritonhttpclient: inputs[0].set_data_from_numpy(input_, client[3]) @@ -53,31 +53,26 @@ def _test_infer_unicode(self, model_name, client, input_): inputs[0].set_data_from_numpy(input_) if client[1] == tritonhttpclient: - outputs.append(client[1].InferRequestedOutput( - 'OUTPUT0', binary_data=client[2])) + outputs.append( + client[1].InferRequestedOutput("OUTPUT0", binary_data=client[2]) + ) else: - outputs.append(client[1].InferRequestedOutput('OUTPUT0')) + outputs.append(client[1].InferRequestedOutput("OUTPUT0")) - results = client[0].infer(model_name=model_name, - inputs=inputs, - outputs=outputs) + results = client[0].infer(model_name=model_name, inputs=inputs, outputs=outputs) - out0 = results.as_numpy('OUTPUT0') + out0 = results.as_numpy("OUTPUT0") # We expect there to be 1 results (with batch-size 1). Verify # that all 8 result elements are the same as the input. self.assertTrue(np.array_equal(input_, out0)) return out0 - def _test_infer_non_unicode(self, - model_name, - client, - input_, - binary_data=True): + def _test_infer_non_unicode(self, model_name, client, input_, binary_data=True): # Send inference request to the inference server. Get results for # both output tensors. inputs = [] outputs = [] - inputs.append(client[1].InferInput('INPUT0', input_.shape, "BYTES")) + inputs.append(client[1].InferInput("INPUT0", input_.shape, "BYTES")) if client[1] == tritonhttpclient: inputs[0].set_data_from_numpy(input_, client[3]) @@ -85,57 +80,58 @@ def _test_infer_non_unicode(self, inputs[0].set_data_from_numpy(input_) if client[1] == tritonhttpclient: - outputs.append(client[1].InferRequestedOutput( - 'OUTPUT0', binary_data=client[2])) + outputs.append( + client[1].InferRequestedOutput("OUTPUT0", binary_data=client[2]) + ) else: - outputs.append(client[1].InferRequestedOutput('OUTPUT0')) + outputs.append(client[1].InferRequestedOutput("OUTPUT0")) - results = client[0].infer(model_name=model_name, - inputs=inputs, - outputs=outputs) + results = client[0].infer(model_name=model_name, inputs=inputs, outputs=outputs) - out0 = results.as_numpy('OUTPUT0') + out0 = results.as_numpy("OUTPUT0") # We expect there to be 1 results (with batch-size 1). Verify # that all 8 result elements are the same as the input. if client[2]: self.assertTrue(np.array_equal(input_.astype(np.bytes_), out0)) else: self.assertTrue( - np.array_equal(input_.astype(np.bytes_), - out0.astype(np.bytes_))) + np.array_equal(input_.astype(np.bytes_), out0.astype(np.bytes_)) + ) return out0 - def _test_unicode_bytes_dtype(self, client, model_name, dtype='|S78'): + def _test_unicode_bytes_dtype(self, client, model_name, dtype="|S78"): # Create the data for the input tensor. Initialize the tensor to 8 # byte strings. 
(dtype of np.bytes_) # Sample string that should no longer cause failure - in0 = np.array([ - [ - b'\nF\n\'\n\x01a\x12"\x1a \n\x1e\xfa\x03\x94\x01\x0f\xd7\x02\xf1\x05\xdf\x01\x82\x03\xb5\x05\xc1\x07\xba\x06\xff\x06\xc7\x07L\xf5\x03\xe2\x07\xa9\x03\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04\xdf\\\xcb\xbf' - ], + in0 = np.array( [ - b'\n:\n\x1a\n\x01a\x12\x15\x1a\x13\n\x11*\xe3\x05\xc5\x06\xda\x07\xcb\x06~\xb1\x05\xb3\x01\xa9\x02\x15\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\xbb[\n\xbf' + [ + b"\nF\n'\n\x01a\x12\"\x1a \n\x1e\xfa\x03\x94\x01\x0f\xd7\x02\xf1\x05\xdf\x01\x82\x03\xb5\x05\xc1\x07\xba\x06\xff\x06\xc7\x07L\xf5\x03\xe2\x07\xa9\x03\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04\xdf\\\xcb\xbf" + ], + [ + b"\n:\n\x1a\n\x01a\x12\x15\x1a\x13\n\x11*\xe3\x05\xc5\x06\xda\x07\xcb\x06~\xb1\x05\xb3\x01\xa9\x02\x15\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\xbb[\n\xbf" + ], + [ + b"\nL\n-\n\x01a\x12(\x1a&\n$\x87\x07\xce\x01\xe7\x06\xee\x04\xe1\x03\xf1\x03\xd7\x07\xbe\x02\xb8\x05\xe0\x05\xe4\x01\x88\x06\xb6\x03\xb9\x05\x83\x06\xf8\x04\xe2\x04\xf4\x06\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04\xbc\x99+@" + ], + [ + b"\n2\n\x12\n\x01a\x12\r\x1a\x0b\n\t\x99\x02\xde\x04\x9f\x04\xc5\x053\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\x12\x07\x83\xbe" + ], + [ + b"\nJ\n\r\n\x01b\x12\x08\x1a\x06\n\x04\x9b\x94\xad\x04\n\r\n\x01c\x12\x08\x12\x06\n\x04\xc3\x8a\x08\xbf\n*\n\x01a\x12%\x1a#\n!\x9c\x02\xb2\x02\xcd\x02\x9d\x07\x8d\x01\xb6\x05a\xf1\x01\xf0\x05\xdb\x02\xac\x04\xbd\x05\xe0\x04\xd2\x06\xaf\x02\xa8\x01\x8b\x04" + ], + [ + b"\n3\n\x13\n\x01a\x12\x0e\x1a\x0c\n\n<\xe2\x05\x8a\x01\xb3\x07?\xfd\x01\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\x1b\x931\xbf\x00\x00" + ], + [ + b"\n&\n\x07\n\x01a\x12\x02\x1a\x00\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04{\xbc\x0e>\x00\x00\x00" + ], + [ + b"\nF\n'\n\x01a\x12\"\x1a \n\x1e\x97\x01\x93\x02\x9e\x01\xac\x06\xff\x01\xd8\x05\xe1\x07\xd8\x04g]\x9a\x05\xff\x06\xde\x07\x8f\x04\x97\x04\xda\x03\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x9a\xb7I\n\r\n\x01c\x12\x08\x12\x06\n\x04\xfb\x87\x83\xbf" + ], ], - [ - b'\nL\n-\n\x01a\x12(\x1a&\n$\x87\x07\xce\x01\xe7\x06\xee\x04\xe1\x03\xf1\x03\xd7\x07\xbe\x02\xb8\x05\xe0\x05\xe4\x01\x88\x06\xb6\x03\xb9\x05\x83\x06\xf8\x04\xe2\x04\xf4\x06\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04\xbc\x99+@' - ], - [ - b'\n2\n\x12\n\x01a\x12\r\x1a\x0b\n\t\x99\x02\xde\x04\x9f\x04\xc5\x053\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\x12\x07\x83\xbe' - ], - [ - b'\nJ\n\r\n\x01b\x12\x08\x1a\x06\n\x04\x9b\x94\xad\x04\n\r\n\x01c\x12\x08\x12\x06\n\x04\xc3\x8a\x08\xbf\n*\n\x01a\x12%\x1a#\n!\x9c\x02\xb2\x02\xcd\x02\x9d\x07\x8d\x01\xb6\x05a\xf1\x01\xf0\x05\xdb\x02\xac\x04\xbd\x05\xe0\x04\xd2\x06\xaf\x02\xa8\x01\x8b\x04' - ], - [ - b'\n3\n\x13\n\x01a\x12\x0e\x1a\x0c\n\n<\xe2\x05\x8a\x01\xb3\x07?\xfd\x01\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\x1b\x931\xbf\x00\x00' - ], - [ - b'\n&\n\x07\n\x01a\x12\x02\x1a\x00\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04{\xbc\x0e>\x00\x00\x00' - ], - [ - b'\nF\n\'\n\x01a\x12"\x1a 
\n\x1e\x97\x01\x93\x02\x9e\x01\xac\x06\xff\x01\xd8\x05\xe1\x07\xd8\x04g]\x9a\x05\xff\x06\xde\x07\x8f\x04\x97\x04\xda\x03\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x9a\xb7I\n\r\n\x01c\x12\x08\x12\x06\n\x04\xfb\x87\x83\xbf' - ] - ], - dtype=dtype).flatten() + dtype=dtype, + ).flatten() self._test_infer_unicode(model_name, client, in0) def _test_str_dtype(self, client, model_name, dtype=np.object_): @@ -151,25 +147,39 @@ def _test_bytes(self, model_name): # This clients will fail for binary_data=False when the binary input # is not UTF-8 encodable. They should work for other cases however. binary_false_clients = [ - (tritonhttpclient.InferenceServerClient("localhost:8000", - verbose=True), - tritonhttpclient, True, False), - (tritonhttpclient.InferenceServerClient("localhost:8000", - verbose=True), - tritonhttpclient, False, False), - (tritonhttpclient.InferenceServerClient("localhost:8000", - verbose=True), - tritonhttpclient, False, True), + ( + tritonhttpclient.InferenceServerClient("localhost:8000", verbose=True), + tritonhttpclient, + True, + False, + ), + ( + tritonhttpclient.InferenceServerClient("localhost:8000", verbose=True), + tritonhttpclient, + False, + False, + ), + ( + tritonhttpclient.InferenceServerClient("localhost:8000", verbose=True), + tritonhttpclient, + False, + True, + ), ] # These clients work for every data type other_clients = [ - (tritongrpcclient.InferenceServerClient("localhost:8001", - verbose=True), - tritongrpcclient, False), - (tritonhttpclient.InferenceServerClient("localhost:8000", - verbose=True), - tritonhttpclient, True, True), + ( + tritongrpcclient.InferenceServerClient("localhost:8001", verbose=True), + tritongrpcclient, + False, + ), + ( + tritonhttpclient.InferenceServerClient("localhost:8000", verbose=True), + tritonhttpclient, + True, + True, + ), ] for client in other_clients + binary_false_clients: @@ -194,5 +204,5 @@ def test_tf_unicode_bytes(self): self._test_bytes("string_identity") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_tf_gpu_io/tf_gpu_io_test.py b/qa/L0_tf_gpu_io/tf_gpu_io_test.py old mode 100644 new mode 100755 index 23cdb5252f..fd3550e434 --- a/qa/L0_tf_gpu_io/tf_gpu_io_test.py +++ b/qa/L0_tf_gpu_io/tf_gpu_io_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,30 +31,35 @@ sys.path.append("../common") import unittest -import numpy as np + import infer_util as iu +import numpy as np import test_util as tu TENSOR_SIZE = 16384 class TfGpuIoTest(tu.TestResultCollector): - - def _test_helper(self, - model_name, - shape, - override_input_names=[], - override_output_names=[], - batching_enabled=False): + def _test_helper( + self, + model_name, + shape, + override_input_names=[], + override_output_names=[], + batching_enabled=False, + ): try: bs = 1 if batching_enabled: - shape = [[ - bs, - ] + shape] + shape = [ + [ + bs, + ] + + shape + ] iu.infer_zero( self, - 'graphdef', + "graphdef", bs, np.float32, shape, @@ -66,26 +73,33 @@ def _test_helper(self, self.assertTrue(False, "unexpected error {}".format(ex)) def test_sig_tag0(self): - self._test_helper("sig_tag0", [16], - override_input_names=["INPUT"], - override_output_names=["OUTPUT"]) + self._test_helper( + "sig_tag0", + [16], + override_input_names=["INPUT"], + override_output_names=["OUTPUT"], + ) def test_graphdef_zero_1_float32_def(self): - self._test_helper("graphdef_zero_1_float32_def", [TENSOR_SIZE], - batching_enabled=True) + self._test_helper( + "graphdef_zero_1_float32_def", [TENSOR_SIZE], batching_enabled=True + ) def test_graphdef_zero_1_float32_gpu(self): - self._test_helper("graphdef_zero_1_float32_gpu", [TENSOR_SIZE], - batching_enabled=True) + self._test_helper( + "graphdef_zero_1_float32_gpu", [TENSOR_SIZE], batching_enabled=True + ) def test_savedmodel_zero_1_float32_def(self): - self._test_helper("savedmodel_zero_1_float32_def", [TENSOR_SIZE], - batching_enabled=True) + self._test_helper( + "savedmodel_zero_1_float32_def", [TENSOR_SIZE], batching_enabled=True + ) def test_savedmodel_zero_1_float32_gpu(self): - self._test_helper("savedmodel_zero_1_float32_gpu", [TENSOR_SIZE], - batching_enabled=True) + self._test_helper( + "savedmodel_zero_1_float32_gpu", [TENSOR_SIZE], batching_enabled=True + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_tf_parameters/test.sh b/qa/L0_tf_parameters/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_tf_parameters/tf_parameter_test.py b/qa/L0_tf_parameters/tf_parameter_test.py old mode 100644 new mode 100755 index 4cdd8aa045..f1a4621d93 --- a/qa/L0_tf_parameters/tf_parameter_test.py +++ b/qa/L0_tf_parameters/tf_parameter_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -25,53 +27,55 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import sys -sys.path.append('../common') +sys.path.append("../common") + +import unittest + +import numpy as np import test_util as tu import tritonclient.http as tritonhttpclient import tritonclient.utils -import numpy as np -import unittest class TFParameterTest(tu.TestResultCollector): - def setUp(self): - self._client = tritonhttpclient.InferenceServerClient("localhost:8000", - verbose=True) + self._client = tritonhttpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) def _infer_helper(self): # The model has a single variable which is added to the input. Since the # variable is initialized to zero the input and output must match. 
- model_name = 'graphdef_variable' + model_name = "graphdef_variable" input = np.array([10], dtype=np.int32) inputs = [] - inputs.append(tritonhttpclient.InferInput('INPUT', input.shape, - 'INT32')) + inputs.append(tritonhttpclient.InferInput("INPUT", input.shape, "INT32")) inputs[-1].set_data_from_numpy(input) outputs = [] - outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT')) + outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT")) - results = self._client.infer(model_name=model_name, - inputs=inputs, - outputs=outputs) - output = results.as_numpy('OUTPUT') + results = self._client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) + output = results.as_numpy("OUTPUT") np.testing.assert_array_equal(output, input) def test_tf_variable(self): self._infer_helper() def test_tf_variable_error(self): - with self.assertRaises( - tritonclient.utils.InferenceServerException) as e: + with self.assertRaises(tritonclient.utils.InferenceServerException) as e: self._infer_helper() self.assertIn( - "FAILED_PRECONDITION: Could not find variable VARIABLE. This " + - "could mean that the variable has been deleted. In TF1, it can " + - "also mean the variable is uninitialized.", e.exception.message()) + "FAILED_PRECONDITION: Could not find variable VARIABLE. This " + + "could mean that the variable has been deleted. In TF1, it can " + + "also mean the variable is uninitialized.", + e.exception.message(), + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_tf_tag_sigdef/tf_tag_sigdef_test.py b/qa/L0_tf_tag_sigdef/tf_tag_sigdef_test.py old mode 100644 new mode 100755 index f4dcc5bdba..b4a11ac04e --- a/qa/L0_tf_tag_sigdef/tf_tag_sigdef_test.py +++ b/qa/L0_tf_tag_sigdef/tf_tag_sigdef_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -29,6 +31,7 @@ sys.path.append("../common") import unittest + import numpy as np import test_util as tu import tritonhttpclient as httpclient @@ -49,16 +52,14 @@ def _test_helper(self, modelVersion, tag, sig_def): # for details multiplier = modelVersion + 1 output_name = "OUTPUT" - triton_client = httpclient.InferenceServerClient("localhost:8000", - verbose=True) + triton_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT', shape, "FP32")) + inputs.append(httpclient.InferInput("INPUT", shape, "FP32")) input_data = np.ones(shape=shape).astype(np.float32) inputs[0].set_data_from_numpy(input_data, binary_data=True) - outputs.append( - httpclient.InferRequestedOutput(output_name, binary_data=True)) + outputs.append(httpclient.InferRequestedOutput(output_name, binary_data=True)) results = triton_client.infer(model_name, inputs, outputs=outputs) output_data = results.as_numpy(output_name) test_output = input_data * multiplier @@ -77,5 +78,5 @@ def test_tag_sig_def(self): self._test_helper(3, self.test_tag, self.test_sig_def) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_tf_unknown_rank/test.sh b/qa/L0_tf_unknown_rank/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_tf_unknown_rank/tf_unknown_rank_test.py b/qa/L0_tf_unknown_rank/tf_unknown_rank_test.py old mode 100644 new mode 100755 index 66297d671d..add6b32c13 --- a/qa/L0_tf_unknown_rank/tf_unknown_rank_test.py +++ b/qa/L0_tf_unknown_rank/tf_unknown_rank_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,6 +31,7 @@ sys.path.append("../common") import unittest + import numpy as np import test_util as tu import tritonhttpclient @@ -40,18 +43,19 @@ class UnknownRankTest(tu.TestResultCollector): def infer_unknown(self, model_name, tensor_shape): print("About to run the test") input_data = np.random.random_sample(tensor_shape).astype(np.float32) - client = tritonhttpclient.InferenceServerClient('localhost:8000') + client = tritonhttpclient.InferenceServerClient("localhost:8000") inputs = [ - tritonhttpclient.InferInput("INPUT", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + tritonhttpclient.InferInput( + "INPUT", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) results = client.infer(model_name, inputs) - self.assertTrue(np.array_equal(results.as_numpy('OUTPUT'), input_data)) + self.assertTrue(np.array_equal(results.as_numpy("OUTPUT"), input_data)) def test_success(self): model_name = "unknown_rank_success" - tensor_shape = (1) + tensor_shape = 1 try: self.infer_unknown(model_name, tensor_shape) except InferenceServerException as ex: @@ -63,15 +67,16 @@ def test_wrong_input(self): try: self.infer_unknown(model_name, tensor_shape) self.fail( - "Found success when expected failure with model given " \ + "Found success when expected failure with model given " "wrong input tensor [1,2] for input [-1,1]." ) except InferenceServerException as ex: self.assertIn( - "unexpected shape for input \'INPUT\' for model " \ - "\'unknown_rank_wrong_output\'. 
Expected [1], got [1,2]", - ex.message()) + "unexpected shape for input 'INPUT' for model " + "'unknown_rank_wrong_output'. Expected [1], got [1,2]", + ex.message(), + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_tftrt_optimization/tftrt_optimization_test.py b/qa/L0_tftrt_optimization/tftrt_optimization_test.py old mode 100644 new mode 100755 index 9129d8d87d..9e59677317 --- a/qa/L0_tftrt_optimization/tftrt_optimization_test.py +++ b/qa/L0_tftrt_optimization/tftrt_optimization_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,47 +31,45 @@ sys.path.append("../common") import unittest + import numpy as np import test_util as tu import tritonhttpclient as httpclient class TFTRTOptimizationTest(tu.TestResultCollector): - def setUp(self): - self.input0_ = np.arange(start=0, stop=16, - dtype=np.float32).reshape(1, 16) + self.input0_ = np.arange(start=0, stop=16, dtype=np.float32).reshape(1, 16) self.input1_ = np.ones(shape=16, dtype=np.float32).reshape(1, 16) self.expected_output0_ = self.input0_ + self.input1_ self.expected_output1_ = self.input0_ - self.input1_ def _addsub_infer(self, model_name): - triton_client = httpclient.InferenceServerClient("localhost:8000", - verbose=True) + triton_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "FP32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "FP32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "FP32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "FP32")) # Initialize the data inputs[0].set_data_from_numpy(self.input0_, binary_data=True) inputs[1].set_data_from_numpy(self.input1_, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=True)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=True)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=True)) results = triton_client.infer(model_name, inputs, outputs=outputs) - output0_data = results.as_numpy('OUTPUT0') - output1_data = results.as_numpy('OUTPUT1') + output0_data = results.as_numpy("OUTPUT0") + output1_data = results.as_numpy("OUTPUT1") - self.assertTrue(np.array_equal(self.expected_output0_, output0_data), - "incorrect sum") - self.assertTrue(np.array_equal(self.expected_output1_, output1_data), - "incorrect difference") + self.assertTrue( + np.array_equal(self.expected_output0_, output0_data), "incorrect sum" + ) + self.assertTrue( + np.array_equal(self.expected_output1_, output1_data), "incorrect difference" + ) def test_graphdef(self): self._addsub_infer("graphdef_float32_float32_float32_trt") @@ -80,5 +80,5 @@ def test_savedmodel(self): self._addsub_infer("savedmodel_float32_float32_float32_param") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_trace/test.sh b/qa/L0_trace/test.sh index cf81a1a1ec..aeaa96e367 100755 --- a/qa/L0_trace/test.sh +++ b/qa/L0_trace/test.sh @@ -722,4 +722,4 @@ set -e kill $SERVER_PID wait $SERVER_PID -exit $RET +exit $RET \ No newline at end of file diff --git a/qa/L0_trace/trace_endpoint_test.py b/qa/L0_trace/trace_endpoint_test.py old mode 100644 new mode 100755 index 8f9c482656..a962da5d4c --- 
a/qa/L0_trace/trace_endpoint_test.py +++ b/qa/L0_trace/trace_endpoint_test.py @@ -30,18 +30,18 @@ sys.path.append("../common") +import json import sys import unittest -import tritonclient.http as httpclient + +import test_util as tu import tritonclient.grpc as grpcclient -import json +import tritonclient.http as httpclient from google.protobuf import json_format -import test_util as tu # Similar set up as dynamic batcher tests class TraceEndpointTest(tu.TestResultCollector): - def tearDown(self): # Clear all trace settings to initial state. # Note that the tearDown function uses HTTP client so the pass/fail @@ -53,13 +53,13 @@ def tearDown(self): "trace_level": None, "trace_rate": None, "trace_count": None, - "log_frequency": None + "log_frequency": None, } triton_client = httpclient.InferenceServerClient("localhost:8000") - triton_client.update_trace_settings(model_name="simple", - settings=clear_settings) - triton_client.update_trace_settings(model_name=None, - settings=clear_settings) + triton_client.update_trace_settings( + model_name="simple", settings=clear_settings + ) + triton_client.update_trace_settings(model_name=None, settings=clear_settings) def check_server_initial_state(self): # Helper function to make sure the trace setting is properly @@ -72,11 +72,12 @@ def check_server_initial_state(self): "trace_level": ["TIMESTAMPS"], "trace_rate": "1", "trace_count": "-1", - "log_frequency": "0" + "log_frequency": "0", } triton_client = httpclient.InferenceServerClient("localhost:8000") - self.assertEqual(initial_settings, - triton_client.get_trace_settings(model_name="simple")) + self.assertEqual( + initial_settings, triton_client.get_trace_settings(model_name="simple") + ) self.assertEqual(initial_settings, triton_client.get_trace_settings()) def test_http_get_settings(self): @@ -87,46 +88,50 @@ def test_http_get_settings(self): "trace_level": ["TIMESTAMPS"], "trace_rate": "1", "trace_count": "-1", - "log_frequency": "0" + "log_frequency": "0", } triton_client = httpclient.InferenceServerClient("localhost:8000") - self.assertEqual(initial_settings, - triton_client.get_trace_settings(model_name="simple"), - "Unexpected initial model trace settings") - self.assertEqual(initial_settings, triton_client.get_trace_settings(), - "Unexpected initial global settings") + self.assertEqual( + initial_settings, + triton_client.get_trace_settings(model_name="simple"), + "Unexpected initial model trace settings", + ) + self.assertEqual( + initial_settings, + triton_client.get_trace_settings(), + "Unexpected initial global settings", + ) def test_grpc_get_settings(self): # Model trace settings will be the same as global trace settings since # no update has been made. 
initial_settings = grpcclient.service_pb2.TraceSettingResponse() json_format.Parse( - json.dumps({ - "settings": { - "trace_file": { - "value": ["global_unittest.log"] - }, - "trace_level": { - "value": ["TIMESTAMPS"] - }, - "trace_rate": { - "value": ["1"] - }, - "trace_count": { - "value": ["-1"] - }, - "log_frequency": { - "value": ["0"] - }, + json.dumps( + { + "settings": { + "trace_file": {"value": ["global_unittest.log"]}, + "trace_level": {"value": ["TIMESTAMPS"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["0"]}, + } } - }), initial_settings) + ), + initial_settings, + ) triton_client = grpcclient.InferenceServerClient("localhost:8001") - self.assertEqual(initial_settings, - triton_client.get_trace_settings(model_name="simple"), - "Unexpected initial model trace settings") - self.assertEqual(initial_settings, triton_client.get_trace_settings(), - "Unexpected initial global settings") + self.assertEqual( + initial_settings, + triton_client.get_trace_settings(model_name="simple"), + "Unexpected initial model trace settings", + ) + self.assertEqual( + initial_settings, + triton_client.get_trace_settings(), + "Unexpected initial global settings", + ) def test_http_update_settings(self): # Update model and global trace settings in order, @@ -139,47 +144,51 @@ def test_http_update_settings(self): "trace_level": ["TIMESTAMPS"], "trace_rate": "1", "trace_count": "-1", - "log_frequency": "0" + "log_frequency": "0", } expected_second_model_settings = { "trace_file": "model.log", "trace_level": ["TIMESTAMPS", "TENSORS"], "trace_rate": "1", "trace_count": "-1", - "log_frequency": "0" + "log_frequency": "0", } expected_global_settings = { "trace_file": "another.log", "trace_level": ["TIMESTAMPS", "TENSORS"], "trace_rate": "1", "trace_count": "-1", - "log_frequency": "0" + "log_frequency": "0", } model_update_settings = {"trace_file": "model.log"} global_update_settings = { "trace_file": "another.log", - "trace_level": ["TIMESTAMPS", "TENSORS"] + "trace_level": ["TIMESTAMPS", "TENSORS"], } triton_client = httpclient.InferenceServerClient("localhost:8000") self.assertEqual( expected_first_model_settings, - triton_client.update_trace_settings(model_name="simple", - settings=model_update_settings), - "Unexpected updated model trace settings") + triton_client.update_trace_settings( + model_name="simple", settings=model_update_settings + ), + "Unexpected updated model trace settings", + ) # Note that 'trace_level' may be mismatch due to the order of # the levels listed, currently we assume the order is the same # for simplicity. But the order shouldn't be enforced and this checking # needs to be improved when this kind of failure is reported self.assertEqual( expected_global_settings, - triton_client.update_trace_settings( - settings=global_update_settings), - "Unexpected updated global settings") - self.assertEqual(expected_second_model_settings, - triton_client.get_trace_settings(model_name="simple"), - "Unexpected model trace settings after global update") + triton_client.update_trace_settings(settings=global_update_settings), + "Unexpected updated global settings", + ) + self.assertEqual( + expected_second_model_settings, + triton_client.get_trace_settings(model_name="simple"), + "Unexpected model trace settings after global update", + ) def test_grpc_update_settings(self): # Update model and global trace settings in order, @@ -187,98 +196,82 @@ def test_grpc_update_settings(self): # the model setting fields that haven't been specified. 
self.check_server_initial_state() - expected_first_model_settings = grpcclient.service_pb2.TraceSettingResponse( - ) + expected_first_model_settings = grpcclient.service_pb2.TraceSettingResponse() json_format.Parse( - json.dumps({ - "settings": { - "trace_file": { - "value": ["model.log"] - }, - "trace_level": { - "value": ["TIMESTAMPS"] - }, - "trace_rate": { - "value": ["1"] - }, - "trace_count": { - "value": ["-1"] - }, - "log_frequency": { - "value": ["0"] - }, + json.dumps( + { + "settings": { + "trace_file": {"value": ["model.log"]}, + "trace_level": {"value": ["TIMESTAMPS"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["0"]}, + } } - }), expected_first_model_settings) - - expected_second_model_settings = grpcclient.service_pb2.TraceSettingResponse( + ), + expected_first_model_settings, ) + + expected_second_model_settings = grpcclient.service_pb2.TraceSettingResponse() json_format.Parse( - json.dumps({ - "settings": { - "trace_file": { - "value": ["model.log"] - }, - "trace_level": { - "value": ["TIMESTAMPS", "TENSORS"] - }, - "trace_rate": { - "value": ["1"] - }, - "trace_count": { - "value": ["-1"] - }, - "log_frequency": { - "value": ["0"] - }, + json.dumps( + { + "settings": { + "trace_file": {"value": ["model.log"]}, + "trace_level": {"value": ["TIMESTAMPS", "TENSORS"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["0"]}, + } } - }), expected_second_model_settings) + ), + expected_second_model_settings, + ) expected_global_settings = grpcclient.service_pb2.TraceSettingResponse() json_format.Parse( - json.dumps({ - "settings": { - "trace_file": { - "value": ["another.log"] - }, - "trace_level": { - "value": ["TIMESTAMPS", "TENSORS"] - }, - "trace_rate": { - "value": ["1"] - }, - "trace_count": { - "value": ["-1"] - }, - "log_frequency": { - "value": ["0"] - }, + json.dumps( + { + "settings": { + "trace_file": {"value": ["another.log"]}, + "trace_level": {"value": ["TIMESTAMPS", "TENSORS"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["0"]}, + } } - }), expected_global_settings) + ), + expected_global_settings, + ) model_update_settings = {"trace_file": "model.log"} global_update_settings = { "trace_file": "another.log", - "trace_level": ["TIMESTAMPS", "TENSORS"] + "trace_level": ["TIMESTAMPS", "TENSORS"], } triton_client = grpcclient.InferenceServerClient("localhost:8001") self.assertEqual( expected_first_model_settings, - triton_client.update_trace_settings(model_name="simple", - settings=model_update_settings), - "Unexpected updated model trace settings") + triton_client.update_trace_settings( + model_name="simple", settings=model_update_settings + ), + "Unexpected updated model trace settings", + ) # Note that 'trace_level' may be mismatch due to the order of # the levels listed, currently we assume the order is the same # for simplicity. 
But the order shouldn't be enforced and this checking # needs to be improved when this kind of failure is reported self.assertEqual( expected_global_settings, - triton_client.update_trace_settings( - settings=global_update_settings), - "Unexpected updated global settings") - self.assertEqual(expected_second_model_settings, - triton_client.get_trace_settings(model_name="simple"), - "Unexpected model trace settings after global update") + triton_client.update_trace_settings(settings=global_update_settings), + "Unexpected updated global settings", + ) + self.assertEqual( + expected_second_model_settings, + triton_client.get_trace_settings(model_name="simple"), + "Unexpected model trace settings after global update", + ) def test_http_clear_settings(self): # Clear global and model trace settings in order, @@ -290,37 +283,33 @@ def test_http_clear_settings(self): # model 'simple' has 'trace_rate' and 'log_frequency' specified # global has 'trace_level', 'trace_count' and 'trace_rate' specified triton_client = httpclient.InferenceServerClient("localhost:8000") - triton_client.update_trace_settings(model_name="simple", - settings={ - "trace_rate": "12", - "log_frequency": "34" - }) - triton_client.update_trace_settings(settings={ - "trace_rate": "56", - "trace_count": "78", - "trace_level": ["OFF"] - }) + triton_client.update_trace_settings( + model_name="simple", settings={"trace_rate": "12", "log_frequency": "34"} + ) + triton_client.update_trace_settings( + settings={"trace_rate": "56", "trace_count": "78", "trace_level": ["OFF"]} + ) expected_global_settings = { "trace_file": "global_unittest.log", "trace_level": ["OFF"], "trace_rate": "1", "trace_count": "-1", - "log_frequency": "0" + "log_frequency": "0", } expected_first_model_settings = { "trace_file": "global_unittest.log", "trace_level": ["OFF"], "trace_rate": "12", "trace_count": "-1", - "log_frequency": "34" + "log_frequency": "34", } expected_second_model_settings = { "trace_file": "global_unittest.log", "trace_level": ["OFF"], "trace_rate": "1", "trace_count": "-1", - "log_frequency": "34" + "log_frequency": "34", } global_clear_settings = {"trace_rate": None, "trace_count": None} model_clear_settings = {"trace_rate": None, "trace_level": None} @@ -329,18 +318,25 @@ def test_http_clear_settings(self): self.assertEqual( expected_global_settings, triton_client.update_trace_settings(settings=global_clear_settings), - "Unexpected cleared global trace settings") - self.assertEqual(expected_first_model_settings, - triton_client.get_trace_settings(model_name="simple"), - "Unexpected model trace settings after global clear") + "Unexpected cleared global trace settings", + ) + self.assertEqual( + expected_first_model_settings, + triton_client.get_trace_settings(model_name="simple"), + "Unexpected model trace settings after global clear", + ) self.assertEqual( expected_second_model_settings, - triton_client.update_trace_settings(model_name="simple", - settings=model_clear_settings), - "Unexpected model trace settings after model clear") - self.assertEqual(expected_global_settings, - triton_client.get_trace_settings(), - "Unexpected global trace settings after model clear") + triton_client.update_trace_settings( + model_name="simple", settings=model_clear_settings + ), + "Unexpected model trace settings after model clear", + ) + self.assertEqual( + expected_global_settings, + triton_client.get_trace_settings(), + "Unexpected global trace settings after model clear", + ) def test_grpc_clear_settings(self): # Clear global and model trace 
settings in order, @@ -352,82 +348,58 @@ def test_grpc_clear_settings(self): # model 'simple' has 'trace_rate' and 'log_frequency' specified # global has 'trace_level', 'trace_count' and 'trace_rate' specified triton_client = grpcclient.InferenceServerClient("localhost:8001") - triton_client.update_trace_settings(model_name="simple", - settings={ - "trace_rate": "12", - "log_frequency": "34" - }) - triton_client.update_trace_settings(settings={ - "trace_rate": "56", - "trace_count": "78", - "trace_level": ["OFF"] - }) + triton_client.update_trace_settings( + model_name="simple", settings={"trace_rate": "12", "log_frequency": "34"} + ) + triton_client.update_trace_settings( + settings={"trace_rate": "56", "trace_count": "78", "trace_level": ["OFF"]} + ) expected_global_settings = grpcclient.service_pb2.TraceSettingResponse() json_format.Parse( - json.dumps({ - "settings": { - "trace_file": { - "value": ["global_unittest.log"] - }, - "trace_level": { - "value": ["OFF"] - }, - "trace_rate": { - "value": ["1"] - }, - "trace_count": { - "value": ["-1"] - }, - "log_frequency": { - "value": ["0"] - }, + json.dumps( + { + "settings": { + "trace_file": {"value": ["global_unittest.log"]}, + "trace_level": {"value": ["OFF"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["0"]}, + } } - }), expected_global_settings) - expected_first_model_settings = grpcclient.service_pb2.TraceSettingResponse( + ), + expected_global_settings, ) + expected_first_model_settings = grpcclient.service_pb2.TraceSettingResponse() json_format.Parse( - json.dumps({ - "settings": { - "trace_file": { - "value": ["global_unittest.log"] - }, - "trace_level": { - "value": ["OFF"] - }, - "trace_rate": { - "value": ["12"] - }, - "trace_count": { - "value": ["-1"] - }, - "log_frequency": { - "value": ["34"] - }, + json.dumps( + { + "settings": { + "trace_file": {"value": ["global_unittest.log"]}, + "trace_level": {"value": ["OFF"]}, + "trace_rate": {"value": ["12"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["34"]}, + } } - }), expected_first_model_settings) - expected_second_model_settings = grpcclient.service_pb2.TraceSettingResponse( + ), + expected_first_model_settings, ) + expected_second_model_settings = grpcclient.service_pb2.TraceSettingResponse() json_format.Parse( - json.dumps({ - "settings": { - "trace_file": { - "value": ["global_unittest.log"] - }, - "trace_level": { - "value": ["OFF"] - }, - "trace_rate": { - "value": ["1"] - }, - "trace_count": { - "value": ["-1"] - }, - "log_frequency": { - "value": ["34"] - }, + json.dumps( + { + "settings": { + "trace_file": {"value": ["global_unittest.log"]}, + "trace_level": {"value": ["OFF"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["34"]}, + } } - }), expected_second_model_settings) + ), + expected_second_model_settings, + ) global_clear_settings = {"trace_rate": None, "trace_count": None} model_clear_settings = {"trace_rate": None, "trace_level": None} @@ -436,19 +408,26 @@ def test_grpc_clear_settings(self): self.assertEqual( expected_global_settings, triton_client.update_trace_settings(settings=global_clear_settings), - "Unexpected cleared global trace settings") - self.assertEqual(expected_first_model_settings, - triton_client.get_trace_settings(model_name="simple"), - "Unexpected model trace settings after global clear") + "Unexpected cleared global trace settings", + ) + self.assertEqual( + expected_first_model_settings, + 
triton_client.get_trace_settings(model_name="simple"), + "Unexpected model trace settings after global clear", + ) self.assertEqual( expected_second_model_settings, - triton_client.update_trace_settings(model_name="simple", - settings=model_clear_settings), - "Unexpected model trace settings after model clear") - self.assertEqual(expected_global_settings, - triton_client.get_trace_settings(), - "Unexpected global trace settings after model clear") + triton_client.update_trace_settings( + model_name="simple", settings=model_clear_settings + ), + "Unexpected model trace settings after model clear", + ) + self.assertEqual( + expected_global_settings, + triton_client.get_trace_settings(), + "Unexpected global trace settings after model clear", + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_triton_repo_agent/test.sh b/qa/L0_triton_repo_agent/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_trt_data_dependent_shape/trt_data_dependent_shape_test.py b/qa/L0_trt_data_dependent_shape/trt_data_dependent_shape_test.py old mode 100644 new mode 100755 index 3f2eeeaa40..ee0b675d84 --- a/qa/L0_trt_data_dependent_shape/trt_data_dependent_shape_test.py +++ b/qa/L0_trt_data_dependent_shape/trt_data_dependent_shape_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,16 +31,17 @@ sys.path.append("../common") import unittest + import numpy as np import test_util as tu import tritonclient.http as client class TrtDataDependentShapeTest(tu.TestResultCollector): - def setUp(self): - self.triton_client = client.InferenceServerClient("localhost:8000", - verbose=True) + self.triton_client = client.InferenceServerClient( + "localhost:8000", verbose=True + ) def test_fixed(self): model_name = "plan_nobatch_nonzero_fixed" @@ -46,15 +49,16 @@ def test_fixed(self): expected_output_np = np.nonzero(input_np) inputs = [] - inputs.append(client.InferInput('INPUT', [4, 4], "INT32")) + inputs.append(client.InferInput("INPUT", [4, 4], "INT32")) inputs[-1].set_data_from_numpy(input_np) results = self.triton_client.infer(model_name=model_name, inputs=inputs) # Validate the results by comparing with precomputed values. - output_np = results.as_numpy('OUTPUT') + output_np = results.as_numpy("OUTPUT") self.assertTrue( np.array_equal(output_np, expected_output_np), - "OUTPUT expected: {}, got {}".format(expected_output_np, output_np)) + "OUTPUT expected: {}, got {}".format(expected_output_np, output_np), + ) def test_dynamic(self): model_name = "plan_nobatch_nonzero_dynamic" @@ -65,16 +69,17 @@ def test_dynamic(self): expected_output_np = np.nonzero(input_np) inputs = [] - inputs.append(client.InferInput('INPUT', [20, 16], "INT32")) + inputs.append(client.InferInput("INPUT", [20, 16], "INT32")) inputs[-1].set_data_from_numpy(input_np) results = self.triton_client.infer(model_name=model_name, inputs=inputs) # Validate the results by comparing with precomputed values. 
- output_np = results.as_numpy('OUTPUT') + output_np = results.as_numpy("OUTPUT") self.assertTrue( np.array_equal(output_np, expected_output_np), - "OUTPUT expected: {}, got {}".format(expected_output_np, output_np)) + "OUTPUT expected: {}, got {}".format(expected_output_np, output_np), + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_trt_dla/dla_test.py b/qa/L0_trt_dla/dla_test.py old mode 100644 new mode 100755 index c4fe48a22d..d71d277ac4 --- a/qa/L0_trt_dla/dla_test.py +++ b/qa/L0_trt_dla/dla_test.py @@ -30,22 +30,21 @@ sys.path.append("../common") import unittest + import numpy as np -from PIL import Image import test_util as tu - import tritonclient.http as httpclient +from PIL import Image class InferTest(tu.TestResultCollector): - def _preprocess(self, img, dtype): """ Pre-process an image to meet the size and type requirements specified by the parameters. """ - sample_img = img.convert('RGB') + sample_img = img.convert("RGB") resized_img = sample_img.resize((224, 224), Image.BILINEAR) resized = np.array(resized_img) @@ -57,8 +56,7 @@ def _preprocess(self, img, dtype): def test_resnet50(self): try: - triton_client = httpclient.InferenceServerClient( - url="localhost:8000") + triton_client = httpclient.InferenceServerClient(url="localhost:8000") except Exception as e: print("channel creation failed: " + str(e)) sys.exit(1) @@ -74,22 +72,21 @@ def test_resnet50(self): batched_image_data = image_data for i in range(1, batch_size): batched_image_data = np.concatenate( - (batched_image_data, image_data), axis=0) + (batched_image_data, image_data), axis=0 + ) inputs = [ - httpclient.InferInput('input_tensor_0', [batch_size, 3, 224, 224], - 'INT8') + httpclient.InferInput("input_tensor_0", [batch_size, 3, 224, 224], "INT8") ] inputs[0].set_data_from_numpy(batched_image_data, binary_data=True) outputs = [ - httpclient.InferRequestedOutput('topk_layer_output_index', - binary_data=True) + httpclient.InferRequestedOutput("topk_layer_output_index", binary_data=True) ] results = triton_client.infer(model_name, inputs, outputs=outputs) - output_data = results.as_numpy('topk_layer_output_index') + output_data = results.as_numpy("topk_layer_output_index") print(output_data) # Validate the results by comparing with precomputed values. @@ -99,5 +96,5 @@ def test_resnet50(self): self.assertEqual(output_data[i][0][0], EXPECTED_CLASS_INDEX) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_trt_dla/test.sh b/qa/L0_trt_dla/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_trt_dynamic_shape/test.sh b/qa/L0_trt_dynamic_shape/test.sh index 99ecc7f2b8..43a39dd199 100755 --- a/qa/L0_trt_dynamic_shape/test.sh +++ b/qa/L0_trt_dynamic_shape/test.sh @@ -305,7 +305,7 @@ kill $SERVER_PID wait $SERVER_PID -# Adding test cases for mulitple optimization profiles with static shapes. +# Adding test cases for multiple optimization profiles with static shapes. # Will load only the following profiles with the static shapes: # Profile 7: [1, 33] # Profile 8: [3, 33] diff --git a/qa/L0_trt_dynamic_shape/trt_dynamic_shape_test.py b/qa/L0_trt_dynamic_shape/trt_dynamic_shape_test.py old mode 100644 new mode 100755 index d01bc51ee1..d9f890d9b6 --- a/qa/L0_trt_dynamic_shape/trt_dynamic_shape_test.py +++ b/qa/L0_trt_dynamic_shape/trt_dynamic_shape_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -29,33 +31,48 @@ sys.path.append("../common") import unittest -import numpy as np + import infer_util as iu +import numpy as np import test_util as tu import tritonhttpclient from tritonclientutils import InferenceServerException class TrtDynamicShapeTest(tu.TestResultCollector): - def setUp(self): self.dtype_ = np.float32 - self.model_name_ = 'plan' + self.model_name_ = "plan" def test_load_specific_optimization_profile(self): # Only OP 5 should be available, which only allow batch size 8 tensor_shape = (1,) try: - iu.infer_exact(self, self.model_name_, (1,) + tensor_shape, 1, - self.dtype_, self.dtype_, self.dtype_) + iu.infer_exact( + self, + self.model_name_, + (1,) + tensor_shape, + 1, + self.dtype_, + self.dtype_, + self.dtype_, + ) except InferenceServerException as ex: self.assertTrue( "model expected the shape of dimension 0 to be between 6 and 8 but received 1" - in ex.message()) + in ex.message() + ) try: - iu.infer_exact(self, self.model_name_, (8,) + tensor_shape, 8, - self.dtype_, self.dtype_, self.dtype_) + iu.infer_exact( + self, + self.model_name_, + (8,) + tensor_shape, + 8, + self.dtype_, + self.dtype_, + self.dtype_, + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) @@ -64,37 +81,60 @@ def test_load_default_optimization_profile(self): tensor_shape = (33,) try: - iu.infer_exact(self, self.model_name_, (8,) + tensor_shape, 8, - self.dtype_, self.dtype_, self.dtype_) + iu.infer_exact( + self, + self.model_name_, + (8,) + tensor_shape, + 8, + self.dtype_, + self.dtype_, + self.dtype_, + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) over_tensor_shape = (34,) try: - iu.infer_exact(self, self.model_name_, (8,) + over_tensor_shape, 8, - self.dtype_, self.dtype_, self.dtype_) + iu.infer_exact( + self, + self.model_name_, + (8,) + over_tensor_shape, + 8, + self.dtype_, + self.dtype_, + self.dtype_, + ) except InferenceServerException as ex: self.assertTrue( "model expected the shape of dimension 1 to be between 1 and 33 but received 34" - in ex.message()) + in ex.message() + ) def test_select_optimization_profile(self): # Different profile has different optimized input shape batch_size = 4 tensor_shape = (16,) try: - iu.infer_exact(self, self.model_name_, (batch_size,) + tensor_shape, - batch_size, self.dtype_, self.dtype_, self.dtype_) + iu.infer_exact( + self, + self.model_name_, + (batch_size,) + tensor_shape, + batch_size, + self.dtype_, + self.dtype_, + self.dtype_, + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_load_wrong_optimization_profile(self): client = tritonhttpclient.InferenceServerClient("localhost:8000") - model_name = tu.get_model_name(self.model_name_, self.dtype_, - self.dtype_, self.dtype_) + model_name = tu.get_model_name( + self.model_name_, self.dtype_, self.dtype_, self.dtype_ + ) model_status = client.is_model_ready(model_name, "1") self.assertFalse(model_status, "expected model to be not ready") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_trt_error_propagation/trt_error_propagation_test.py b/qa/L0_trt_error_propagation/trt_error_propagation_test.py old mode 100644 new mode 100755 index 69c7ecaa28..83527a7533 --- a/qa/L0_trt_error_propagation/trt_error_propagation_test.py +++ b/qa/L0_trt_error_propagation/trt_error_propagation_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + 
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -25,16 +27,15 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import unittest + import tritonclient.grpc as grpcclient from tritonclient.utils import InferenceServerException class TestTrtErrorPropagation(unittest.TestCase): - def setUp(self): # Initialize client - self.__triton = grpcclient.InferenceServerClient("localhost:8001", - verbose=True) + self.__triton = grpcclient.InferenceServerClient("localhost:8001", verbose=True) def test_invalid_trt_model(self): with self.assertRaises(InferenceServerException) as cm: @@ -42,13 +43,18 @@ def test_invalid_trt_model(self): err_msg = str(cm.exception) # All 'expected_msg_parts' should be present in the 'err_msg' in order expected_msg_parts = [ - "load failed for model", "version 1 is at UNAVAILABLE state: ", - "Internal: unable to create TensorRT engine: ", "Error Code ", - "Internal Error " + "load failed for model", + "version 1 is at UNAVAILABLE state: ", + "Internal: unable to create TensorRT engine: ", + "Error Code ", + "Internal Error ", ] for expected_msg_part in expected_msg_parts: - self.assertIn(expected_msg_part, err_msg, - "Cannot find an expected part of error message") + self.assertIn( + expected_msg_part, + err_msg, + "Cannot find an expected part of error message", + ) _, err_msg = err_msg.split(expected_msg_part) def test_invalid_trt_model_autocomplete(self): @@ -57,8 +63,10 @@ def test_invalid_trt_model_autocomplete(self): err_msg = str(cm.exception) self.assertIn( "Internal: unable to load plan file to auto complete config", - err_msg, "Caught an unexpected exception") + err_msg, + "Caught an unexpected exception", + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_trt_plugin/test.sh b/qa/L0_trt_plugin/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_trt_plugin/trt_plugin_test.py b/qa/L0_trt_plugin/trt_plugin_test.py old mode 100644 new mode 100755 index 8862348f7d..36f87335b6 --- a/qa/L0_trt_plugin/trt_plugin_test.py +++ b/qa/L0_trt_plugin/trt_plugin_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -28,55 +30,52 @@ sys.path.append("../common") +import os import unittest + import numpy as np -import os import test_util as tu - import tritonclient.http as httpclient # By default, find tritonserver on "localhost", but can be overridden # with TRITONSERVER_IPADDR envvar -_tritonserver_ipaddr = os.environ.get('TRITONSERVER_IPADDR', 'localhost') +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") class PluginModelTest(tu.TestResultCollector): - def _full_exact(self, model_name, plugin_name, shape): print(f"{_tritonserver_ipaddr}:8000") - triton_client = httpclient.InferenceServerClient( - f"{_tritonserver_ipaddr}:8000") + triton_client = httpclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8000") inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', list(shape), "FP32")) + inputs.append(httpclient.InferInput("INPUT0", list(shape), "FP32")) input0_data = np.ones(shape=shape).astype(np.float32) inputs[0].set_data_from_numpy(input0_data, binary_data=True) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=True)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) - results = triton_client.infer(model_name + '_' + plugin_name, - inputs, - outputs=outputs) + results = triton_client.infer( + model_name + "_" + plugin_name, inputs, outputs=outputs + ) - output0_data = results.as_numpy('OUTPUT0') + output0_data = results.as_numpy("OUTPUT0") # Verify values of Clip, GELU, and Normalize - if plugin_name == 'CustomClipPlugin': + if plugin_name == "CustomClipPlugin": # Clip data to minimum of .1, maximum of .5 test_output = np.clip(input0_data, 0.1, 0.5) self.assertTrue(np.isclose(output0_data, test_output).all()) - elif plugin_name == 'CustomGeluPluginDynamic': + elif plugin_name == "CustomGeluPluginDynamic": # Add bias input0_data += 1 # Calculate Gelu activation - test_output = (input0_data * - 0.5) * (1 + np.tanh((0.797885 * input0_data) + - (0.035677 * (input0_data**3)))) + test_output = (input0_data * 0.5) * ( + 1 + np.tanh((0.797885 * input0_data) + (0.035677 * (input0_data**3))) + ) self.assertTrue(np.isclose(output0_data, test_output).all()) - elif plugin_name == 'Normalize_TRT': + elif plugin_name == "Normalize_TRT": # L2 norm is sqrt(sum([1]*16))) test_output = input0_data / np.sqrt(sum([1] * 16)) self.assertTrue(np.isclose(output0_data, test_output).all()) @@ -85,19 +84,24 @@ def _full_exact(self, model_name, plugin_name, shape): def test_raw_fff_clip(self): for bs in (1, 8): - self._full_exact('plan_float32_float32_float32', 'CustomClipPlugin', - (bs, 16)) + self._full_exact( + "plan_float32_float32_float32", "CustomClipPlugin", (bs, 16) + ) def test_raw_fff_gelu(self): - self._full_exact('plan_nobatch_float32_float32_float32', - 'CustomGeluPluginDynamic', (16, 1, 1)) + self._full_exact( + "plan_nobatch_float32_float32_float32", + "CustomGeluPluginDynamic", + (16, 1, 1), + ) def test_raw_fff_norm(self): # model that supports batching for bs in (1, 8): - self._full_exact('plan_float32_float32_float32', 'Normalize_TRT', - (bs, 16, 16, 16)) + self._full_exact( + "plan_float32_float32_float32", "Normalize_TRT", (bs, 16, 16, 16) + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_trt_reformat_free/trt_reformat_free_test.py b/qa/L0_trt_reformat_free/trt_reformat_free_test.py old mode 100644 new mode 100755 index 4192b878d8..ea36f9c24a --- 
a/qa/L0_trt_reformat_free/trt_reformat_free_test.py +++ b/qa/L0_trt_reformat_free/trt_reformat_free_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,12 +30,13 @@ sys.path.append("../common") -from builtins import range import unittest +from builtins import range + import numpy as np import test_util as tu -import tritonhttpclient import tritonclient.utils.shared_memory as shm +import tritonhttpclient def div_up(a, b): @@ -47,36 +50,36 @@ def reformat(format, tensor_np): factor = 32 else: raise ValueError( - "Unexpected format {} for testing reformat-free input".format( - format)) + "Unexpected format {} for testing reformat-free input".format(format) + ) shape = list(tensor_np.shape) + [factor] shape[-4] = div_up(shape[-4], factor) reformatted_tensor_np = np.empty(shape, tensor_np.dtype) if len(tensor_np.shape) == 3: batch = [(tensor_np, reformatted_tensor_np)] elif len(tensor_np.shape) == 4: - batch = [(tensor_np[idx], reformatted_tensor_np[idx]) - for idx in range(tensor_np.shape[0])] + batch = [ + (tensor_np[idx], reformatted_tensor_np[idx]) + for idx in range(tensor_np.shape[0]) + ] else: raise ValueError( "Unexpected numpy shape {} for testing reformat-free input".format( - tensor_np.shape)) - for (tensor, reformatted_tensor) in batch: + tensor_np.shape + ) + ) + for tensor, reformatted_tensor in batch: for c in range(tensor.shape[0]): for h in range(tensor.shape[1]): for w in range(tensor.shape[2]): - reformatted_tensor[c // - factor][h][w][c % - factor] = tensor[c][h][w] + reformatted_tensor[c // factor][h][w][c % factor] = tensor[c][h][w] return reformatted_tensor_np class TrtReformatFreeTest(tu.TestResultCollector): - def add_reformat_free_data_as_shared_memory(self, name, tensor, tensor_np): byte_size = tensor_np.size * tensor_np.dtype.itemsize - self.shm_handles.append( - shm.create_shared_memory_region(name, name, byte_size)) + self.shm_handles.append(shm.create_shared_memory_region(name, name, byte_size)) # Put data values into shared memory shm.set_shared_memory_region(self.shm_handles[-1], [tensor_np]) # Register shared memory with Triton Server @@ -87,7 +90,8 @@ def add_reformat_free_data_as_shared_memory(self, name, tensor, tensor_np): def setUp(self): self.shm_handles = [] self.triton_client = tritonhttpclient.InferenceServerClient( - "localhost:8000", verbose=True) + "localhost:8000", verbose=True + ) def tearDown(self): self.triton_client.unregister_system_shared_memory() @@ -105,39 +109,42 @@ def test_nobatch_chw2_input(self): # for non-linear format tensor, the data buffer is padded and thus the # data byte size may not match what is calculated from tensor shape inputs = [] - inputs.append(tritonhttpclient.InferInput('INPUT0', [13, 2, 1], "FP16")) - self.add_reformat_free_data_as_shared_memory("input0", inputs[-1], - reformatted_input_np) - inputs.append(tritonhttpclient.InferInput('INPUT1', [13, 2, 1], "FP16")) - self.add_reformat_free_data_as_shared_memory("input1", inputs[-1], - reformatted_input_np) + inputs.append(tritonhttpclient.InferInput("INPUT0", [13, 2, 1], "FP16")) + self.add_reformat_free_data_as_shared_memory( + "input0", inputs[-1], reformatted_input_np + ) + inputs.append(tritonhttpclient.InferInput("INPUT1", [13, 2, 1], "FP16")) + self.add_reformat_free_data_as_shared_memory( + "input1", inputs[-1], reformatted_input_np + ) outputs = [] outputs.append( - 
tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) - results = self.triton_client.infer(model_name=model_name, - inputs=inputs, - outputs=outputs) + results = self.triton_client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) # Validate the results by comparing with precomputed values. - output0_np = results.as_numpy('OUTPUT0') - output1_np = results.as_numpy('OUTPUT1') + output0_np = results.as_numpy("OUTPUT0") + output1_np = results.as_numpy("OUTPUT1") self.assertTrue( np.array_equal(output0_np, expected_output0_np), - "OUTPUT0 expected: {}, got {}".format(expected_output0_np, - output0_np)) + "OUTPUT0 expected: {}, got {}".format(expected_output0_np, output0_np), + ) self.assertTrue( np.array_equal(output1_np, expected_output1_np), - "OUTPUT0 expected: {}, got {}".format(expected_output1_np, - output1_np)) + "OUTPUT0 expected: {}, got {}".format(expected_output1_np, output1_np), + ) def test_chw2_input(self): model_name = "plan_CHW2_LINEAR_float16_float16_float16" for bs in [1, 8]: - input_np = np.arange(26 * bs, dtype=np.float16).reshape( - (bs, 13, 2, 1)) + input_np = np.arange(26 * bs, dtype=np.float16).reshape((bs, 13, 2, 1)) expected_output0_np = input_np + input_np expected_output1_np = input_np - input_np reformatted_input_np = reformat("CHW2", input_np) @@ -147,37 +154,37 @@ def test_chw2_input(self): # and thus the data byte size may not match what is calculated from # tensor shape inputs = [] - inputs.append( - tritonhttpclient.InferInput('INPUT0', [bs, 13, 2, 1], "FP16")) + inputs.append(tritonhttpclient.InferInput("INPUT0", [bs, 13, 2, 1], "FP16")) self.add_reformat_free_data_as_shared_memory( - "input0" + str(bs), inputs[-1], reformatted_input_np) - inputs.append( - tritonhttpclient.InferInput('INPUT1', [bs, 13, 2, 1], "FP16")) + "input0" + str(bs), inputs[-1], reformatted_input_np + ) + inputs.append(tritonhttpclient.InferInput("INPUT1", [bs, 13, 2, 1], "FP16")) self.add_reformat_free_data_as_shared_memory( - "input1" + str(bs), inputs[-1], reformatted_input_np) + "input1" + str(bs), inputs[-1], reformatted_input_np + ) outputs = [] outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT0', - binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT1', - binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) - results = self.triton_client.infer(model_name=model_name, - inputs=inputs, - outputs=outputs) + results = self.triton_client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) # Validate the results by comparing with precomputed values. 
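# The expected values were precomputed above as input_np + input_np and
# input_np - input_np, so an exact np.array_equal comparison is sufficient here.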
- output0_np = results.as_numpy('OUTPUT0') - output1_np = results.as_numpy('OUTPUT1') + output0_np = results.as_numpy("OUTPUT0") + output1_np = results.as_numpy("OUTPUT1") self.assertTrue( np.array_equal(output0_np, expected_output0_np), - "OUTPUT0 expected: {}, got {}".format(expected_output0_np, - output0_np)) + "OUTPUT0 expected: {}, got {}".format(expected_output0_np, output0_np), + ) self.assertTrue( np.array_equal(output1_np, expected_output1_np), - "OUTPUT0 expected: {}, got {}".format(expected_output1_np, - output1_np)) + "OUTPUT0 expected: {}, got {}".format(expected_output1_np, output1_np), + ) def test_nobatch_chw32_input(self): model_name = "plan_nobatch_CHW32_LINEAR_float32_float32_float32" @@ -190,39 +197,42 @@ def test_nobatch_chw32_input(self): # for non-linear format tensor, the data buffer is padded and thus the # data byte size may not match what is calculated from tensor shape inputs = [] - inputs.append(tritonhttpclient.InferInput('INPUT0', [13, 2, 1], "FP32")) - self.add_reformat_free_data_as_shared_memory("input0", inputs[-1], - reformatted_input_np) - inputs.append(tritonhttpclient.InferInput('INPUT1', [13, 2, 1], "FP32")) - self.add_reformat_free_data_as_shared_memory("input1", inputs[-1], - reformatted_input_np) + inputs.append(tritonhttpclient.InferInput("INPUT0", [13, 2, 1], "FP32")) + self.add_reformat_free_data_as_shared_memory( + "input0", inputs[-1], reformatted_input_np + ) + inputs.append(tritonhttpclient.InferInput("INPUT1", [13, 2, 1], "FP32")) + self.add_reformat_free_data_as_shared_memory( + "input1", inputs[-1], reformatted_input_np + ) outputs = [] outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) - results = self.triton_client.infer(model_name=model_name, - inputs=inputs, - outputs=outputs) + results = self.triton_client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) # Validate the results by comparing with precomputed values. 
- output0_np = results.as_numpy('OUTPUT0') - output1_np = results.as_numpy('OUTPUT1') + output0_np = results.as_numpy("OUTPUT0") + output1_np = results.as_numpy("OUTPUT1") self.assertTrue( np.array_equal(output0_np, expected_output0_np), - "OUTPUT0 expected: {}, got {}".format(expected_output0_np, - output0_np)) + "OUTPUT0 expected: {}, got {}".format(expected_output0_np, output0_np), + ) self.assertTrue( np.array_equal(output1_np, expected_output1_np), - "OUTPUT0 expected: {}, got {}".format(expected_output1_np, - output1_np)) + "OUTPUT0 expected: {}, got {}".format(expected_output1_np, output1_np), + ) def test_chw32_input(self): model_name = "plan_CHW32_LINEAR_float32_float32_float32" for bs in [1, 8]: - input_np = np.arange(26 * bs, dtype=np.float32).reshape( - (bs, 13, 2, 1)) + input_np = np.arange(26 * bs, dtype=np.float32).reshape((bs, 13, 2, 1)) expected_output0_np = input_np + input_np expected_output1_np = input_np - input_np reformatted_input_np = reformat("CHW32", input_np) @@ -232,38 +242,38 @@ def test_chw32_input(self): # and thus the data byte size may not match what is calculated from # tensor shape inputs = [] - inputs.append( - tritonhttpclient.InferInput('INPUT0', [bs, 13, 2, 1], "FP32")) + inputs.append(tritonhttpclient.InferInput("INPUT0", [bs, 13, 2, 1], "FP32")) self.add_reformat_free_data_as_shared_memory( - "input0" + str(bs), inputs[-1], reformatted_input_np) - inputs.append( - tritonhttpclient.InferInput('INPUT1', [bs, 13, 2, 1], "FP32")) + "input0" + str(bs), inputs[-1], reformatted_input_np + ) + inputs.append(tritonhttpclient.InferInput("INPUT1", [bs, 13, 2, 1], "FP32")) self.add_reformat_free_data_as_shared_memory( - "input1" + str(bs), inputs[-1], reformatted_input_np) + "input1" + str(bs), inputs[-1], reformatted_input_np + ) outputs = [] outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT0', - binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT1', - binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) - results = self.triton_client.infer(model_name=model_name, - inputs=inputs, - outputs=outputs) + results = self.triton_client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) # Validate the results by comparing with precomputed values. - output0_np = results.as_numpy('OUTPUT0') - output1_np = results.as_numpy('OUTPUT1') + output0_np = results.as_numpy("OUTPUT0") + output1_np = results.as_numpy("OUTPUT1") self.assertTrue( np.array_equal(output0_np, expected_output0_np), - "OUTPUT0 expected: {}, got {}".format(expected_output0_np, - output0_np)) + "OUTPUT0 expected: {}, got {}".format(expected_output0_np, output0_np), + ) self.assertTrue( np.array_equal(output1_np, expected_output1_np), - "OUTPUT0 expected: {}, got {}".format(expected_output1_np, - output1_np)) + "OUTPUT0 expected: {}, got {}".format(expected_output1_np, output1_np), + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_trt_shape_tensors/test.sh b/qa/L0_trt_shape_tensors/test.sh old mode 100644 new mode 100755 index e0f0faa229..eed67d9dcb --- a/qa/L0_trt_shape_tensors/test.sh +++ b/qa/L0_trt_shape_tensors/test.sh @@ -49,7 +49,7 @@ SERVER_ARGS="--model-repository=`pwd`/models" SERVER_LOG="./inference_server.log" source ../common/util.sh -rm -fr *.log +rm -fr *.log rm -fr models && mkdir models cp -r /data/inferenceserver/${REPO_VERSION}/qa_shapetensor_model_repository/* models/. 
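The reformat() helper exercised by the reformat-free tests above packs a linear CHW tensor into TensorRT's vectorized CHW2/CHW32 layouts: channels are grouped into vectors of 2 or 32 and the channel dimension is padded up to a multiple of the vector size, which is why the shared-memory buffer can be larger than the logical tensor shape implies. Below is a minimal NumPy sketch of the same packing for the unbatched case only; the helper name pack_chw_vectorized is illustrative and not taken from the patched test.

import numpy as np

def pack_chw_vectorized(tensor_chw, factor):
    # Pack (C, H, W) into (ceil(C / factor), H, W, factor), zero-padding the
    # channel dimension, as the CHW2 (factor=2) and CHW32 (factor=32) formats do.
    c, h, w = tensor_chw.shape
    packed = np.zeros(((c + factor - 1) // factor, h, w, factor), tensor_chw.dtype)
    for ch in range(c):
        packed[ch // factor, :, :, ch % factor] = tensor_chw[ch]
    return packed

# 13 channels packed with factor 2 become 7 channel-vectors, the last one half padded.
x = np.arange(26, dtype=np.float16).reshape(13, 2, 1)
assert pack_chw_vectorized(x, 2).shape == (7, 2, 1, 2)

The reformat() function in the test additionally handles a leading batch dimension by applying the same per-channel packing to each batch element.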
diff --git a/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py b/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py old mode 100644 new mode 100755 index 14609dbb94..a83795f981 --- a/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py +++ b/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,20 +30,19 @@ sys.path.append("../common") -from builtins import range import os -import unittest -import time import threading -import numpy as np +import time +import unittest +from builtins import range + import infer_util as iu -import test_util as tu +import numpy as np import sequence_util as su - +import test_util as tu import tritongrpcclient as grpcclient -TEST_SYSTEM_SHARED_MEMORY = bool( - int(os.environ.get('TEST_SYSTEM_SHARED_MEMORY', 0))) +TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0))) _model_instances = 1 _max_queue_delay_ms = 10000 @@ -52,7 +53,6 @@ class InferShapeTensorTest(tu.TestResultCollector): - def setUp(self): # The helper client for setup will be GRPC for simplicity. self.triton_client_ = grpcclient.InferenceServerClient("localhost:8001") @@ -75,14 +75,16 @@ def check_deferred_exception(self): if len(_deferred_exceptions) > 0: raise _deferred_exceptions[0] - def check_response(self, - bs, - thresholds, - shape_values, - dummy_input_shapes, - shm_region_names=None, - precreated_shm_regions=None, - shm_suffix=""): + def check_response( + self, + bs, + thresholds, + shape_values, + dummy_input_shapes, + shm_region_names=None, + precreated_shm_regions=None, + shm_suffix="", + ): try: # Add batch size to shape as full shape is expected for i in range(len(dummy_input_shapes)): @@ -93,7 +95,7 @@ def check_response(self, iu.infer_shape_tensor( self, - 'plan', + "plan", np.float32, shape_values, dummy_input_shapes, @@ -101,7 +103,8 @@ def check_response(self, use_streaming=False, shm_suffix=shm_suffix, use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, - batch_size=bs) + batch_size=bs, + ) end_ms = int(round(time.time() * 1000)) @@ -110,13 +113,21 @@ def check_response(self, if lt_ms is not None: self.assertTrue( (end_ms - start_ms) < lt_ms, - "expected less than " + str(lt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected less than " + + str(lt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) if gt_ms is not None: self.assertTrue( (end_ms - start_ms) > gt_ms, - "expected greater than " + str(gt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected greater than " + + str(gt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) except Exception as ex: self.add_deferred_exception(ex) @@ -126,8 +137,9 @@ def check_setup(self, model_name): bconfig = config.dynamic_batching self.assertTrue(2 in bconfig.preferred_batch_size) self.assertTrue(6 in bconfig.preferred_batch_size) - self.assertEqual(bconfig.max_queue_delay_microseconds, - _max_queue_delay_ms * 1000) # 10 secs + self.assertEqual( + bconfig.max_queue_delay_microseconds, _max_queue_delay_ms * 1000 + ) # 10 secs def check_status(self, model_name, batch_exec, exec_cnt, infer_cnt): # There is a time window between when responses are returned and statistics are updated. @@ -135,113 +147,154 @@ def check_status(self, model_name, batch_exec, exec_cnt, infer_cnt): # inference statistics to be ready. 
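# Poll get_inference_statistics() up to num_tries times, sleeping one second
# between attempts, before asserting on the batch and execution counts below.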
num_tries = 10 for i in range(num_tries): - stats = self.triton_client_.get_inference_statistics( - model_name, "1") + stats = self.triton_client_.get_inference_statistics(model_name, "1") self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") actual_exec_cnt = stats.model_stats[0].execution_count if actual_exec_cnt == exec_cnt: break - print("WARNING: expect {} executions, got {} (attempt {})".format( - exec_cnt, actual_exec_cnt, i)) + print( + "WARNING: expect {} executions, got {} (attempt {})".format( + exec_cnt, actual_exec_cnt, i + ) + ) time.sleep(1) - self.assertEqual(stats.model_stats[0].name, model_name, - "expect model stats for model {}".format(model_name)) self.assertEqual( - stats.model_stats[0].version, "1", - "expect model stats for model {} version 1".format(model_name)) + stats.model_stats[0].name, + model_name, + "expect model stats for model {}".format(model_name), + ) + self.assertEqual( + stats.model_stats[0].version, + "1", + "expect model stats for model {} version 1".format(model_name), + ) if batch_exec is not None: batch_stats = stats.model_stats[0].batch_stats print(batch_stats) self.assertEqual( - len(batch_stats), len(batch_exec), + len(batch_stats), + len(batch_exec), "expected {} different batch-sizes, got {}".format( - len(batch_exec), len(batch_stats))) + len(batch_exec), len(batch_stats) + ), + ) for batch_stat in batch_stats: bs = batch_stat.batch_size bc = batch_stat.compute_infer.count self.assertTrue( - bs in batch_exec, - "did not find expected batch-size {}".format(bs)) + bs in batch_exec, "did not find expected batch-size {}".format(bs) + ) # Get count from one of the stats self.assertEqual( - bc, batch_exec[bs], - "expected model-execution-count {} for batch size {}, got {}" - .format(batch_exec[bs], bs, bc)) + bc, + batch_exec[bs], + "expected model-execution-count {} for batch size {}, got {}".format( + batch_exec[bs], bs, bc + ), + ) actual_exec_cnt = stats.model_stats[0].execution_count self.assertEqual( - actual_exec_cnt, exec_cnt, - "expected model-exec-count {}, got {}".format( - exec_cnt, actual_exec_cnt)) + actual_exec_cnt, + exec_cnt, + "expected model-exec-count {}, got {}".format(exec_cnt, actual_exec_cnt), + ) actual_infer_cnt = stats.model_stats[0].inference_count self.assertEqual( - actual_infer_cnt, infer_cnt, + actual_infer_cnt, + infer_cnt, "expected model-inference-count {}, got {}".format( - infer_cnt, actual_infer_cnt)) + infer_cnt, actual_infer_cnt + ), + ) actual_infer_cnt = stats.model_stats[0].inference_count self.assertEqual( - actual_infer_cnt, infer_cnt, + actual_infer_cnt, + infer_cnt, "expected model-inference-count {}, got {}".format( - infer_cnt, actual_infer_cnt)) + infer_cnt, actual_infer_cnt + ), + ) def test_static_batch(self): iu.infer_shape_tensor( self, - 'plan', - np.float32, [[32, 32]], [[8, 4, 4]], + "plan", + np.float32, + [[32, 32]], + [[8, 4, 4]], use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, - batch_size=8) + batch_size=8, + ) iu.infer_shape_tensor( self, - 'plan', - np.float32, [[4, 4]], [[8, 32, 32]], + "plan", + np.float32, + [[4, 4]], + [[8, 32, 32]], use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, - batch_size=8) + batch_size=8, + ) iu.infer_shape_tensor( self, - 'plan', - np.float32, [[4, 4]], [[8, 4, 4]], + "plan", + np.float32, + [[4, 4]], + [[8, 4, 4]], use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, - batch_size=8) + batch_size=8, + ) def test_nobatch(self): iu.infer_shape_tensor( self, - 'plan_nobatch', - np.float32, [[32, 32]], [[4, 4]], - 
use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY) + "plan_nobatch", + np.float32, + [[32, 32]], + [[4, 4]], + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + ) iu.infer_shape_tensor( self, - 'plan_nobatch', - np.float32, [[4, 4]], [[32, 32]], - use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY) + "plan_nobatch", + np.float32, + [[4, 4]], + [[32, 32]], + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + ) iu.infer_shape_tensor( self, - 'plan_nobatch', - np.float32, [[4, 4]], [[4, 4]], - use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY) + "plan_nobatch", + np.float32, + [[4, 4]], + [[4, 4]], + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + ) def test_wrong_shape_values(self): over_shape_values = [[32, 33]] try: iu.infer_shape_tensor( self, - 'plan', + "plan", np.float32, - over_shape_values, [[8, 4, 4]], + over_shape_values, + [[8, 4, 4]], use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, - batch_size=8) + batch_size=8, + ) # InferenceServerException will be raised from different namespace, # use dynamic type characteristic to catch both ex except Exception as ex: self.assertTrue( "The shape value at index 2 is expected to be in range from 1 to 32, Got: 33" - in ex.message()) + in ex.message() + ) # Dynamic Batcher tests def test_dynamic_different_shape_values(self): @@ -257,22 +310,27 @@ def test_dynamic_different_shape_values(self): threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(3, (6000, None)), - kwargs={ - 'shape_values': [[2, 2]], - 'dummy_input_shapes': [[16, 16]], - 'shm_suffix': '{}'.format(len(threads)) - })) + threading.Thread( + target=self.check_response, + args=(3, (6000, None)), + kwargs={ + "shape_values": [[2, 2]], + "dummy_input_shapes": [[16, 16]], + "shm_suffix": "{}".format(len(threads)), + }, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(3, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), - kwargs={ - 'shape_values': [[4, 4]], - 'dummy_input_shapes': [[16, 16]], - 'shm_suffix': '{}'.format(len(threads)) - })) + threading.Thread( + target=self.check_response, + args=(3, (_max_queue_delay_ms * 1.5, _max_queue_delay_ms)), + kwargs={ + "shape_values": [[4, 4]], + "dummy_input_shapes": [[16, 16]], + "shm_suffix": "{}".format(len(threads)), + }, + ) + ) threads[0].start() time.sleep(1) threads[1].start() @@ -295,21 +353,27 @@ def test_dynamic_identical_shape_values(self): threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(4, (6000, None)), - kwargs={ - 'shape_values': [[4, 4]], - 'dummy_input_shapes': [[16, 16]], - 'shm_suffix': '{}'.format(len(threads)) - })) + threading.Thread( + target=self.check_response, + args=(4, (6000, None)), + kwargs={ + "shape_values": [[4, 4]], + "dummy_input_shapes": [[16, 16]], + "shm_suffix": "{}".format(len(threads)), + }, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, (6000, None)), - kwargs={ - 'shape_values': [[4, 4]], - 'dummy_input_shapes': [[16, 16]], - 'shm_suffix': '{}'.format(len(threads)) - })) + threading.Thread( + target=self.check_response, + args=(2, (6000, None)), + kwargs={ + "shape_values": [[4, 4]], + "dummy_input_shapes": [[16, 16]], + "shm_suffix": "{}".format(len(threads)), + }, + ) + ) threads[0].start() time.sleep(1) threads[1].start() @@ -322,7 +386,6 @@ def test_dynamic_identical_shape_values(self): class SequenceBatcherShapeTensorTest(su.SequenceBatcherTestUtil): - def get_expected_result(self, expected_result, value, flag_str=None): # Adjust the 
expected_result for models expected_result = value @@ -345,20 +408,21 @@ def test_sequence_identical_shape_values(self): # Need scheduler to wait for queue to contain all # inferences for both sequences. self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), - 12) - self.assertTrue( - "TRITONSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 12) + self.assertTrue("TRITONSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0) precreated_shm0_handles = self.precreate_register_shape_tensor_regions( - ((2, 1), (4, 2), (8, 3)), dtype, 0) + ((2, 1), (4, 2), (8, 3)), dtype, 0 + ) precreated_shm1_handles = self.precreate_register_shape_tensor_regions( - ((2, 11), (4, 12), (8, 13)), dtype, 1) + ((2, 11), (4, 12), (8, 13)), dtype, 1 + ) precreated_shm2_handles = self.precreate_register_shape_tensor_regions( - ((2, 111), (4, 112), (8, 113)), dtype, 2) + ((2, 111), (4, 112), (8, 113)), dtype, 2 + ) precreated_shm3_handles = self.precreate_register_shape_tensor_regions( - ((2, 1111), (4, 1112), (8, 1113)), dtype, 3) + ((2, 1111), (4, 1112), (8, 1113)), dtype, 3 + ) threads = [] threads.append( threading.Thread( @@ -369,12 +433,17 @@ def test_sequence_identical_shape_values(self): 1001, (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 2, 1, None), (None, 4, 2, None), ("end", 8, - 3, None)), + ( + ("start", 2, 1, None), + (None, 4, 2, None), + ("end", 8, 3, None), + ), self.get_expected_result(6, 3, "end"), - precreated_shm0_handles), - kwargs={'sequence_name': "{}".format(self._testMethodName) - })) + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -384,12 +453,17 @@ def test_sequence_identical_shape_values(self): 1002, (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 2, 11, None), (None, 4, 12, None), - ("end", 8, 13, None)), + ( + ("start", 2, 11, None), + (None, 4, 12, None), + ("end", 8, 13, None), + ), self.get_expected_result(36, 13, "end"), - precreated_shm1_handles), - kwargs={'sequence_name': "{}".format(self._testMethodName) - })) + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -399,12 +473,17 @@ def test_sequence_identical_shape_values(self): 1003, (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 2, 111, None), (None, 4, 112, None), - ("end", 8, 113, None)), + ( + ("start", 2, 111, None), + (None, 4, 112, None), + ("end", 8, 113, None), + ), self.get_expected_result(336, 113, "end"), - precreated_shm2_handles), - kwargs={'sequence_name': "{}".format(self._testMethodName) - })) + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -414,12 +493,17 @@ def test_sequence_identical_shape_values(self): 1004, (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 2, 1111, None), (None, 4, 1112, None), - ("end", 8, 1113, None)), + ( + ("start", 2, 1111, None), + (None, 4, 1112, None), + ("end", 8, 1113, None), + ), 
self.get_expected_result(3336, 1113, "end"), - precreated_shm3_handles), - kwargs={'sequence_name': "{}".format(self._testMethodName) - })) + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) for t in threads: t.start() @@ -447,13 +531,17 @@ def test_sequence_different_shape_values(self): dtype = np.float32 precreated_shm0_handles = self.precreate_register_shape_tensor_regions( - ((1, 1), (1, 2), (1, 3)), dtype, 0) + ((1, 1), (1, 2), (1, 3)), dtype, 0 + ) precreated_shm1_handles = self.precreate_register_shape_tensor_regions( - ((32, 11), (32, 12), (32, 13)), dtype, 1) + ((32, 11), (32, 12), (32, 13)), dtype, 1 + ) precreated_shm2_handles = self.precreate_register_shape_tensor_regions( - ((16, 111), (16, 112), (16, 113)), dtype, 2) + ((16, 111), (16, 112), (16, 113)), dtype, 2 + ) precreated_shm3_handles = self.precreate_register_shape_tensor_regions( - ((1, 1111), (1, 1112), (1, 1113)), dtype, 3) + ((1, 1111), (1, 1112), (1, 1113)), dtype, 3 + ) try: model_name = tu.get_sequence_model_name("plan", dtype) self.check_setup(model_name) @@ -461,12 +549,9 @@ def test_sequence_different_shape_values(self): # Need scheduler to wait for queue to contain all # inferences for both sequences. self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), - 12) - self.assertTrue( - "TRITONSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 12) + self.assertTrue("TRITONSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0) threads = [] threads.append( @@ -478,12 +563,17 @@ def test_sequence_different_shape_values(self): 1001, (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 1, 1, None), (None, 1, 2, None), ("end", 1, - 3, None)), + ( + ("start", 1, 1, None), + (None, 1, 2, None), + ("end", 1, 3, None), + ), self.get_expected_result(6, 3, "end"), - precreated_shm0_handles), - kwargs={'sequence_name': "{}".format(self._testMethodName) - })) + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -493,12 +583,17 @@ def test_sequence_different_shape_values(self): 1002, (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 32, 11, None), (None, 32, 12, None), - ("end", 32, 13, None)), + ( + ("start", 32, 11, None), + (None, 32, 12, None), + ("end", 32, 13, None), + ), self.get_expected_result(36, 13, "end"), - precreated_shm1_handles), - kwargs={'sequence_name': "{}".format(self._testMethodName) - })) + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -508,12 +603,17 @@ def test_sequence_different_shape_values(self): 1003, (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 16, 111, None), (None, 16, 112, None), - ("end", 16, 113, None)), + ( + ("start", 16, 111, None), + (None, 16, 112, None), + ("end", 16, 113, None), + ), self.get_expected_result(336, 113, "end"), - precreated_shm2_handles), - kwargs={'sequence_name': "{}".format(self._testMethodName) - })) + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) 
+ ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -523,12 +623,17 @@ def test_sequence_different_shape_values(self): 1004, (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 1, 1111, None), (None, 1, 1112, None), - ("end", 1, 1113, None)), + ( + ("start", 1, 1111, None), + (None, 1, 1112, None), + ("end", 1, 1113, None), + ), self.get_expected_result(3336, 1113, "end"), - precreated_shm3_handles), - kwargs={'sequence_name': "{}".format(self._testMethodName) - })) + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) for t in threads: t.start() @@ -549,12 +654,7 @@ def test_sequence_different_shape_values(self): class DynaSequenceBatcherTest(su.SequenceBatcherTestUtil): - - def get_expected_result(self, - expected_result, - corrid, - value, - flag_str=None): + def get_expected_result(self, expected_result, corrid, value, flag_str=None): expected_result = value if flag_str is not None: if "start" in flag_str: @@ -568,20 +668,23 @@ def _multi_sequence_different_shape_impl(self, sleep_secs): dtype = np.float32 precreated_shm0_handles = self.precreate_register_dynaseq_shape_tensor_regions( - ((1, 1), (12, 2), (2, 3)), dtype, 0) + ((1, 1), (12, 2), (2, 3)), dtype, 0 + ) precreated_shm1_handles = self.precreate_register_dynaseq_shape_tensor_regions( - ((3, 11), (4, 12), (5, 13)), dtype, 1) + ((3, 11), (4, 12), (5, 13)), dtype, 1 + ) precreated_shm2_handles = self.precreate_register_dynaseq_shape_tensor_regions( - ((6, 111), (7, 112), (8, 113)), dtype, 2) + ((6, 111), (7, 112), (8, 113)), dtype, 2 + ) precreated_shm3_handles = self.precreate_register_dynaseq_shape_tensor_regions( - ((9, 1111), (10, 1112), (11, 1113)), dtype, 3) + ((9, 1111), (10, 1112), (11, 1113)), dtype, 3 + ) try: model_name = tu.get_dyna_sequence_model_name("plan", dtype) self.check_setup(model_name) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertFalse( - "TRITONSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ) + self.assertFalse("TRITONSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ) corrids = [1001, 1002, 1003, 1004] threads = [] @@ -594,17 +697,22 @@ def _multi_sequence_different_shape_impl(self, sleep_secs): corrids[0], (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 1, 1, None), (None, 12, 2, None), ("end", 2, - 3, None)), - self.get_expected_result(4 + corrids[0], corrids[0], 3, - "end"), - precreated_shm0_handles), + ( + ("start", 1, 1, None), + (None, 12, 2, None), + ("end", 2, 3, None), + ), + self.get_expected_result(4 + corrids[0], corrids[0], 3, "end"), + precreated_shm0_handles, + ), kwargs={ - 'sequence_name': - "{}_{}".format(self._testMethodName, corrids[0]), - 'using_dynamic_batcher': - True - })) + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[0] + ), + "using_dynamic_batcher": True, + }, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -614,17 +722,24 @@ def _multi_sequence_different_shape_impl(self, sleep_secs): corrids[1], (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 3, 11, None), (None, 4, 12, None), - ("end", 5, 13, None)), - self.get_expected_result(36 + corrids[1], corrids[1], - 13, "end"), - precreated_shm1_handles), + ( + ("start", 3, 11, None), + (None, 4, 12, None), + ("end", 5, 13, None), + ), + self.get_expected_result( + 36 + corrids[1], corrids[1], 13, "end" + ), + precreated_shm1_handles, + ), kwargs={ - 'sequence_name': - 
"{}_{}".format(self._testMethodName, corrids[1]), - 'using_dynamic_batcher': - True - })) + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[1] + ), + "using_dynamic_batcher": True, + }, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -634,17 +749,24 @@ def _multi_sequence_different_shape_impl(self, sleep_secs): corrids[2], (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 6, 111, None), (None, 7, 112, None), - ("end", 8, 113, None)), - self.get_expected_result(336 + corrids[2], corrids[2], - 113, "end"), - precreated_shm2_handles), + ( + ("start", 6, 111, None), + (None, 7, 112, None), + ("end", 8, 113, None), + ), + self.get_expected_result( + 336 + corrids[2], corrids[2], 113, "end" + ), + precreated_shm2_handles, + ), kwargs={ - 'sequence_name': - "{}_{}".format(self._testMethodName, corrids[2]), - 'using_dynamic_batcher': - True - })) + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[2] + ), + "using_dynamic_batcher": True, + }, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -654,17 +776,24 @@ def _multi_sequence_different_shape_impl(self, sleep_secs): corrids[3], (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 9, 1111, None), (None, 10, 1112, None), - ("end", 11, 1113, None)), - self.get_expected_result(3336 + corrids[3], corrids[3], - 1113, "end"), - precreated_shm3_handles), + ( + ("start", 9, 1111, None), + (None, 10, 1112, None), + ("end", 11, 1113, None), + ), + self.get_expected_result( + 3336 + corrids[3], corrids[3], 1113, "end" + ), + precreated_shm3_handles, + ), kwargs={ - 'sequence_name': - "{}_{}".format(self._testMethodName, corrids[3]), - 'using_dynamic_batcher': - True - })) + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[3] + ), + "using_dynamic_batcher": True, + }, + ) + ) for t in threads: t.start() @@ -688,21 +817,24 @@ def _multi_sequence_identical_shape_impl(self, sleep_secs): dtype = np.float32 precreated_shm0_handles = self.precreate_register_dynaseq_shape_tensor_regions( - ((2, 1), (4, 2), (8, 3)), dtype, 0) + ((2, 1), (4, 2), (8, 3)), dtype, 0 + ) precreated_shm1_handles = self.precreate_register_dynaseq_shape_tensor_regions( - ((2, 11), (4, 12), (8, 13)), dtype, 1) + ((2, 11), (4, 12), (8, 13)), dtype, 1 + ) precreated_shm2_handles = self.precreate_register_dynaseq_shape_tensor_regions( - ((2, 111), (4, 112), (8, 113)), dtype, 2) + ((2, 111), (4, 112), (8, 113)), dtype, 2 + ) precreated_shm3_handles = self.precreate_register_dynaseq_shape_tensor_regions( - ((2, 1111), (4, 1112), (8, 1113)), dtype, 3) + ((2, 1111), (4, 1112), (8, 1113)), dtype, 3 + ) try: model_name = tu.get_dyna_sequence_model_name("plan", dtype) self.check_setup(model_name) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertFalse( - "TRITONSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ) + self.assertFalse("TRITONSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ) corrids = [1001, 1002, 1003, 1004] threads = [] @@ -715,17 +847,22 @@ def _multi_sequence_identical_shape_impl(self, sleep_secs): corrids[0], (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 2, 1, None), (None, 4, 2, None), ("end", 8, - 3, None)), - self.get_expected_result(4 + corrids[0], corrids[0], 3, - "end"), - precreated_shm0_handles), + ( + ("start", 2, 1, None), + (None, 4, 2, None), + ("end", 8, 3, None), + ), + self.get_expected_result(4 + corrids[0], corrids[0], 3, "end"), + 
precreated_shm0_handles, + ), kwargs={ - 'sequence_name': - "{}_{}".format(self._testMethodName, corrids[0]), - 'using_dynamic_batcher': - True - })) + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[0] + ), + "using_dynamic_batcher": True, + }, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -735,17 +872,24 @@ def _multi_sequence_identical_shape_impl(self, sleep_secs): corrids[1], (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 2, 11, None), (None, 4, 12, None), - ("end", 8, 13, None)), - self.get_expected_result(36 + corrids[1], corrids[1], - 13, "end"), - precreated_shm1_handles), + ( + ("start", 2, 11, None), + (None, 4, 12, None), + ("end", 8, 13, None), + ), + self.get_expected_result( + 36 + corrids[1], corrids[1], 13, "end" + ), + precreated_shm1_handles, + ), kwargs={ - 'sequence_name': - "{}_{}".format(self._testMethodName, corrids[1]), - 'using_dynamic_batcher': - True - })) + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[1] + ), + "using_dynamic_batcher": True, + }, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -755,17 +899,24 @@ def _multi_sequence_identical_shape_impl(self, sleep_secs): corrids[2], (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 2, 111, None), (None, 4, 112, None), - ("end", 8, 113, None)), - self.get_expected_result(336 + corrids[2], corrids[2], - 113, "end"), - precreated_shm2_handles), + ( + ("start", 2, 111, None), + (None, 4, 112, None), + ("end", 8, 113, None), + ), + self.get_expected_result( + 336 + corrids[2], corrids[2], 113, "end" + ), + precreated_shm2_handles, + ), kwargs={ - 'sequence_name': - "{}_{}".format(self._testMethodName, corrids[2]), - 'using_dynamic_batcher': - True - })) + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[2] + ), + "using_dynamic_batcher": True, + }, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -775,17 +926,24 @@ def _multi_sequence_identical_shape_impl(self, sleep_secs): corrids[3], (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 2, 1111, None), (None, 4, 1112, None), - ("end", 8, 1113, None)), - self.get_expected_result(3336 + corrids[3], corrids[3], - 1113, "end"), - precreated_shm3_handles), + ( + ("start", 2, 1111, None), + (None, 4, 1112, None), + ("end", 8, 1113, None), + ), + self.get_expected_result( + 3336 + corrids[3], corrids[3], 1113, "end" + ), + precreated_shm3_handles, + ), kwargs={ - 'sequence_name': - "{}_{}".format(self._testMethodName, corrids[3]), - 'using_dynamic_batcher': - True - })) + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[3] + ), + "using_dynamic_batcher": True, + }, + ) + ) for t in threads: t.start() @@ -827,5 +985,5 @@ def test_dynaseq_different_shape_values_parallel(self): self._multi_sequence_different_shape_impl(0) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_vertex_ai/test.sh b/qa/L0_vertex_ai/test.sh old mode 100644 new mode 100755 index d334d6c886..3113a66d1f --- a/qa/L0_vertex_ai/test.sh +++ b/qa/L0_vertex_ai/test.sh @@ -106,7 +106,7 @@ function vertex_ai_wait_for_server_ready() { WAIT_RET=1 } -# Helper function to unset all AIP vairables before test +# Helper function to unset all AIP variables before test function unset_vertex_variables() { unset AIP_MODE unset AIP_HTTP_PORT @@ -418,7 +418,7 @@ else fi fi -# Test AIP_STORAGE_URI won't be used if model 
repository is specified +# Test AIP_STORAGE_URI won't be used if model repository is specified SERVER_ARGS="--model-repository=single_model" run_server_nowait vertex_ai_wait_for_server_ready $SERVER_PID 10 diff --git a/qa/L0_vertex_ai/vertex_ai_test.py b/qa/L0_vertex_ai/vertex_ai_test.py old mode 100644 new mode 100755 index 77f78aad36..b6f9fc42b4 --- a/qa/L0_vertex_ai/vertex_ai_test.py +++ b/qa/L0_vertex_ai/vertex_ai_test.py @@ -30,34 +30,30 @@ sys.path.append("../common") import os +import sys import unittest + import numpy as np +import requests import test_util as tu import tritonclient.http as httpclient -import os -import requests -import sys - class VertexAiTest(tu.TestResultCollector): - def setUp(self): - port = os.getenv('AIP_HTTP_PORT', '8080') - predict_endpoint = os.getenv('AIP_PREDICT_ROUTE', '/predict') - self.model_ = os.getenv('TEST_EXPLICIT_MODEL_NAME', 'addsub') + port = os.getenv("AIP_HTTP_PORT", "8080") + predict_endpoint = os.getenv("AIP_PREDICT_ROUTE", "/predict") + self.model_ = os.getenv("TEST_EXPLICIT_MODEL_NAME", "addsub") self.url_ = "http://localhost:{}{}".format(port, predict_endpoint) - self.input_data_ = [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - ] + self.input_data_ = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] self.expected_output0_data_ = [x * 2 for x in self.input_data_] self.expected_output1_data_ = [0 for x in self.input_data_] def test_predict(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -65,22 +61,20 @@ def test_predict(self): inputs[0].set_data_from_numpy(input_data, binary_data=False) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) request_body, _ = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + inputs, outputs=outputs + ) - headers = {'Content-Type': 'application/json'} + headers = {"Content-Type": "application/json"} r = requests.post(self.url_, data=request_body, headers=headers) r.raise_for_status() - result = httpclient.InferenceServerClient.parse_response_body( - r._content) + result = httpclient.InferenceServerClient.parse_response_body(r._content) - output0_data = result.as_numpy('OUTPUT0') - output1_data = result.as_numpy('OUTPUT1') + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") for i in range(16): self.assertEqual(output0_data[0][i], self.expected_output0_data_[i]) self.assertEqual(output1_data[0][i], self.expected_output1_data_[i]) @@ -88,8 +82,8 @@ def test_predict(self): def test_predict_specified_model(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, 
dtype=np.int32) @@ -97,27 +91,23 @@ def test_predict_specified_model(self): inputs[0].set_data_from_numpy(input_data, binary_data=False) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) request_body, _ = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/json', - "X-Vertex-Ai-Triton-Redirect": - "v2/models/{}/infer".format(self.model_) + "Content-Type": "application/json", + "X-Vertex-Ai-Triton-Redirect": "v2/models/{}/infer".format(self.model_), } r = requests.post(self.url_, data=request_body, headers=headers) r.raise_for_status() - result = httpclient.InferenceServerClient.parse_response_body( - r._content) + result = httpclient.InferenceServerClient.parse_response_body(r._content) - output0_data = result.as_numpy('OUTPUT0') - output1_data = result.as_numpy('OUTPUT1') + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") if self.model_ == "addsub": expected_output0_data = [x * 2 for x in self.input_data_] expected_output1_data = [0 for x in self.input_data_] @@ -131,8 +121,8 @@ def test_predict_specified_model(self): def test_predict_request_binary(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -140,25 +130,26 @@ def test_predict_request_binary(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/vnd.vertex-ai-triton.binary+json;json-header-size={}' - .format(header_length) + "Content-Type": "application/vnd.vertex-ai-triton.binary+json;json-header-size={}".format( + header_length + ) } r = requests.post(self.url_, data=request_body, headers=headers) r.raise_for_status() - result = httpclient.InferenceServerClient.parse_response_body( - r._content) - output0_data = result.as_numpy('OUTPUT0') - output1_data = result.as_numpy('OUTPUT1') + result = httpclient.InferenceServerClient.parse_response_body(r._content) + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") for i in range(16): self.assertEqual(output0_data[0][i], self.expected_output0_data_[i]) self.assertEqual(output1_data[0][i], self.expected_output1_data_[i]) @@ -166,8 +157,8 @@ def test_predict_request_binary(self): def 
test_predict_response_binary(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -175,23 +166,23 @@ def test_predict_response_binary(self): inputs[0].set_data_from_numpy(input_data, binary_data=False) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=True)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) request_body, _ = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + inputs, outputs=outputs + ) - headers = {'Content-Type': 'application/json'} + headers = {"Content-Type": "application/json"} r = requests.post(self.url_, data=request_body, headers=headers) r.raise_for_status() - header_length_str = r.headers['Inference-Header-Content-Length'] + header_length_str = r.headers["Inference-Header-Content-Length"] result = httpclient.InferenceServerClient.parse_response_body( - r._content, header_length=int(header_length_str)) + r._content, header_length=int(header_length_str) + ) - output0_data = result.as_numpy('OUTPUT0') - output1_data = result.as_numpy('OUTPUT1') + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") for i in range(16): self.assertEqual(output0_data[0][i], self.expected_output0_data_[i]) self.assertEqual(output1_data[0][i], self.expected_output1_data_[i]) @@ -199,8 +190,8 @@ def test_predict_response_binary(self): def test_malformed_binary_header(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -208,29 +199,34 @@ def test_malformed_binary_header(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'additional-string/application/vnd.vertex-ai-triton.binary+json;json-header-size={}' - .format(header_length) + "Content-Type": "additional-string/application/vnd.vertex-ai-triton.binary+json;json-header-size={}".format( + header_length + ) } r = requests.post(self.url_, data=request_body, headers=headers) self.assertEqual( - 400, r.status_code, + 400, + r.status_code, "Expected error code {} returned for the request; got: 
{}".format( - 400, r.status_code)) + 400, r.status_code + ), + ) def test_malformed_binary_header_not_number(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -238,29 +234,34 @@ def test_malformed_binary_header_not_number(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/vnd.vertex-ai-triton.binary+json;json-header-size=additional-string{}' - .format(header_length) + "Content-Type": "application/vnd.vertex-ai-triton.binary+json;json-header-size=additional-string{}".format( + header_length + ) } r = requests.post(self.url_, data=request_body, headers=headers) self.assertEqual( - 400, r.status_code, + 400, + r.status_code, "Expected error code {} returned for the request; got: {}".format( - 400, r.status_code)) + 400, r.status_code + ), + ) def test_malformed_binary_header_negative_number(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -268,28 +269,32 @@ def test_malformed_binary_header_negative_number(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/vnd.vertex-ai-triton.binary+json;json-header-size=-123' + "Content-Type": "application/vnd.vertex-ai-triton.binary+json;json-header-size=-123" } r = requests.post(self.url_, data=request_body, headers=headers) self.assertEqual( - 400, r.status_code, + 400, + r.status_code, "Expected error code {} returned for the request; got: {}".format( - 400, r.status_code)) + 400, r.status_code + ), + ) def test_malformed_binary_header_large_number(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - 
inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -297,23 +302,27 @@ def test_malformed_binary_header_large_number(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/vnd.vertex-ai-triton.binary+json;json-header-size=12345' + "Content-Type": "application/vnd.vertex-ai-triton.binary+json;json-header-size=12345" } r = requests.post(self.url_, data=request_body, headers=headers) self.assertEqual( - 400, r.status_code, + 400, + r.status_code, "Expected error code {} returned for the request; got: {}".format( - 400, r.status_code)) + 400, r.status_code + ), + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_warmup/decoupled/1/model.py b/qa/L0_warmup/decoupled/1/model.py old mode 100644 new mode 100755 index db7c6903f5..52481ae83f --- a/qa/L0_warmup/decoupled/1/model.py +++ b/qa/L0_warmup/decoupled/1/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,11 +30,12 @@ class TritonPythonModel: - """Test model that always returns 0 response for all requests. """ + """Test model that always returns 0 response for all requests.""" def execute(self, requests): for request in requests: request.get_response_sender().send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) return None diff --git a/qa/L0_warmup/failing_infer/1/model.py b/qa/L0_warmup/failing_infer/1/model.py old mode 100644 new mode 100755 index 1935fe6cd9..65814c77d4 --- a/qa/L0_warmup/failing_infer/1/model.py +++ b/qa/L0_warmup/failing_infer/1/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,7 +30,7 @@ class TritonPythonModel: - """Test model that always returns error for all requests. 
""" + """Test model that always returns error for all requests.""" def execute(self, requests): responses = [] @@ -36,8 +38,9 @@ def execute(self, requests): for _ in requests: responses.append( pb_utils.InferenceResponse( - output_tensors=[], - error=pb_utils.TritonError("An Error Occurred"))) + output_tensors=[], error=pb_utils.TritonError("An Error Occurred") + ) + ) # You must return a list of pb_utils.InferenceResponse. Length # of this list must match the length of `requests` list. diff --git a/qa/L0_warmup/test.sh b/qa/L0_warmup/test.sh old mode 100644 new mode 100755 diff --git a/qa/common/check_copyright.py b/qa/common/check_copyright.py index f5d84995e0..ff18ca8e39 100755 --- a/qa/common/check_copyright.py +++ b/qa/common/check_copyright.py @@ -28,44 +28,68 @@ import argparse import os -import re import pathlib +import re FLAGS = None -SKIP_EXTS = ('jpeg', 'jpg', 'pgm', 'png', 'log', 'preprocessed', 'jmx', 'gz', - 'json', 'pdf', 'so', 'onnx', 'svg') -REPO_PATH_FROM_THIS_FILE = '../..' +SKIP_EXTS = ( + "jpeg", + "jpg", + "pgm", + "png", + "log", + "preprocessed", + "jmx", + "gz", + "json", + "pdf", + "so", + "onnx", + "svg", +) +REPO_PATH_FROM_THIS_FILE = "../.." SKIP_PATHS = ( - 'build', 'deploy/gke-marketplace-app/.gitignore', - 'deploy/gke-marketplace-app/server-deployer/chart/.helmignore', - 'deploy/gcp/.helmignore', 'deploy/aws/.helmignore', - 'deploy/fleetcommand/.helmignore', 'docs/.gitignore', - 'docs/_static/.gitattributes', 'docs/examples/model_repository', - 'docs/examples/jetson', 'docker', 'qa/common/cuda_op_kernel.cu.cc.patch', - 'qa/ensemble_models/mix_platform_float32_float32_float32/output0_labels.txt', - 'qa/ensemble_models/mix_type_int32_float32_float32/output0_labels.txt', - 'qa/ensemble_models/mix_ensemble_int32_float32_float32/output0_labels.txt', - 'qa/ensemble_models/wrong_label_int32_float32_float32/output0_labels.txt', - 'qa/ensemble_models/label_override_int32_float32_float32/output0_labels.txt', - 'qa/L0_model_config/noautofill_platform', - 'qa/L0_model_config/autofill_noplatform', - 'qa/L0_model_config/autofill_noplatform_success', - 'qa/L0_model_config/special_cases', - 'qa/L0_model_config/cli_messages/cli_override/expected', - 'qa/L0_model_config/cli_messages/cli_deprecation/expected', - 'qa/L0_model_namespacing/test_duplication', - 'qa/L0_model_namespacing/test_dynamic_resolution', - 'qa/L0_model_namespacing/test_ensemble_duplication', - 'qa/L0_model_namespacing/test_no_duplication', - 'qa/L0_perf_nomodel/baseline', 'qa/L0_perf_nomodel/legacy_baseline', - 'qa/L0_warmup/raw_mug_data', 'qa/L0_java_resnet/expected_output_data', - 'qa/L0_trt_dla_jetson/trt_dla_model_store', - 'qa/openvino_models/dynamic_batch', 'qa/openvino_models/fixed_batch', - 'CITATION.cff', 'TRITON_VERSION') + "build", + "deploy/gke-marketplace-app/.gitignore", + "deploy/gke-marketplace-app/server-deployer/chart/.helmignore", + "deploy/gcp/.helmignore", + "deploy/aws/.helmignore", + "deploy/fleetcommand/.helmignore", + "docs/.gitignore", + "docs/_static/.gitattributes", + "docs/examples/model_repository", + "docs/examples/jetson", + "docker", + "qa/common/cuda_op_kernel.cu.cc.patch", + "qa/ensemble_models/mix_platform_float32_float32_float32/output0_labels.txt", + "qa/ensemble_models/mix_type_int32_float32_float32/output0_labels.txt", + "qa/ensemble_models/mix_ensemble_int32_float32_float32/output0_labels.txt", + "qa/ensemble_models/wrong_label_int32_float32_float32/output0_labels.txt", + "qa/ensemble_models/label_override_int32_float32_float32/output0_labels.txt", + 
"qa/L0_model_config/noautofill_platform", + "qa/L0_model_config/autofill_noplatform", + "qa/L0_model_config/autofill_noplatform_success", + "qa/L0_model_config/special_cases", + "qa/L0_model_config/cli_messages/cli_override/expected", + "qa/L0_model_config/cli_messages/cli_deprecation/expected", + "qa/L0_model_namespacing/test_duplication", + "qa/L0_model_namespacing/test_dynamic_resolution", + "qa/L0_model_namespacing/test_ensemble_duplication", + "qa/L0_model_namespacing/test_no_duplication", + "qa/L0_perf_nomodel/baseline", + "qa/L0_perf_nomodel/legacy_baseline", + "qa/L0_warmup/raw_mug_data", + "qa/L0_java_resnet/expected_output_data", + "qa/L0_trt_dla_jetson/trt_dla_model_store", + "qa/openvino_models/dynamic_batch", + "qa/openvino_models/fixed_batch", + "CITATION.cff", + "TRITON_VERSION", +) -COPYRIGHT_YEAR_RE = 'Copyright( \\(c\\))? 20[1-9][0-9](-(20)?[1-9][0-9])?(,((20[2-9][0-9](-(20)?[2-9][0-9])?)|([2-9][0-9](-[2-9][0-9])?)))*,? NVIDIA CORPORATION( & AFFILIATES)?. All rights reserved.' +COPYRIGHT_YEAR_RE = "Copyright( \\(c\\))? 20[1-9][0-9](-(20)?[1-9][0-9])?(,((20[2-9][0-9](-(20)?[2-9][0-9])?)|([2-9][0-9](-[2-9][0-9])?)))*,? NVIDIA CORPORATION( & AFFILIATES)?. All rights reserved." -COPYRIGHT = ''' +COPYRIGHT = """ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -90,10 +114,11 @@ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -''' +""" -repo_abs_path = pathlib.Path(__file__).parent.joinpath( - REPO_PATH_FROM_THIS_FILE).resolve() +repo_abs_path = ( + pathlib.Path(__file__).parent.joinpath(REPO_PATH_FROM_THIS_FILE).resolve() +) copyright_year_re = re.compile(COPYRIGHT_YEAR_RE) @@ -103,19 +128,20 @@ def visit(path): print("visiting " + path) for skip in SKIP_EXTS: - if path.endswith('.' + skip): + if path.endswith("." + skip): if FLAGS.verbose: print("skipping due to extension: " + path) return True for skip in SKIP_PATHS: if str(pathlib.Path(path).resolve()).startswith( - str(repo_abs_path.joinpath(skip).resolve())): + str(repo_abs_path.joinpath(skip).resolve()) + ): if FLAGS.verbose: print("skipping due to path prefix: " + path) return True - with open(path, 'r') as f: + with open(path, "r") as f: first_line = True line = None try: @@ -126,9 +152,13 @@ def visit(path): # start of the file if first_line: first_line = False - if (fline.startswith("#!") or fline.startswith("..") or - fline.startswith("