From 2057e6d77a5ce49a18f314f5918e1648884cb688 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Mon, 3 Jul 2023 22:24:14 -0700 Subject: [PATCH 01/39] Add pre-commit --- .github/workflows/pre-commit.yaml | 41 +++++++++++++++++ .pre-commit-config.yaml | 74 +++++++++++++++++++++++++++++++ pyproject.toml | 49 ++++++++++++++++++++ 3 files changed, 164 insertions(+) create mode 100644 .github/workflows/pre-commit.yaml create mode 100644 .pre-commit-config.yaml create mode 100644 pyproject.toml diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml new file mode 100644 index 0000000000..190610a7aa --- /dev/null +++ b/.github/workflows/pre-commit.yaml @@ -0,0 +1,41 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + - uses: pre-commit/action@v3.0.0 + diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..6c03a4ad6c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,74 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +repos: +- repo: https://github.com/timothycrosley/isort + rev: 5.12.0 + hooks: + - id: isort + additional_dependencies: [toml] +- repo: https://github.com/psf/black + rev: 23.1.0 + hooks: + - id: black + types_or: [python, cython] +- repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 + hooks: + - id: flake8 + args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501] + types_or: [python, cython] +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v16.0.5 + hooks: + - id: clang-format + types_or: [c, c++, cuda, proto, textproto, java] + args: ["-fallback-style=none", "-style=file", "-i"] +- repo: https://github.com/codespell-project/codespell + rev: v2.2.4 + hooks: + - id: codespell + additional_dependencies: [tomli] + args: ["--toml", "pyproject.toml"] + exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$) +# More details about these pre-commit hooks here: +# https://pre-commit.com/hooks.html +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: check-case-conflict + - id: check-executables-have-shebangs + - id: check-merge-conflict + - id: check-json + - id: check-toml + - id: check-yaml + - id: check-shebang-scripts-are-executable + - id: end-of-file-fixer + types_or: [c, c++, cuda, proto, textproto, java, python] + - id: mixed-line-ending + - id: requirements-txt-fixer + - id: trailing-whitespace + diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..1a8da1f4d3 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,49 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +[tool.codespell] +# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - +# this is only to allow you to run codespell interactively +skip = "./.git,./.github" +# ignore short words, and typename parameters like OffsetT +ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" +# use the 'clear' dictionary for unambiguous spelling mistakes +builtin = "clear" +# disable warnings about binary files and wrong encoding +quiet-level = 3 + +[tool.isort] +profile = "black" +use_parentheses = true +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +ensure_newline_before_comments = true +line_length = 88 +balanced_wrapping = true +indent = " " +skip = ["build"] + From 36998f0d1113a3cd03d5a1dc2c0fd9dd2a354965 Mon Sep 17 00:00:00 2001 From: David Yastremsky Date: Tue, 4 Jul 2023 12:03:03 -0700 Subject: [PATCH 02/39] Fix typos, exec/shebang, formatting --- .github/workflows/codeql.yml | 6 +- .pre-commit-config.yaml | 2 + CMakeLists.txt | 2 +- Dockerfile.QA | 2 +- Dockerfile.sdk | 6 +- Dockerfile.win10.min | 4 +- README.md | 92 +- build.py | 2 +- compose.py | 417 +-- deploy/alibaba-cloud/README.md | 8 +- deploy/aws/README.md | 6 +- deploy/aws/templates/deployment.yaml | 4 +- deploy/fleetcommand/README.md | 4 +- deploy/gcp/README.md | 2 +- deploy/gke-marketplace-app/README.md | 28 +- .../gke-marketplace-app/benchmark/README.md | 15 +- .../model-store/bert_base_tf_gpu/config.pbtxt | 2 +- .../bert_base_trt_gpu/config.pbtxt | 2 +- .../bert_distill_tf_cpu/config.pbtxt | 2 +- .../bert_distill_tf_gpu/config.pbtxt | 2 +- .../perf-analyzer-script/perf_query.sh | 0 .../client-sample/bert_request.json | 6 +- .../client-sample/locustfile_bert.py | 17 +- .../client-sample/perf_analyzer_grpc.sh | 0 .../server-deployer/build_and_push.sh | 3 +- .../chart/triton/templates/application.yaml | 8 +- .../chart/triton/templates/deployment.yaml | 2 +- .../chart/triton/templates/service.yaml | 4 +- .../server-deployer/chart/triton/values.yaml | 2 +- .../server-deployer/data-test/schema.yaml | 20 +- .../server-deployer/schema.yaml | 20 +- .../gke-marketplace-app/trt-engine/README.md | 12 +- .../onnx_float32_int32_int32/config.pbtxt | 0 .../mlflow_triton/__init__.py | 6 +- .../mlflow_triton/config.py | 55 +- .../mlflow_triton/deployments.py | 299 +- .../scripts/publish_model_to_mlflow.py | 22 +- .../scripts/triton_flavor.py | 16 +- deploy/mlflow-triton-plugin/setup.py | 6 +- docker/cpu_only/entrypoint.d/12-banner.sh | 0 .../entrypoint.d/50-gpu-driver-check2.sh | 0 docker/entrypoint.d/50-gpu-driver-check2.sh | 0 .../56-network-driver-version-check.sh | 2 +- docker/entrypoint.d/70-shm-check.sh | 2 +- docker/entrypoint.d/99-check-run-aip-mode.sh | 0 docker/sagemaker/serve | 8 +- docs/Makefile | 2 +- docs/README.md | 30 +- docs/_static/custom.css | 4 +- docs/conf.py | 66 +- docs/customization_guide/build.md | 12 +- .../inference_protocols.md | 26 +- docs/examples/README.md | 2 +- .../concurrency_and_dynamic_batching/Makefile | 4 +- .../README.md | 22 
+- .../tao/convert_peoplenet.sh | 0 .../simple_identity/config.pbtxt | 0 docs/getting_started/quickstart.md | 8 +- docs/index.md | 8 +- docs/protocol/extension_logging.md | 22 +- .../protocol/extension_model_configuration.md | 2 +- docs/protocol/extension_parameters.md | 4 +- docs/protocol/extension_schedule_policy.md | 2 +- docs/protocol/extension_sequence.md | 4 +- docs/protocol/extension_statistics.md | 10 +- docs/protocol/extension_trace.md | 2 +- docs/user_guide/architecture.md | 12 +- docs/user_guide/custom_operations.md | 4 +- docs/user_guide/decoupled_models.md | 8 +- docs/user_guide/faq.md | 4 +- docs/user_guide/jetson.md | 4 +- docs/user_guide/metrics.md | 30 +- docs/user_guide/model_analyzer.md | 2 +- docs/user_guide/model_configuration.md | 68 +- docs/user_guide/model_management.md | 10 +- docs/user_guide/model_repository.md | 18 +- docs/user_guide/optimization.md | 10 +- docs/user_guide/performance_tuning.md | 8 +- docs/user_guide/rate_limiter.md | 4 +- docs/user_guide/response_cache.md | 68 +- qa/L0_async_work_queue/test.sh | 0 qa/L0_backend_config/test.sh | 50 +- qa/L0_backend_fastertransformer/test.sh | 2 +- qa/L0_backend_identity/identity_test.py | 192 +- .../models/argument_validation/1/model.py | 74 +- .../argument_validation/test.sh | 1 + qa/L0_backend_python/bls/test.sh | 8 +- qa/L0_backend_python/common.sh | 3 +- qa/L0_backend_python/custom_metrics/test.sh | 2 +- .../decoupled/decoupled_test.py | 108 +- .../decoupled/models/decoupled_bls/1/model.py | 127 +- .../models/decoupled_bls_stream/1/model.py | 72 +- .../models/decoupled_execute_error/1/model.py | 54 +- .../1/model.py | 48 +- .../1/model.py | 47 +- qa/L0_backend_python/decoupled/test.sh | 1 + .../ensemble/ensemble_test.py | 46 +- qa/L0_backend_python/ensemble/test.sh | 0 qa/L0_backend_python/env/test.sh | 2 +- qa/L0_backend_python/examples/test.sh | 2 +- qa/L0_backend_python/io/io_test.py | 74 +- qa/L0_backend_python/io/test.sh | 0 .../lifecycle/lifecycle_test.py | 59 +- qa/L0_backend_python/lifecycle/test.sh | 2 +- qa/L0_backend_python/logging/logging_test.py | 20 +- qa/L0_backend_python/logging/test.sh | 4 +- .../model_control/model_control_test.py | 23 +- qa/L0_backend_python/model_control/test.sh | 0 qa/L0_backend_python/python_test.py | 297 +- qa/L0_backend_python/python_unittest.py | 26 +- .../restart/models/restart/1/model.py | 23 +- qa/L0_backend_python/restart/restart_test.py | 23 +- qa/L0_backend_python/restart/test.sh | 0 qa/L0_backend_python/variants/test.sh | 2 +- qa/L0_batch_custom/batch_custom_test.py | 200 +- qa/L0_batch_custom/test.sh | 4 +- qa/L0_batch_input/batch_input_test.py | 170 +- qa/L0_batch_input/test.sh | 0 qa/L0_batcher/batcher_test.py | 1348 +++++---- qa/L0_batcher/test.sh | 2 +- qa/L0_batcher/verify_timestamps.py | 45 +- .../buffer_attributes_test.py | 65 +- qa/L0_buffer_attributes/models/bls/1/model.py | 23 +- .../models/identity/1/model.py | 10 +- qa/L0_buffer_attributes/test.sh | 3 +- qa/L0_client_build_variants/test.sh | 2 +- qa/L0_client_java/test.sh | 0 .../client_memory_mail.py | 12 +- .../models/custom_identity_int32/config.pbtxt | 2 +- qa/L0_client_memory_growth/test.sh | 2 +- qa/L0_client_nobatch/client_test.py | 200 +- qa/L0_client_timeout/client_timeout_test.py | 157 +- .../models/custom_identity_int32/config.pbtxt | 2 +- qa/L0_client_timeout/test.sh | 0 .../models/custom_identity_int32/config.pbtxt | 2 +- qa/L0_cmdline_trace/test.sh | 2 +- qa/L0_cmdline_trace/trace_client.py | 37 +- qa/L0_cuda_graph/test.sh | 0 qa/L0_cuda_graph/trt_cuda_graph_test.py | 72 +- 
.../cuda_shared_memory_test.py | 137 +- qa/L0_cuda_shared_memory/test.sh | 0 qa/L0_custom_ops/cuda_op_test.py | 66 +- qa/L0_custom_ops/mod_op_test.py | 77 +- qa/L0_custom_ops/onnx_op_test.py | 74 +- qa/L0_custom_ops/vision_op_test.py | 74 +- qa/L0_custom_ops/zero_out_test.py | 64 +- qa/L0_data_compression/test.sh | 0 qa/L0_data_compression/validation.py | 12 +- qa/L0_decoupled/decoupled_test.py | 400 +-- qa/L0_decoupled/test.sh | 16 +- qa/L0_device_memory_tracker/test.py | 32 +- qa/L0_device_memory_tracker/test.sh | 0 qa/L0_dlpack_multi_gpu/test.sh | 2 +- qa/L0_doc_links/test.sh | 3 +- qa/L0_dyna_implicit_state/test.sh | 0 .../dyna_sequence_batcher_test.py | 1016 ++++--- qa/L0_dyna_sequence_batcher/test.sh | 2 +- .../client_plugin_test/1/model.py | 25 +- qa/L0_grpc/grpc_basic_auth_test.py | 19 +- qa/L0_grpc/grpc_client_plugin_test.py | 36 +- qa/L0_grpc/python_grpc_aio_test.py | 23 +- qa/L0_grpc/python_unit_test.py | 93 +- qa/L0_grpc/test.sh | 2 +- qa/L0_http/http_basic_auth_test.py | 19 +- qa/L0_http/http_client_plugin_test.py | 64 +- qa/L0_http/http_test.py | 124 +- qa/L0_http/python_http_aio_test.py | 14 +- qa/L0_http/test.sh | 4 +- qa/L0_http_fuzz/fuzztest.py | 55 +- qa/L0_http_fuzz/test.sh | 6 +- qa/L0_https/test.sh | 16 +- qa/L0_implicit_state/implicit_state.py | 2 +- qa/L0_implicit_state/test.sh | 0 qa/L0_infer/infer_test.py | 1184 ++++---- qa/L0_infer/install_and_test.sh | 2 +- qa/L0_infer_reshape/infer_reshape_test.py | 252 +- qa/L0_infer_variable/infer_variable_test.py | 452 +-- qa/L0_infer_zero/infer_zero_test.py | 332 ++- qa/L0_inferentia_perf_analyzer/test.sh | 34 +- qa/L0_io/test.sh | 2 +- .../MemoryGrowthTest.java | 1481 +++++---- qa/L0_java_memory_growth/test.sh | 2 +- qa/L0_java_resnet/ResnetTest.java | 986 +++--- qa/L0_java_sequence_batcher/SequenceTest.java | 1019 +++---- qa/L0_json/test.sh | 0 qa/L0_large_payload/large_payload_test.py | 102 +- qa/L0_large_payload/test.sh | 0 qa/L0_libtorch_inference_mode/test.sh | 0 .../client.py | 26 +- .../gen_models.py | 18 +- .../models/libtorch_multi_device/config.pbtxt | 0 .../test.sh | 8 +- qa/L0_libtorch_io_names/io_names_client.py | 46 +- qa/L0_libtorch_io_names/test.sh | 0 qa/L0_libtorch_nvfuser/test.sh | 0 qa/L0_libtorch_optimized_execution/test.sh | 0 .../libtorch_shared_weights_test.py | 21 +- qa/L0_libtorch_shared_weights/test.sh | 3 +- qa/L0_lifecycle/lifecycle_test.py | 2459 ++++++++------- qa/L0_lifecycle/test.sh | 8 +- qa/L0_logging/logging_endpoint_test.py | 330 +- qa/L0_logging/test.sh | 14 +- qa/L0_long_running_stress/crashing_client.py | 60 +- qa/L0_long_running_stress/scenarios.py | 653 ++-- qa/L0_long_running_stress/stress.py | 508 ++-- qa/L0_long_running_stress/stress_mail.py | 28 +- qa/L0_memory/test.sh | 0 qa/L0_memory_growth/busy_op_test.py | 84 +- qa/L0_memory_growth/server_memory_mail.py | 22 +- qa/L0_metrics/metrics_test.py | 34 +- qa/L0_metrics/test.sh | 2 +- qa/L0_mlflow/plugin_test.py | 53 +- qa/L0_mlflow/test.sh | 10 +- .../conflicting_max_batch_size/model.py | 13 +- .../conflicting_scheduler_sequence/model.py | 13 +- .../python/input_missing_datatype/model.py | 13 +- .../python/input_missing_dims/model.py | 13 +- .../python/input_missing_name/model.py | 13 +- .../python/input_wrong_property/model.py | 19 +- .../python/no_return/model.py | 13 +- .../python/output_missing_datatype/model.py | 13 +- .../python/output_missing_dims/model.py | 13 +- .../python/output_missing_name/model.py | 13 +- .../python/output_wrong_property/model.py | 19 +- .../onnx/cpu_instance/config.pbtxt | 0 
.../openvino/partial_config/config.pbtxt | 0 .../conflicting_scheduler_ensemble/model.py | 9 +- .../ensemble_first_step/model.py | 9 +- .../ensemble_second_step/model.py | 9 +- .../python/dynamic_batching/model.py | 13 +- .../python/dynamic_batching_no_op/model.py | 13 +- .../python/incomplete_input/model.py | 11 +- .../reshape_config_provided/config.pbtxt | 0 qa/L0_model_config/compare_status.py | 45 +- qa/L0_model_config/noautofill_test.py | 8 +- qa/L0_model_config/test.sh | 6 +- .../python_addsub/__init__.py | 109 +- .../python_subadd/__init__.py | 109 +- qa/L0_model_namespacing/test.py | 101 +- qa/L0_model_namespacing/test.sh | 0 .../addsub_repo/composing_model/1/model.py | 4 +- .../addsub_repo/simple_addsub/config.pbtxt | 12 +- .../subadd_repo/composing_model/1/model.py | 4 +- .../subadd_repo/simple_subadd/config.pbtxt | 12 +- .../addsub_repo/composing_model/1/model.py | 4 +- .../addsub_repo/simple_addsub/config.pbtxt | 12 +- .../subadd_repo/composing_model/1/model.py | 4 +- .../subadd_repo/simple_subadd/config.pbtxt | 12 +- .../addsub_repo/composing_addsub/1/model.py | 4 +- .../addsub_repo/simple_ensemble/config.pbtxt | 12 +- .../subadd_repo/composing_subadd/1/model.py | 4 +- .../subadd_repo/simple_ensemble/config.pbtxt | 12 +- .../addsub_repo/composing_addsub/1/model.py | 4 +- .../addsub_repo/simple_addsub/config.pbtxt | 12 +- .../subadd_repo/composing_subadd/1/model.py | 4 +- .../subadd_repo/simple_subadd/config.pbtxt | 12 +- qa/L0_model_queue/model_queue_test.py | 392 ++- qa/L0_model_update/instance_update_test.py | 166 +- qa/L0_multi_server/test.sh | 0 .../models/nan_inf_output/1/model.py | 14 +- qa/L0_nan_inf/nan_inf_test.py | 49 +- .../nullchar_string_client.py | 63 +- qa/L0_nullchar_string/test.sh | 0 .../ensemble_identity_2_float32/config.pbtxt | 0 .../models/identity_2_float32/config.pbtxt | 0 .../pipeline_identity_2_float32/config.pbtxt | 0 qa/L0_optional_input/optional_input_test.py | 227 +- qa/L0_output_name/output_name_test.py | 20 +- qa/L0_output_name/test.sh | 0 qa/L0_output_validation/lt_op_val_client.py | 17 +- qa/L0_output_validation/test.sh | 0 qa/L0_parallel_copy/parallel_copy_test.py | 80 +- .../model_repository/parameter/1/model.py | 43 +- qa/L0_parameters/parameters_test.py | 162 +- qa/L0_parameters/test.sh | 8 +- .../config.pbtxt | 0 .../passive_instance_test.py | 15 +- qa/L0_passive_instance/test.sh | 0 qa/L0_perf_analyzer/test.sh | 24 +- qa/L0_perf_analyzer_doc_links/test.sh | 9 +- qa/L0_perf_analyzer_ground_truth/test.sh | 4 +- qa/L0_perf_analyzer_report/test.sh | 2 +- qa/L0_perf_kaldi/create_data.sh | 2 +- qa/L0_perf_kaldi/test.sh | 0 qa/L0_perf_nomodel/run_test.sh | 2 +- qa/L0_perf_pyclients/simple_perf_client.py | 317 +- qa/L0_perf_resnet/run_test.sh | 2 +- qa/L0_query/query_e2e.py | 108 +- qa/L0_query/test.sh | 0 qa/L0_rate_limiter/rate_limiter_test.py | 143 +- qa/L0_rate_limiter/test.sh | 2 +- qa/L0_register/test.sh | 0 qa/L0_repoagent_checksum/identity_test.py | 68 +- qa/L0_response_cache/test.sh | 8 +- qa/L0_sagemaker/sagemaker_multi_model_test.py | 226 +- qa/L0_sagemaker/sagemaker_test.py | 329 +- .../saved_model_shape_test.py | 302 +- qa/L0_savedmodel_shape/test.sh | 0 qa/L0_secure_grpc/test.sh | 14 +- .../sequence_batcher_test.py | 2 +- qa/L0_sequence_batcher/test.sh | 6 +- .../sequence_corrid_batcher_test.py | 139 +- qa/L0_sequence_stress/sequence_stress.py | 428 +-- qa/L0_server_status/server_status_test.py | 534 ++-- qa/L0_shared_memory/shared_memory_test.py | 164 +- qa/L0_shared_memory/test.sh | 0 qa/L0_simple_ensemble/ensemble_test.py | 73 +- 
qa/L0_simple_nodejs_client/test.sh | 0 qa/L0_socket/test.sh | 2 +- qa/L0_storage_S3_local/mock_s3_service.py | 36 +- qa/L0_storage_azure/test.sh | 2 +- qa/L0_storage_swiftstack/infer_test.py | 270 +- qa/L0_string_io/string_client_test.py | 152 +- qa/L0_tf_gpu_io/tf_gpu_io_test.py | 64 +- qa/L0_tf_parameters/test.sh | 0 qa/L0_tf_parameters/tf_parameter_test.py | 44 +- qa/L0_tf_tag_sigdef/tf_tag_sigdef_test.py | 13 +- qa/L0_tf_unknown_rank/test.sh | 0 qa/L0_tf_unknown_rank/tf_unknown_rank_test.py | 27 +- .../tftrt_optimization_test.py | 36 +- qa/L0_trace/test.sh | 2 +- qa/L0_trace/trace_endpoint_test.py | 423 ++- qa/L0_triton_repo_agent/test.sh | 0 .../trt_data_dependent_shape_test.py | 25 +- qa/L0_trt_dla/dla_test.py | 23 +- qa/L0_trt_dla/test.sh | 0 qa/L0_trt_dynamic_shape/test.sh | 2 +- .../trt_dynamic_shape_test.py | 76 +- .../trt_error_propagation_test.py | 28 +- qa/L0_trt_plugin/test.sh | 0 qa/L0_trt_plugin/trt_plugin_test.py | 56 +- .../trt_reformat_free_test.py | 194 +- qa/L0_trt_shape_tensors/test.sh | 2 +- .../trt_shape_tensor_test.py | 674 +++-- qa/L0_vertex_ai/test.sh | 4 +- qa/L0_vertex_ai/vertex_ai_test.py | 241 +- qa/L0_warmup/decoupled/1/model.py | 9 +- qa/L0_warmup/failing_infer/1/model.py | 11 +- qa/L0_warmup/test.sh | 0 qa/common/check_copyright.py | 192 +- qa/common/check_massif_log.py | 45 +- qa/common/check_valgrind_log.py | 42 +- qa/common/cuda_op_kernel.cu.cc.patch | 8 +- qa/common/gen_ensemble_model_utils.py | 626 ++-- qa/common/gen_qa_custom_ops | 6 +- qa/common/gen_qa_custom_ops_models.py | 239 +- .../gen_qa_dyna_sequence_implicit_models.py | 470 +-- qa/common/gen_qa_dyna_sequence_models.py | 825 ++--- qa/common/gen_qa_identity_models.py | 853 +++--- qa/common/gen_qa_implicit_models.py | 4 +- qa/common/gen_qa_model_repository | 6 +- qa/common/gen_qa_models.py | 2646 +++++++++++------ qa/common/gen_qa_noshape_models.py | 438 +-- qa/common/gen_qa_ragged_models.py | 442 +-- qa/common/gen_qa_reshape_models.py | 1364 ++++++--- qa/common/gen_qa_sequence_models.py | 812 +++-- qa/common/gen_qa_tf_parameters.py | 47 +- qa/common/gen_qa_torchtrt_models.py | 32 +- qa/common/gen_qa_trt_data_dependent_shape.py | 65 +- qa/common/gen_qa_trt_format_models.py | 351 ++- qa/common/gen_qa_trt_plugin_models.py | 312 +- qa/common/gen_tag_sigdef.py | 233 +- qa/common/infer_test.py | 263 +- qa/common/infer_util.py | 860 +++--- .../non_aligned_validation_batched.json | 56 +- .../non_aligned_validation_no_batch.json | 56 +- .../simple_model.py | 101 +- .../validation_batched.json | 64 +- .../validation_no_batch.json | 64 +- .../wrong_validation_batched.json | 64 +- .../wrong_validation_no_batch.json | 64 +- qa/common/libtorch_infer_client.py | 40 +- qa/common/nightly_email_helper.py | 41 +- .../int_data.json | 4 +- .../int_data_diff_shape.json | 4 +- .../perf_analyzer_input_data_json/output.json | 2 +- .../string_data_with_shape.json | 8 +- .../wrong_output.json | 2 +- .../wrong_output_2.json | 2 +- qa/common/reporter.py | 120 +- qa/common/sequence_util.py | 824 ++--- qa/common/shm_util.py | 314 +- qa/common/test_util.py | 173 +- qa/common/trace_summary.py | 345 ++- qa/common/util.sh | 3 +- .../custom_zero_1_float32/config.pbtxt | 0 qa/python_models/add_sub/model.py | 52 +- qa/python_models/add_sub_gpu/config.pbtxt | 8 +- qa/python_models/auto_complete/model.py | 60 +- qa/python_models/auto_complete_error/model.py | 13 +- qa/python_models/bls/model.py | 364 +-- qa/python_models/bls_async/model.py | 104 +- qa/python_models/bls_finalize_error/model.py | 14 +- 
qa/python_models/bls_init_error/model.py | 14 +- qa/python_models/bls_memory/model.py | 52 +- qa/python_models/bls_memory_async/model.py | 32 +- .../bls_model_loading/config.pbtxt | 4 +- qa/python_models/bls_model_loading/model.py | 37 +- qa/python_models/bls_onnx_warmup/config.pbtxt | 2 +- qa/python_models/bls_undefined/model.py | 5 +- .../cuda_memory_consumer/1/model.py | 20 +- qa/python_models/custom_metrics/config.pbtxt | 4 +- qa/python_models/custom_metrics/model.py | 88 +- qa/python_models/delayed_model/model.py | 8 +- qa/python_models/dlpack_add_sub/model.py | 103 +- qa/python_models/dlpack_empty_output/model.py | 10 +- qa/python_models/dlpack_identity/model.py | 10 +- qa/python_models/dlpack_io_identity/model.py | 55 +- .../dlpack_io_identity_decoupled/model.py | 44 +- qa/python_models/dlpack_square/model.py | 58 +- qa/python_models/dlpack_sub_add/model.py | 103 +- qa/python_models/dlpack_test/model.py | 175 +- qa/python_models/execute_error/model.py | 15 +- .../execute_return_error/model.py | 3 +- qa/python_models/fini_error/model.py | 5 +- qa/python_models/ground_truth/model.py | 10 +- qa/python_models/identity_fp32/model.py | 5 +- .../identity_fp32_logging/model.py | 5 +- .../identity_fp32_timeout/model.py | 6 +- qa/python_models/init_args/model.py | 2 +- qa/python_models/init_error/model.py | 7 +- qa/python_models/init_exit/model.py | 5 +- qa/python_models/model_env/model.py | 11 +- qa/python_models/model_init_del/model.py | 11 +- qa/python_models/model_init_del/util.py | 4 +- qa/python_models/multi_file/file1.py | 6 +- qa/python_models/multi_file/file2.py | 6 +- qa/python_models/multi_file/model.py | 13 +- qa/python_models/non_contiguous/model.py | 13 +- qa/python_models/optional/model.py | 16 +- qa/python_models/python_version/model.py | 31 +- qa/python_models/pytorch_fp32_fp32/model.py | 8 +- .../response_sender_error/model.py | 38 +- qa/python_models/sequence_int32/config.pbtxt | 6 +- qa/python_models/sequence_int32/model.py | 63 +- qa/python_models/string/model.py | 8 +- qa/python_models/string_fixed/model.py | 16 +- qa/python_models/string_identity/model.py | 16 +- qa/python_models/sub_add/model.py | 56 +- .../torchvision/resnet50/config.pbtxt | 0 .../torchvision/resnet50/model.py | 24 +- qa/python_models/variable_gpu_output/model.py | 17 +- qa/python_models/wrong_model/model.py | 5 +- src/CMakeLists.txt | 8 +- src/command_line_parser.cc | 6 +- src/common.h | 23 +- src/data_compressor.h | 8 +- src/grpc/grpc_server.cc | 6 +- src/grpc/stream_infer_handler.cc | 2 +- src/http_server.cc | 6 +- src/sagemaker_server.h | 2 +- src/shared_memory_manager.cc | 4 +- src/simple.cc | 2 +- .../src/distributed_addsub.cc | 2 +- .../relocation_repoagent/src/relocation.cc | 2 +- src/vertex_ai_server.h | 2 +- 456 files changed, 23165 insertions(+), 17677 deletions(-) mode change 100644 => 100755 compose.py mode change 100644 => 100755 deploy/gke-marketplace-app/benchmark/perf-analyzer-script/perf_query.sh mode change 100644 => 100755 deploy/gke-marketplace-app/client-sample/locustfile_bert.py mode change 100644 => 100755 deploy/gke-marketplace-app/client-sample/perf_analyzer_grpc.sh mode change 100644 => 100755 deploy/gke-marketplace-app/server-deployer/build_and_push.sh mode change 100755 => 100644 deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/config.pbtxt mode change 100644 => 100755 deploy/mlflow-triton-plugin/mlflow_triton/__init__.py mode change 100644 => 100755 deploy/mlflow-triton-plugin/mlflow_triton/config.py mode change 100644 => 100755 
deploy/mlflow-triton-plugin/mlflow_triton/deployments.py mode change 100644 => 100755 deploy/mlflow-triton-plugin/scripts/publish_model_to_mlflow.py mode change 100644 => 100755 deploy/mlflow-triton-plugin/scripts/triton_flavor.py mode change 100644 => 100755 deploy/mlflow-triton-plugin/setup.py mode change 100644 => 100755 docker/cpu_only/entrypoint.d/12-banner.sh mode change 100644 => 100755 docker/cpu_only/entrypoint.d/50-gpu-driver-check2.sh mode change 100644 => 100755 docker/entrypoint.d/50-gpu-driver-check2.sh mode change 100644 => 100755 docker/entrypoint.d/56-network-driver-version-check.sh mode change 100644 => 100755 docker/entrypoint.d/70-shm-check.sh mode change 100644 => 100755 docker/entrypoint.d/99-check-run-aip-mode.sh mode change 100644 => 100755 docs/conf.py mode change 100644 => 100755 docs/examples/jetson/concurrency_and_dynamic_batching/tao/convert_peoplenet.sh mode change 100755 => 100644 docs/examples/model_repository/simple_identity/config.pbtxt mode change 100644 => 100755 qa/L0_async_work_queue/test.sh mode change 100644 => 100755 qa/L0_backend_config/test.sh mode change 100644 => 100755 qa/L0_backend_fastertransformer/test.sh mode change 100644 => 100755 qa/L0_backend_identity/identity_test.py mode change 100644 => 100755 qa/L0_backend_python/argument_validation/models/argument_validation/1/model.py mode change 100644 => 100755 qa/L0_backend_python/argument_validation/test.sh mode change 100644 => 100755 qa/L0_backend_python/bls/test.sh mode change 100644 => 100755 qa/L0_backend_python/common.sh mode change 100644 => 100755 qa/L0_backend_python/custom_metrics/test.sh mode change 100644 => 100755 qa/L0_backend_python/decoupled/decoupled_test.py mode change 100644 => 100755 qa/L0_backend_python/decoupled/models/decoupled_bls/1/model.py mode change 100644 => 100755 qa/L0_backend_python/decoupled/models/decoupled_bls_stream/1/model.py mode change 100644 => 100755 qa/L0_backend_python/decoupled/models/decoupled_execute_error/1/model.py mode change 100644 => 100755 qa/L0_backend_python/decoupled/models/decoupled_return_response_error/1/model.py mode change 100644 => 100755 qa/L0_backend_python/decoupled/models/decoupled_send_after_close_error/1/model.py mode change 100644 => 100755 qa/L0_backend_python/decoupled/test.sh mode change 100644 => 100755 qa/L0_backend_python/ensemble/ensemble_test.py mode change 100644 => 100755 qa/L0_backend_python/ensemble/test.sh mode change 100644 => 100755 qa/L0_backend_python/env/test.sh mode change 100644 => 100755 qa/L0_backend_python/examples/test.sh mode change 100644 => 100755 qa/L0_backend_python/io/io_test.py mode change 100644 => 100755 qa/L0_backend_python/io/test.sh mode change 100644 => 100755 qa/L0_backend_python/lifecycle/lifecycle_test.py mode change 100644 => 100755 qa/L0_backend_python/lifecycle/test.sh mode change 100644 => 100755 qa/L0_backend_python/logging/logging_test.py mode change 100644 => 100755 qa/L0_backend_python/model_control/model_control_test.py mode change 100644 => 100755 qa/L0_backend_python/model_control/test.sh mode change 100644 => 100755 qa/L0_backend_python/python_test.py mode change 100644 => 100755 qa/L0_backend_python/python_unittest.py mode change 100644 => 100755 qa/L0_backend_python/restart/models/restart/1/model.py mode change 100644 => 100755 qa/L0_backend_python/restart/restart_test.py mode change 100644 => 100755 qa/L0_backend_python/restart/test.sh mode change 100644 => 100755 qa/L0_backend_python/variants/test.sh mode change 100644 => 100755 qa/L0_batch_custom/batch_custom_test.py 
mode change 100644 => 100755 qa/L0_batch_input/batch_input_test.py mode change 100644 => 100755 qa/L0_batch_input/test.sh mode change 100644 => 100755 qa/L0_batcher/batcher_test.py mode change 100644 => 100755 qa/L0_batcher/test.sh mode change 100644 => 100755 qa/L0_batcher/verify_timestamps.py mode change 100644 => 100755 qa/L0_buffer_attributes/buffer_attributes_test.py mode change 100644 => 100755 qa/L0_buffer_attributes/models/bls/1/model.py mode change 100644 => 100755 qa/L0_buffer_attributes/models/identity/1/model.py mode change 100644 => 100755 qa/L0_buffer_attributes/test.sh mode change 100644 => 100755 qa/L0_client_java/test.sh mode change 100644 => 100755 qa/L0_client_memory_growth/client_memory_mail.py mode change 100644 => 100755 qa/L0_client_nobatch/client_test.py mode change 100644 => 100755 qa/L0_client_timeout/client_timeout_test.py mode change 100644 => 100755 qa/L0_client_timeout/test.sh mode change 100644 => 100755 qa/L0_cmdline_trace/trace_client.py mode change 100644 => 100755 qa/L0_cuda_graph/test.sh mode change 100644 => 100755 qa/L0_cuda_graph/trt_cuda_graph_test.py mode change 100644 => 100755 qa/L0_cuda_shared_memory/cuda_shared_memory_test.py mode change 100644 => 100755 qa/L0_cuda_shared_memory/test.sh mode change 100644 => 100755 qa/L0_custom_ops/cuda_op_test.py mode change 100644 => 100755 qa/L0_custom_ops/mod_op_test.py mode change 100644 => 100755 qa/L0_custom_ops/onnx_op_test.py mode change 100644 => 100755 qa/L0_custom_ops/vision_op_test.py mode change 100644 => 100755 qa/L0_custom_ops/zero_out_test.py mode change 100644 => 100755 qa/L0_data_compression/test.sh mode change 100644 => 100755 qa/L0_data_compression/validation.py mode change 100644 => 100755 qa/L0_decoupled/decoupled_test.py mode change 100644 => 100755 qa/L0_decoupled/test.sh mode change 100644 => 100755 qa/L0_device_memory_tracker/test.py mode change 100644 => 100755 qa/L0_device_memory_tracker/test.sh mode change 100644 => 100755 qa/L0_dlpack_multi_gpu/test.sh mode change 100644 => 100755 qa/L0_doc_links/test.sh mode change 100644 => 100755 qa/L0_dyna_implicit_state/test.sh mode change 100644 => 100755 qa/L0_dyna_sequence_batcher/dyna_sequence_batcher_test.py mode change 100644 => 100755 qa/L0_grpc/client_plugin_models/client_plugin_test/1/model.py mode change 100644 => 100755 qa/L0_grpc/grpc_basic_auth_test.py mode change 100644 => 100755 qa/L0_grpc/grpc_client_plugin_test.py mode change 100644 => 100755 qa/L0_grpc/python_grpc_aio_test.py mode change 100644 => 100755 qa/L0_grpc/python_unit_test.py mode change 100644 => 100755 qa/L0_grpc/test.sh mode change 100644 => 100755 qa/L0_http/http_basic_auth_test.py mode change 100644 => 100755 qa/L0_http/http_client_plugin_test.py mode change 100644 => 100755 qa/L0_http/http_test.py mode change 100644 => 100755 qa/L0_http/python_http_aio_test.py mode change 100644 => 100755 qa/L0_http/test.sh mode change 100644 => 100755 qa/L0_http_fuzz/fuzztest.py mode change 100644 => 100755 qa/L0_http_fuzz/test.sh mode change 100644 => 100755 qa/L0_https/test.sh mode change 100644 => 100755 qa/L0_implicit_state/implicit_state.py mode change 100644 => 100755 qa/L0_implicit_state/test.sh mode change 100644 => 100755 qa/L0_infer/infer_test.py mode change 100644 => 100755 qa/L0_infer_reshape/infer_reshape_test.py mode change 100644 => 100755 qa/L0_infer_variable/infer_variable_test.py mode change 100644 => 100755 qa/L0_infer_zero/infer_zero_test.py mode change 100644 => 100755 qa/L0_inferentia_perf_analyzer/test.sh mode change 100644 => 100755 qa/L0_json/test.sh 
mode change 100644 => 100755 qa/L0_large_payload/large_payload_test.py mode change 100644 => 100755 qa/L0_large_payload/test.sh mode change 100644 => 100755 qa/L0_libtorch_inference_mode/test.sh mode change 100644 => 100755 qa/L0_libtorch_instance_group_kind_model/client.py mode change 100755 => 100644 qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_device/config.pbtxt mode change 100644 => 100755 qa/L0_libtorch_io_names/io_names_client.py mode change 100644 => 100755 qa/L0_libtorch_io_names/test.sh mode change 100644 => 100755 qa/L0_libtorch_nvfuser/test.sh mode change 100644 => 100755 qa/L0_libtorch_optimized_execution/test.sh mode change 100644 => 100755 qa/L0_libtorch_shared_weights/libtorch_shared_weights_test.py mode change 100644 => 100755 qa/L0_libtorch_shared_weights/test.sh mode change 100644 => 100755 qa/L0_lifecycle/lifecycle_test.py mode change 100644 => 100755 qa/L0_logging/logging_endpoint_test.py mode change 100644 => 100755 qa/L0_long_running_stress/crashing_client.py mode change 100644 => 100755 qa/L0_long_running_stress/scenarios.py mode change 100644 => 100755 qa/L0_long_running_stress/stress.py mode change 100644 => 100755 qa/L0_long_running_stress/stress_mail.py mode change 100644 => 100755 qa/L0_memory/test.sh mode change 100644 => 100755 qa/L0_memory_growth/busy_op_test.py mode change 100644 => 100755 qa/L0_memory_growth/server_memory_mail.py mode change 100644 => 100755 qa/L0_mlflow/plugin_test.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/input_missing_dims/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/input_missing_name/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/input_wrong_property/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/no_return/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/output_missing_dims/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/output_missing_name/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform/python/output_wrong_property/model.py mode change 100755 => 100644 qa/L0_model_config/autofill_noplatform_success/onnx/cpu_instance/config.pbtxt mode change 100755 => 100644 qa/L0_model_config/autofill_noplatform_success/openvino/partial_config/config.pbtxt mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_first_step/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_second_step/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/model.py mode change 100644 => 100755 qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/model.py mode change 100644 => 100755 
qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/model.py mode change 100755 => 100644 qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/config.pbtxt mode change 100644 => 100755 qa/L0_model_config/compare_status.py mode change 100644 => 100755 qa/L0_model_config/noautofill_test.py mode change 100644 => 100755 qa/L0_model_namespacing/python_addsub/__init__.py mode change 100644 => 100755 qa/L0_model_namespacing/python_subadd/__init__.py mode change 100644 => 100755 qa/L0_model_namespacing/test.py mode change 100644 => 100755 qa/L0_model_namespacing/test.sh mode change 100644 => 100755 qa/L0_model_namespacing/test_duplication/addsub_repo/composing_model/1/model.py mode change 100755 => 100644 qa/L0_model_namespacing/test_duplication/addsub_repo/simple_addsub/config.pbtxt mode change 100644 => 100755 qa/L0_model_namespacing/test_duplication/subadd_repo/composing_model/1/model.py mode change 100755 => 100644 qa/L0_model_namespacing/test_duplication/subadd_repo/simple_subadd/config.pbtxt mode change 100644 => 100755 qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/composing_model/1/model.py mode change 100755 => 100644 qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/simple_addsub/config.pbtxt mode change 100644 => 100755 qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/composing_model/1/model.py mode change 100755 => 100644 qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/simple_subadd/config.pbtxt mode change 100644 => 100755 qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/composing_addsub/1/model.py mode change 100755 => 100644 qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/simple_ensemble/config.pbtxt mode change 100644 => 100755 qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/composing_subadd/1/model.py mode change 100755 => 100644 qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/simple_ensemble/config.pbtxt mode change 100644 => 100755 qa/L0_model_namespacing/test_no_duplication/addsub_repo/composing_addsub/1/model.py mode change 100755 => 100644 qa/L0_model_namespacing/test_no_duplication/addsub_repo/simple_addsub/config.pbtxt mode change 100644 => 100755 qa/L0_model_namespacing/test_no_duplication/subadd_repo/composing_subadd/1/model.py mode change 100755 => 100644 qa/L0_model_namespacing/test_no_duplication/subadd_repo/simple_subadd/config.pbtxt mode change 100644 => 100755 qa/L0_model_queue/model_queue_test.py mode change 100644 => 100755 qa/L0_model_update/instance_update_test.py mode change 100644 => 100755 qa/L0_multi_server/test.sh mode change 100644 => 100755 qa/L0_nan_inf/models/nan_inf_output/1/model.py mode change 100644 => 100755 qa/L0_nan_inf/nan_inf_test.py mode change 100644 => 100755 qa/L0_nullchar_string/nullchar_string_client.py mode change 100644 => 100755 qa/L0_nullchar_string/test.sh mode change 100755 => 100644 qa/L0_optional_input/models/ensemble_identity_2_float32/config.pbtxt mode change 100755 => 100644 qa/L0_optional_input/models/identity_2_float32/config.pbtxt mode change 100755 => 100644 qa/L0_optional_input/models/pipeline_identity_2_float32/config.pbtxt mode change 100644 => 100755 qa/L0_optional_input/optional_input_test.py mode change 100644 => 100755 qa/L0_output_name/output_name_test.py mode change 100644 => 100755 qa/L0_output_name/test.sh mode change 100644 => 100755 qa/L0_output_validation/lt_op_val_client.py mode change 100644 => 100755 qa/L0_output_validation/test.sh mode change 100644 
=> 100755 qa/L0_parallel_copy/parallel_copy_test.py mode change 100644 => 100755 qa/L0_parameters/model_repository/parameter/1/model.py mode change 100644 => 100755 qa/L0_parameters/parameters_test.py mode change 100644 => 100755 qa/L0_parameters/test.sh mode change 100755 => 100644 qa/L0_passive_instance/models/distributed_int32_int32_int32/config.pbtxt mode change 100644 => 100755 qa/L0_passive_instance/passive_instance_test.py mode change 100644 => 100755 qa/L0_passive_instance/test.sh mode change 100644 => 100755 qa/L0_perf_analyzer_doc_links/test.sh mode change 100644 => 100755 qa/L0_perf_kaldi/create_data.sh mode change 100644 => 100755 qa/L0_perf_kaldi/test.sh mode change 100644 => 100755 qa/L0_perf_pyclients/simple_perf_client.py mode change 100644 => 100755 qa/L0_query/query_e2e.py mode change 100644 => 100755 qa/L0_query/test.sh mode change 100644 => 100755 qa/L0_rate_limiter/rate_limiter_test.py mode change 100644 => 100755 qa/L0_rate_limiter/test.sh mode change 100644 => 100755 qa/L0_register/test.sh mode change 100644 => 100755 qa/L0_repoagent_checksum/identity_test.py mode change 100644 => 100755 qa/L0_sagemaker/sagemaker_multi_model_test.py mode change 100644 => 100755 qa/L0_sagemaker/sagemaker_test.py mode change 100644 => 100755 qa/L0_savedmodel_shape/saved_model_shape_test.py mode change 100644 => 100755 qa/L0_savedmodel_shape/test.sh mode change 100644 => 100755 qa/L0_secure_grpc/test.sh mode change 100644 => 100755 qa/L0_sequence_batcher/sequence_batcher_test.py mode change 100644 => 100755 qa/L0_sequence_corrid_batcher/sequence_corrid_batcher_test.py mode change 100644 => 100755 qa/L0_sequence_stress/sequence_stress.py mode change 100644 => 100755 qa/L0_server_status/server_status_test.py mode change 100644 => 100755 qa/L0_shared_memory/shared_memory_test.py mode change 100644 => 100755 qa/L0_shared_memory/test.sh mode change 100644 => 100755 qa/L0_simple_ensemble/ensemble_test.py mode change 100644 => 100755 qa/L0_simple_nodejs_client/test.sh mode change 100644 => 100755 qa/L0_socket/test.sh mode change 100644 => 100755 qa/L0_storage_S3_local/mock_s3_service.py mode change 100644 => 100755 qa/L0_storage_swiftstack/infer_test.py mode change 100644 => 100755 qa/L0_string_io/string_client_test.py mode change 100644 => 100755 qa/L0_tf_gpu_io/tf_gpu_io_test.py mode change 100644 => 100755 qa/L0_tf_parameters/test.sh mode change 100644 => 100755 qa/L0_tf_parameters/tf_parameter_test.py mode change 100644 => 100755 qa/L0_tf_tag_sigdef/tf_tag_sigdef_test.py mode change 100644 => 100755 qa/L0_tf_unknown_rank/test.sh mode change 100644 => 100755 qa/L0_tf_unknown_rank/tf_unknown_rank_test.py mode change 100644 => 100755 qa/L0_tftrt_optimization/tftrt_optimization_test.py mode change 100644 => 100755 qa/L0_trace/trace_endpoint_test.py mode change 100644 => 100755 qa/L0_triton_repo_agent/test.sh mode change 100644 => 100755 qa/L0_trt_data_dependent_shape/trt_data_dependent_shape_test.py mode change 100644 => 100755 qa/L0_trt_dla/dla_test.py mode change 100644 => 100755 qa/L0_trt_dla/test.sh mode change 100644 => 100755 qa/L0_trt_dynamic_shape/trt_dynamic_shape_test.py mode change 100644 => 100755 qa/L0_trt_error_propagation/trt_error_propagation_test.py mode change 100644 => 100755 qa/L0_trt_plugin/test.sh mode change 100644 => 100755 qa/L0_trt_plugin/trt_plugin_test.py mode change 100644 => 100755 qa/L0_trt_reformat_free/trt_reformat_free_test.py mode change 100644 => 100755 qa/L0_trt_shape_tensors/test.sh mode change 100644 => 100755 
qa/L0_trt_shape_tensors/trt_shape_tensor_test.py mode change 100644 => 100755 qa/L0_vertex_ai/test.sh mode change 100644 => 100755 qa/L0_vertex_ai/vertex_ai_test.py mode change 100644 => 100755 qa/L0_warmup/decoupled/1/model.py mode change 100644 => 100755 qa/L0_warmup/failing_infer/1/model.py mode change 100644 => 100755 qa/L0_warmup/test.sh mode change 100644 => 100755 qa/common/gen_ensemble_model_utils.py mode change 100644 => 100755 qa/common/gen_qa_custom_ops_models.py mode change 100644 => 100755 qa/common/gen_qa_dyna_sequence_implicit_models.py mode change 100644 => 100755 qa/common/gen_qa_dyna_sequence_models.py mode change 100644 => 100755 qa/common/gen_qa_identity_models.py mode change 100644 => 100755 qa/common/gen_qa_implicit_models.py mode change 100644 => 100755 qa/common/gen_qa_models.py mode change 100644 => 100755 qa/common/gen_qa_noshape_models.py mode change 100644 => 100755 qa/common/gen_qa_ragged_models.py mode change 100644 => 100755 qa/common/gen_qa_reshape_models.py mode change 100644 => 100755 qa/common/gen_qa_sequence_models.py mode change 100644 => 100755 qa/common/gen_qa_tf_parameters.py mode change 100644 => 100755 qa/common/gen_qa_torchtrt_models.py mode change 100644 => 100755 qa/common/gen_qa_trt_data_dependent_shape.py mode change 100644 => 100755 qa/common/gen_qa_trt_format_models.py mode change 100644 => 100755 qa/common/gen_qa_trt_plugin_models.py mode change 100644 => 100755 qa/common/gen_tag_sigdef.py mode change 100644 => 100755 qa/common/infer_test.py mode change 100644 => 100755 qa/common/infer_util.py mode change 100644 => 100755 qa/common/inferentia_perf_analyzer_input_data_json/simple_model.py mode change 100644 => 100755 qa/common/libtorch_infer_client.py mode change 100644 => 100755 qa/common/nightly_email_helper.py mode change 100644 => 100755 qa/common/sequence_util.py mode change 100644 => 100755 qa/common/shm_util.py mode change 100644 => 100755 qa/common/test_util.py mode change 100755 => 100644 qa/custom_models/custom_zero_1_float32/config.pbtxt mode change 100644 => 100755 qa/python_models/add_sub/model.py mode change 100644 => 100755 qa/python_models/auto_complete/model.py mode change 100644 => 100755 qa/python_models/auto_complete_error/model.py mode change 100644 => 100755 qa/python_models/bls/model.py mode change 100644 => 100755 qa/python_models/bls_async/model.py mode change 100644 => 100755 qa/python_models/bls_finalize_error/model.py mode change 100644 => 100755 qa/python_models/bls_init_error/model.py mode change 100644 => 100755 qa/python_models/bls_memory/model.py mode change 100644 => 100755 qa/python_models/bls_memory_async/model.py mode change 100644 => 100755 qa/python_models/bls_model_loading/model.py mode change 100644 => 100755 qa/python_models/bls_onnx_warmup/config.pbtxt mode change 100644 => 100755 qa/python_models/bls_undefined/model.py mode change 100644 => 100755 qa/python_models/cuda_memory_consumer/1/model.py mode change 100644 => 100755 qa/python_models/custom_metrics/model.py mode change 100644 => 100755 qa/python_models/delayed_model/model.py mode change 100644 => 100755 qa/python_models/dlpack_add_sub/model.py mode change 100644 => 100755 qa/python_models/dlpack_empty_output/model.py mode change 100644 => 100755 qa/python_models/dlpack_identity/model.py mode change 100644 => 100755 qa/python_models/dlpack_io_identity/model.py mode change 100644 => 100755 qa/python_models/dlpack_io_identity_decoupled/model.py mode change 100644 => 100755 qa/python_models/dlpack_square/model.py mode change 100644 => 100755 
qa/python_models/dlpack_sub_add/model.py mode change 100644 => 100755 qa/python_models/dlpack_test/model.py mode change 100644 => 100755 qa/python_models/execute_error/model.py mode change 100644 => 100755 qa/python_models/execute_return_error/model.py mode change 100644 => 100755 qa/python_models/fini_error/model.py mode change 100644 => 100755 qa/python_models/ground_truth/model.py mode change 100644 => 100755 qa/python_models/identity_fp32/model.py mode change 100644 => 100755 qa/python_models/identity_fp32_logging/model.py mode change 100644 => 100755 qa/python_models/identity_fp32_timeout/model.py mode change 100644 => 100755 qa/python_models/init_args/model.py mode change 100644 => 100755 qa/python_models/init_error/model.py mode change 100644 => 100755 qa/python_models/init_exit/model.py mode change 100644 => 100755 qa/python_models/model_env/model.py mode change 100644 => 100755 qa/python_models/model_init_del/model.py mode change 100644 => 100755 qa/python_models/model_init_del/util.py mode change 100644 => 100755 qa/python_models/multi_file/file1.py mode change 100644 => 100755 qa/python_models/multi_file/file2.py mode change 100644 => 100755 qa/python_models/multi_file/model.py mode change 100644 => 100755 qa/python_models/non_contiguous/model.py mode change 100644 => 100755 qa/python_models/optional/model.py mode change 100644 => 100755 qa/python_models/python_version/model.py mode change 100644 => 100755 qa/python_models/pytorch_fp32_fp32/model.py mode change 100644 => 100755 qa/python_models/response_sender_error/model.py mode change 100644 => 100755 qa/python_models/sequence_int32/model.py mode change 100644 => 100755 qa/python_models/string/model.py mode change 100644 => 100755 qa/python_models/string_fixed/model.py mode change 100644 => 100755 qa/python_models/string_identity/model.py mode change 100644 => 100755 qa/python_models/sub_add/model.py mode change 100755 => 100644 qa/python_models/torchvision/resnet50/config.pbtxt mode change 100644 => 100755 qa/python_models/torchvision/resnet50/model.py mode change 100644 => 100755 qa/python_models/variable_gpu_output/model.py mode change 100644 => 100755 qa/python_models/wrong_model/model.py diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index a724718d46..4f3f98cc6f 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -63,12 +63,12 @@ jobs: # If you wish to specify custom queries, you can do so here or in a config file. # By default, queries listed here will override any specified in a config file. # Prefix the list here with "+" to use these queries and those in the config file. - + # Details on CodeQL's query packs refer to: # https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs queries: +security-and-quality - + # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild @@ -77,7 +77,7 @@ jobs: # Command-line programs to run using the OS shell. # See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun - # If the Autobuild fails above, remove it and uncomment the following three lines. + # If the Autobuild fails above, remove it and uncomment the following three lines. # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 
# - run: | diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6c03a4ad6c..1985278fd3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -64,7 +64,9 @@ repos: - id: check-merge-conflict - id: check-json - id: check-toml + # Do not check template yaml files in deploy directory - id: check-yaml + exclude: ^deploy(\/[^\/]+)*\/templates\/.*$ - id: check-shebang-scripts-are-executable - id: end-of-file-fixer types_or: [c, c++, cuda, proto, textproto, java, python] diff --git a/CMakeLists.txt b/CMakeLists.txt index a2031f1bdb..7ea6dbddf7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -130,7 +130,7 @@ if(EXISTS "/etc/os-release") set (LIB_DIR "lib64") endif() endif() - + set(TRITON_CORE_HEADERS_ONLY OFF) FetchContent_MakeAvailable(repo-third-party repo-core) diff --git a/Dockerfile.QA b/Dockerfile.QA index 0d3fb2a239..563194a7c8 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -70,7 +70,7 @@ RUN apt update && apt install -y gpg wget && \ echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | \ tee /etc/apt/sources.list.d/kitware.list >/dev/null && \ apt-get update && \ - apt-get install -y --no-install-recommends cmake cmake-data + apt-get install -y --no-install-recommends cmake cmake-data # Add inception_graphdef model to example repo WORKDIR /workspace/docs/examples/model_repository diff --git a/Dockerfile.sdk b/Dockerfile.sdk index cb64a5599a..5d7f409e8f 100644 --- a/Dockerfile.sdk +++ b/Dockerfile.sdk @@ -97,7 +97,7 @@ RUN apt update && apt install -y gpg wget && \ tee /etc/apt/sources.list.d/kitware.list >/dev/null && \ apt-get update && \ apt-get install -y --no-install-recommends cmake cmake-data && \ - cmake --version + cmake --version # Build expects "python" executable (not python3). RUN rm -f /usr/bin/python && \ @@ -197,8 +197,8 @@ RUN mkdir qa COPY qa/L0_sdk qa/L0_sdk COPY qa/L0_client_build_variants qa/L0_client_build_variants -# Create a directory for all the python client tests to enable unit testing -RUN mkdir -p qa/python_client_unit_tests/ +# Create a directory for all the python client tests to enable unit testing +RUN mkdir -p qa/python_client_unit_tests/ COPY --from=sdk_build /workspace/client/src/python/library/tests/* qa/python_client_unit_tests/ # Install an image needed by the quickstart and other documentation. diff --git a/Dockerfile.win10.min b/Dockerfile.win10.min index a0660c2f80..ee9393de80 100644 --- a/Dockerfile.win10.min +++ b/Dockerfile.win10.min @@ -130,7 +130,7 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%" -LABEL CUDA_VERSION="${CUDA_VERSION}" +LABEL CUDA_VERSION="${CUDA_VERSION}" # # Installing Tensorrt @@ -159,7 +159,7 @@ ARG CUDNN_SOURCE=${CUDNN_ZIP} ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP} -RUN unzip /tmp/%CUDNN_ZIP% +RUN unzip /tmp/%CUDNN_ZIP% RUN move cudnn-* cudnn RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\." RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\." diff --git a/README.md b/README.md index 2ea07a98a7..8d5f96c0a2 100644 --- a/README.md +++ b/README.md @@ -31,19 +31,19 @@ [![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause) **LATEST RELEASE: You are currently on the main branch which tracks -under-development progress towards the next release. The current release is +under-development progress towards the next release. 
The current release is version [2.35.0](https://github.com/triton-inference-server/server/tree/r23.06) -and corresponds to the 23.06 container release on +and corresponds to the 23.06 container release on [NVIDIA GPU Cloud (NGC)](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver).** ---- -Triton Inference Server is an open source inference serving software that -streamlines AI inferencing. Triton enables teams to deploy any AI model from -multiple deep learning and machine learning frameworks, including TensorRT, -TensorFlow, PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton -supports inference across cloud, data center,edge and embedded devices on NVIDIA -GPUs, x86 and ARM CPU, or AWS Inferentia. Triton delivers optimized performance -for many query types, including real time, batched, ensembles and audio/video +Triton Inference Server is an open source inference serving software that +streamlines AI inferencing. Triton enables teams to deploy any AI model from +multiple deep learning and machine learning frameworks, including TensorRT, +TensorFlow, PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton +supports inference across cloud, data center,edge and embedded devices on NVIDIA +GPUs, x86 and ARM CPU, or AWS Inferentia. Triton delivers optimized performance +for many query types, including real time, batched, ensembles and audio/video streaming. Major features include: @@ -55,7 +55,7 @@ Major features include: - [Concurrent model execution](docs/user_guide/architecture.md#concurrent-model-execution) - [Dynamic batching](docs/user_guide/model_configuration.md#dynamic-batcher) -- [Sequence batching](docs/user_guide/model_configuration.md#sequence-batcher) and +- [Sequence batching](docs/user_guide/model_configuration.md#sequence-batcher) and [implicit state management](docs/user_guide/architecture.md#implicit-state-management) for stateful models - Provides [Backend API](https://github.com/triton-inference-server/backend) that @@ -74,20 +74,20 @@ Major features include: - [Metrics](docs/user_guide/metrics.md) indicating GPU utilization, server throughput, server latency, and more -**New to Triton Inference Server?** Make use of +**New to Triton Inference Server?** Make use of [these tutorials](https://github.com/triton-inference-server/tutorials) -to begin your Triton journey! +to begin your Triton journey! -Join the [Triton and TensorRT community](https://www.nvidia.com/en-us/deep-learning-ai/triton-tensorrt-newsletter/) and -stay current on the latest product updates, bug fixes, content, best practices, -and more. Need enterprise support? NVIDIA global support is available for Triton -Inference Server with the -[NVIDIA AI Enterprise software suite](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/). +Join the [Triton and TensorRT community](https://www.nvidia.com/en-us/deep-learning-ai/triton-tensorrt-newsletter/) and +stay current on the latest product updates, bug fixes, content, best practices, +and more. Need enterprise support? NVIDIA global support is available for Triton +Inference Server with the +[NVIDIA AI Enterprise software suite](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/). 
## Serve a Model in 3 Easy Steps ```bash -# Step 1: Create the example model repository +# Step 1: Create the example model repository git clone -b r23.06 https://github.com/triton-inference-server/server.git cd server/docs/examples ./fetch_models.sh @@ -95,7 +95,7 @@ cd server/docs/examples # Step 2: Launch triton from the NGC Triton container docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:23.06-py3 tritonserver --model-repository=/models -# Step 3: Sending an Inference Request +# Step 3: Sending an Inference Request # In a separate console, launch the image_client example from the NGC Triton SDK container docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:23.06-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg @@ -115,13 +115,13 @@ Check out [NVIDIA LaunchPad](https://www.nvidia.com/en-us/data-center/products/a for free access to a set of hands-on labs with Triton Inference Server hosted on NVIDIA infrastructure. -Specific end-to-end examples for popular models, such as ResNet, BERT, and DLRM -are located in the +Specific end-to-end examples for popular models, such as ResNet, BERT, and DLRM +are located in the [NVIDIA Deep Learning Examples](https://github.com/NVIDIA/DeepLearningExamples) -page on GitHub. The -[NVIDIA Developer Zone](https://developer.nvidia.com/nvidia-triton-inference-server) +page on GitHub. The +[NVIDIA Developer Zone](https://developer.nvidia.com/nvidia-triton-inference-server) contains additional documentation, presentations, and examples. - + ## Documentation ### Build and Deploy @@ -134,7 +134,7 @@ images. - [Build a custom Triton Inference Server Docker container](docs/customization_guide/compose.md) - [Build Triton Inference Server from source](docs/customization_guide/build.md#building-on-unsupported-platforms) - [Build Triton Inference Server for Windows 10](docs/customization_guide/build.md#building-for-windows-10) -- Examples for deploying Triton Inference Server with Kubernetes and Helm on [GCP](deploy/gcp/README.md), +- Examples for deploying Triton Inference Server with Kubernetes and Helm on [GCP](deploy/gcp/README.md), [AWS](deploy/aws/README.md), and [NVIDIA FleetCommand](deploy/fleetcommand/README.md) ### Using Triton @@ -142,10 +142,10 @@ images. #### Preparing Models for Triton Inference Server The first step in using Triton to serve your models is to place one or -more models into a [model repository](docs/user_guide/model_repository.md). Depending on +more models into a [model repository](docs/user_guide/model_repository.md). Depending on the type of the model and on what Triton capabilities you want to enable for the model, you may need to create a [model -configuration](docs/user_guide/model_configuration.md) for the model. +configuration](docs/user_guide/model_configuration.md) for the model. - [Add custom operations to Triton if needed by your model](docs/user_guide/custom_operations.md) - Enable model pipelining with [Model Ensemble](docs/user_guide/architecture.md#ensemble-models) @@ -154,37 +154,37 @@ configuration](docs/user_guide/model_configuration.md) for the model. parameters and [model instances](docs/user_guide/model_configuration.md#instance-groups). 
- Use the [Model Analyzer tool](https://github.com/triton-inference-server/model_analyzer) to help optimize your model configuration with profiling -- Learn how to [explicitly manage what models are available by loading and +- Learn how to [explicitly manage what models are available by loading and unloading models](docs/user_guide/model_management.md) #### Configure and Use Triton Inference Server -- Read the [Quick Start Guide](docs/getting_started/quickstart.md) to run Triton Inference +- Read the [Quick Start Guide](docs/getting_started/quickstart.md) to run Triton Inference Server on both GPU and CPU -- Triton supports multiple execution engines, called - [backends](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton), including - [TensorRT](https://github.com/triton-inference-server/tensorrt_backend), - [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend), - [PyTorch](https://github.com/triton-inference-server/pytorch_backend), - [ONNX](https://github.com/triton-inference-server/onnxruntime_backend), - [OpenVINO](https://github.com/triton-inference-server/openvino_backend), +- Triton supports multiple execution engines, called + [backends](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton), including + [TensorRT](https://github.com/triton-inference-server/tensorrt_backend), + [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend), + [PyTorch](https://github.com/triton-inference-server/pytorch_backend), + [ONNX](https://github.com/triton-inference-server/onnxruntime_backend), + [OpenVINO](https://github.com/triton-inference-server/openvino_backend), [Python](https://github.com/triton-inference-server/python_backend), and more - Not all the above backends are supported on every platform supported by Triton. Look at the [Backend-Platform Support Matrix](https://github.com/triton-inference-server/backend/blob/main/docs/backend_platform_support_matrix.md) to learn which backends are supported on your target platform. -- Learn how to [optimize performance](docs/user_guide/optimization.md) using the +- Learn how to [optimize performance](docs/user_guide/optimization.md) using the [Performance Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md) and [Model Analyzer](https://github.com/triton-inference-server/model_analyzer) -- Learn how to [manage loading and unloading models](docs/user_guide/model_management.md) in +- Learn how to [manage loading and unloading models](docs/user_guide/model_management.md) in Triton - Send requests directly to Triton with the [HTTP/REST JSON-based or gRPC protocols](docs/customization_guide/inference_protocols.md#httprest-and-grpc-protocols) #### Client Support and Examples -A Triton *client* application sends inference and other requests to Triton. The +A Triton *client* application sends inference and other requests to Triton. The [Python and C++ client libraries](https://github.com/triton-inference-server/client) provide APIs to simplify this communication. @@ -194,25 +194,25 @@ provide APIs to simplify this communication. - Configure [HTTP](https://github.com/triton-inference-server/client#http-options) and [gRPC](https://github.com/triton-inference-server/client#grpc-options) client options -- Send input data (e.g. 
a jpeg image) directly to Triton in the [body of an HTTP +- Send input data (e.g. a jpeg image) directly to Triton in the [body of an HTTP request without any additional metadata](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_binary_data.md#raw-binary-request) ### Extend Triton -[Triton Inference Server's architecture](docs/user_guide/architecture.md) is specifically +[Triton Inference Server's architecture](docs/user_guide/architecture.md) is specifically designed for modularity and flexibility - [Customize Triton Inference Server container](docs/customization_guide/compose.md) for your use case - [Create custom backends](https://github.com/triton-inference-server/backend) in either [C/C++](https://github.com/triton-inference-server/backend/blob/main/README.md#triton-backend-api) or [Python](https://github.com/triton-inference-server/python_backend) -- Create [decouple backends and models](docs/user_guide/decoupled_models.md) that can send +- Create [decouple backends and models](docs/user_guide/decoupled_models.md) that can send multiple responses for a request or not send any responses for a request - Use a [Triton repository agent](docs/customization_guide/repository_agents.md) to add functionality - that operates when a model is loaded and unloaded, such as authentication, + that operates when a model is loaded and unloaded, such as authentication, decryption, or conversion - Deploy Triton on [Jetson and JetPack](docs/user_guide/jetson.md) -- [Use Triton on AWS +- [Use Triton on AWS Inferentia](https://github.com/triton-inference-server/python_backend/tree/main/inferentia) ### Additional Documentation @@ -227,7 +227,7 @@ Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html) ## Contributing Contributions to Triton Inference Server are more than welcome. To -contribute please review the [contribution +contribute please review the [contribution guidelines](CONTRIBUTING.md). If you have a backend, client, example or similar contribution that is not modifying the core of Triton, then you should file a PR in the [contrib @@ -235,7 +235,7 @@ repo](https://github.com/triton-inference-server/contrib). ## Reporting problems, asking questions -We appreciate any feedback, questions or bug reporting regarding this project. +We appreciate any feedback, questions or bug reporting regarding this project. When posting [issues in GitHub](https://github.com/triton-inference-server/server/issues), follow the process outlined in the [Stack Overflow document](https://stackoverflow.com/help/mcve). 
Ensure posted examples are: diff --git a/build.py b/build.py index d59bb56f9c..1339c5c6f9 100755 --- a/build.py +++ b/build.py @@ -2495,4 +2495,4 @@ def enable_all(): else: p = subprocess.Popen([f'./{script_name}'], cwd=FLAGS.build_dir) p.wait() - fail_if(p.returncode != 0, 'build failed') + fail_if(p.returncode != 0, 'build failed') \ No newline at end of file diff --git a/compose.py b/compose.py old mode 100644 new mode 100755 index 0a00883727..9f948c14fd --- a/compose.py +++ b/compose.py @@ -39,7 +39,7 @@ def log(msg, force=False): try: print(msg, file=sys.stderr) except Exception: - print('', file=sys.stderr) + print("", file=sys.stderr) def log_verbose(msg): @@ -48,7 +48,7 @@ def log_verbose(msg): def fail(msg): - print('error: {}'.format(msg), file=sys.stderr) + print("error: {}".format(msg), file=sys.stderr) sys.exit(1) @@ -58,8 +58,8 @@ def fail_if(p, msg): def start_dockerfile(ddir, images, argmap, dockerfile_name, backends): - # Set enviroment variables, set default user and install dependencies - df = ''' + # Set environment variables, set default user and install dependencies + df = """ # # Multistage build. # @@ -67,30 +67,38 @@ def start_dockerfile(ddir, images, argmap, dockerfile_name, backends): ARG TRITON_CONTAINER_VERSION={} FROM {} AS full -'''.format(argmap['TRITON_VERSION'], argmap['TRITON_CONTAINER_VERSION'], - images["full"]) +""".format( + argmap["TRITON_VERSION"], argmap["TRITON_CONTAINER_VERSION"], images["full"] + ) # PyTorch, TensorFlow 1 and TensorFlow 2 backends need extra CUDA and other # dependencies during runtime that are missing in the CPU-only base container. # These dependencies must be copied from the Triton Min image. - if not FLAGS.enable_gpu and (('pytorch' in backends) or - ('tensorflow1' in backends) or - ('tensorflow2' in backends)): - df += ''' + if not FLAGS.enable_gpu and ( + ("pytorch" in backends) + or ("tensorflow1" in backends) + or ("tensorflow2" in backends) + ): + df += """ FROM {} AS min_container -'''.format(images["gpu-min"]) +""".format( + images["gpu-min"] + ) - df += ''' + df += """ FROM {} -'''.format(images["min"]) +""".format( + images["min"] + ) import build - df += build.dockerfile_prepare_container_linux(argmap, backends, - FLAGS.enable_gpu, - platform.machine().lower()) + + df += build.dockerfile_prepare_container_linux( + argmap, backends, FLAGS.enable_gpu, platform.machine().lower() + ) # Copy over files - df += ''' + df += """ WORKDIR /opt/tritonserver COPY --chown=1000:1000 --from=full /opt/tritonserver/LICENSE . COPY --chown=1000:1000 --from=full /opt/tritonserver/TRITON_VERSION . 
@@ -98,7 +106,7 @@ def start_dockerfile(ddir, images, argmap, dockerfile_name, backends): COPY --chown=1000:1000 --from=full /opt/tritonserver/bin bin/ COPY --chown=1000:1000 --from=full /opt/tritonserver/lib lib/ COPY --chown=1000:1000 --from=full /opt/tritonserver/include include/ -''' +""" with open(os.path.join(ddir, dockerfile_name), "w") as dfile: dfile.write(df) @@ -106,13 +114,15 @@ def start_dockerfile(ddir, images, argmap, dockerfile_name, backends): def add_requested_backends(ddir, dockerfile_name, backends): df = "# Copying over backends \n" for backend in backends: - df += '''COPY --chown=1000:1000 --from=full /opt/tritonserver/backends/{} /opt/tritonserver/backends/{} -'''.format(backend, backend) + df += """COPY --chown=1000:1000 --from=full /opt/tritonserver/backends/{} /opt/tritonserver/backends/{} +""".format( + backend, backend + ) if len(backends) > 0: - df += ''' + df += """ # Top-level /opt/tritonserver/backends not copied so need to explicitly set permissions here RUN chown triton-server:triton-server /opt/tritonserver/backends -''' +""" with open(os.path.join(ddir, dockerfile_name), "a") as dfile: dfile.write(df) @@ -120,13 +130,15 @@ def add_requested_backends(ddir, dockerfile_name, backends): def add_requested_repoagents(ddir, dockerfile_name, repoagents): df = "# Copying over repoagents \n" for ra in repoagents: - df += '''COPY --chown=1000:1000 --from=full /opt/tritonserver/repoagents/{} /opt/tritonserver/repoagents/{} -'''.format(ra, ra) + df += """COPY --chown=1000:1000 --from=full /opt/tritonserver/repoagents/{} /opt/tritonserver/repoagents/{} +""".format( + ra, ra + ) if len(repoagents) > 0: - df += ''' + df += """ # Top-level /opt/tritonserver/repoagents not copied so need to explicitly set permissions here RUN chown triton-server:triton-server /opt/tritonserver/repoagents -''' +""" with open(os.path.join(ddir, dockerfile_name), "a") as dfile: dfile.write(df) @@ -134,13 +146,15 @@ def add_requested_repoagents(ddir, dockerfile_name, repoagents): def add_requested_caches(ddir, dockerfile_name, caches): df = "# Copying over caches \n" for cache in caches: - df += '''COPY --chown=1000:1000 --from=full /opt/tritonserver/caches/{} /opt/tritonserver/caches/{} -'''.format(cache, cache) + df += """COPY --chown=1000:1000 --from=full /opt/tritonserver/caches/{} /opt/tritonserver/caches/{} +""".format( + cache, cache + ) if len(caches) > 0: - df += ''' + df += """ # Top-level /opt/tritonserver/caches not copied so need to explicitly set permissions here RUN chown triton-server:triton-server /opt/tritonserver/caches -''' +""" with open(os.path.join(ddir, dockerfile_name), "a") as dfile: dfile.write(df) @@ -148,33 +162,44 @@ def add_requested_caches(ddir, dockerfile_name, caches): def end_dockerfile(ddir, dockerfile_name, argmap): # Install additional dependencies df = "" - if argmap['SAGEMAKER_ENDPOINT']: - df += ''' + if argmap["SAGEMAKER_ENDPOINT"]: + df += """ LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true COPY --chown=1000:1000 --from=full /usr/bin/serve /usr/bin/. 
-''' +""" with open(os.path.join(ddir, dockerfile_name), "a") as dfile: dfile.write(df) def build_docker_image(ddir, dockerfile_name, container_name): # Create container with docker build - p = subprocess.Popen(['docker', 'build', '-t', container_name, '-f', \ - os.path.join(ddir, dockerfile_name), '.']) + p = subprocess.Popen( + [ + "docker", + "build", + "-t", + container_name, + "-f", + os.path.join(ddir, dockerfile_name), + ".", + ] + ) p.wait() - fail_if(p.returncode != 0, 'docker build {} failed'.format(container_name)) + fail_if(p.returncode != 0, "docker build {} failed".format(container_name)) def get_container_version_if_not_specified(): if FLAGS.container_version is None: # Read from TRITON_VERSION file in server repo to determine version - with open('TRITON_VERSION', "r") as vfile: + with open("TRITON_VERSION", "r") as vfile: version = vfile.readline().strip() import build + _, FLAGS.container_version = build.container_versions( - version, None, FLAGS.container_version) - log('version {}'.format(version)) - log('using container version {}'.format(FLAGS.container_version)) + version, None, FLAGS.container_version + ) + log("version {}".format(version)) + log("using container version {}".format(FLAGS.container_version)) def create_argmap(images, skip_pull): @@ -183,210 +208,246 @@ def create_argmap(images, skip_pull): full_docker_image = images["full"] min_docker_image = images["min"] enable_gpu = FLAGS.enable_gpu - # Docker inspect enviroment variables - base_run_args = ['docker', 'inspect', '-f'] - import re # parse all PATH enviroment variables + # Docker inspect environment variables + base_run_args = ["docker", "inspect", "-f"] + import re # parse all PATH environment variables # first pull docker images if not skip_pull: log("pulling container:{}".format(full_docker_image)) - p = subprocess.run(['docker', 'pull', full_docker_image]) + p = subprocess.run(["docker", "pull", full_docker_image]) fail_if( - p.returncode != 0, 'docker pull container {} failed, {}'.format( - full_docker_image, p.stderr)) + p.returncode != 0, + "docker pull container {} failed, {}".format(full_docker_image, p.stderr), + ) if enable_gpu: if not skip_pull: - pm = subprocess.run(['docker', 'pull', min_docker_image]) + pm = subprocess.run(["docker", "pull", min_docker_image]) fail_if( pm.returncode != 0 and not skip_pull, - 'docker pull container {} failed, {}'.format( - min_docker_image, pm.stderr)) - pm_path = subprocess.run(base_run_args + [ - '{{range $index, $value := .Config.Env}}{{$value}} {{end}}', - min_docker_image - ], - capture_output=True, - text=True) + "docker pull container {} failed, {}".format( + min_docker_image, pm.stderr + ), + ) + pm_path = subprocess.run( + base_run_args + + [ + "{{range $index, $value := .Config.Env}}{{$value}} {{end}}", + min_docker_image, + ], + capture_output=True, + text=True, + ) fail_if( pm_path.returncode != 0, - 'docker inspect to find triton enviroment variables for min container failed, {}' - .format(pm_path.stderr)) + "docker inspect to find triton environment variables for min container failed, {}".format( + pm_path.stderr + ), + ) # min container needs to be GPU-support-enabled if the build is GPU build vars = pm_path.stdout e = re.search("CUDA_VERSION", vars) gpu_enabled = False if e is None else True fail_if( not gpu_enabled, - 'Composing container with gpu support enabled but min container provided does not have CUDA installed' + "Composing container with gpu support enabled but min container provided does not have CUDA installed", ) - # Check 
full container enviroment variables - p_path = subprocess.run(base_run_args + [ - '{{range $index, $value := .Config.Env}}{{$value}} {{end}}', - full_docker_image - ], - capture_output=True, - text=True) + # Check full container environment variables + p_path = subprocess.run( + base_run_args + + [ + "{{range $index, $value := .Config.Env}}{{$value}} {{end}}", + full_docker_image, + ], + capture_output=True, + text=True, + ) fail_if( p_path.returncode != 0, - 'docker inspect to find enviroment variables for full container failed, {}' - .format(p_path.stderr)) + "docker inspect to find environment variables for full container failed, {}".format( + p_path.stderr + ), + ) vars = p_path.stdout log_verbose("inspect args: {}".format(vars)) e0 = re.search("TRITON_SERVER_GPU_ENABLED=([\S]{1,}) ", vars) e1 = re.search("CUDA_VERSION", vars) gpu_enabled = False - if (e0 != None): + if e0 != None: gpu_enabled = e0.group(1) == "1" - elif (e1 != None): + elif e1 != None: gpu_enabled = True fail_if( gpu_enabled != enable_gpu, - 'Error: full container provided was build with ' - '\'TRITON_SERVER_GPU_ENABLED\' as {} and you are composing container' - 'with \'TRITON_SERVER_GPU_ENABLED\' as {}'.format( - gpu_enabled, enable_gpu)) + "Error: full container provided was build with " + "'TRITON_SERVER_GPU_ENABLED' as {} and you are composing container" + "with 'TRITON_SERVER_GPU_ENABLED' as {}".format(gpu_enabled, enable_gpu), + ) e = re.search("TRITON_SERVER_VERSION=([\S]{6,}) ", vars) version = "" if e is None else e.group(1) fail_if( len(version) == 0, - 'docker inspect to find triton server version failed, {}'.format( - p_path.stderr)) + "docker inspect to find triton server version failed, {}".format(p_path.stderr), + ) e = re.search("NVIDIA_TRITON_SERVER_VERSION=([\S]{5,}) ", vars) container_version = "" if e is None else e.group(1) fail_if( len(container_version) == 0, - 'docker inspect to find triton container version failed, {}'.format( - vars)) + "docker inspect to find triton container version failed, {}".format(vars), + ) dcgm_ver = re.search("DCGM_VERSION=([\S]{4,}) ", vars) dcgm_version = "" if dcgm_ver is None: dcgm_version = "2.2.3" - log("WARNING: DCGM version not found from image, installing the earlierst version {}" - .format(dcgm_version)) + log( + "WARNING: DCGM version not found from image, installing the earlierst version {}".format( + dcgm_version + ) + ) else: dcgm_version = dcgm_ver.group(1) fail_if( len(dcgm_version) == 0, - 'docker inspect to find DCGM version failed, {}'.format(vars)) + "docker inspect to find DCGM version failed, {}".format(vars), + ) p_sha = subprocess.run( - base_run_args + - ['{{ index .Config.Labels "com.nvidia.build.ref"}}', full_docker_image], + base_run_args + + ['{{ index .Config.Labels "com.nvidia.build.ref"}}', full_docker_image], capture_output=True, - text=True) + text=True, + ) fail_if( p_sha.returncode != 0, - 'docker inspect of upstream docker image build sha failed, {}'.format( - p_sha.stderr)) + "docker inspect of upstream docker image build sha failed, {}".format( + p_sha.stderr + ), + ) p_build = subprocess.run( - base_run_args + - ['{{ index .Config.Labels "com.nvidia.build.id"}}', full_docker_image], + base_run_args + + ['{{ index .Config.Labels "com.nvidia.build.id"}}', full_docker_image], capture_output=True, - text=True) + text=True, + ) fail_if( p_build.returncode != 0, - 'docker inspect of upstream docker image build sha failed, {}'.format( - p_build.stderr)) + "docker inspect of upstream docker image build sha failed, {}".format( + 
p_build.stderr + ), + ) p_find = subprocess.run( - ['docker', 'run', full_docker_image, 'bash', '-c', 'ls /usr/bin/'], + ["docker", "run", full_docker_image, "bash", "-c", "ls /usr/bin/"], capture_output=True, - text=True) + text=True, + ) f = re.search("serve", p_find.stdout) - fail_if(p_find.returncode != 0, - "Cannot search for 'serve' in /usr/bin, {}".format(p_find.stderr)) + fail_if( + p_find.returncode != 0, + "Cannot search for 'serve' in /usr/bin, {}".format(p_find.stderr), + ) argmap = { - 'NVIDIA_BUILD_REF': p_sha.stdout.rstrip(), - 'NVIDIA_BUILD_ID': p_build.stdout.rstrip(), - 'TRITON_VERSION': version, - 'TRITON_CONTAINER_VERSION': container_version, - 'DCGM_VERSION': dcgm_version, - 'SAGEMAKER_ENDPOINT': f is not None, + "NVIDIA_BUILD_REF": p_sha.stdout.rstrip(), + "NVIDIA_BUILD_ID": p_build.stdout.rstrip(), + "TRITON_VERSION": version, + "TRITON_CONTAINER_VERSION": container_version, + "DCGM_VERSION": dcgm_version, + "SAGEMAKER_ENDPOINT": f is not None, } return argmap -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() group_qv = parser.add_mutually_exclusive_group() - group_qv.add_argument('-q', - '--quiet', - action="store_true", - required=False, - help='Disable console output.') - group_qv.add_argument('-v', - '--verbose', - action="store_true", - required=False, - help='Enable verbose output.') + group_qv.add_argument( + "-q", + "--quiet", + action="store_true", + required=False, + help="Disable console output.", + ) + group_qv.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + help="Enable verbose output.", + ) parser.add_argument( - '--output-name', + "--output-name", type=str, required=False, - help='Name for the generated Docker image. Default is "tritonserver".') + help='Name for the generated Docker image. Default is "tritonserver".', + ) parser.add_argument( - '--work-dir', + "--work-dir", type=str, required=False, - help= - 'Generated dockerfiles are placed here. Default to current directory.') + help="Generated dockerfiles are placed here. Default to current directory.", + ) parser.add_argument( - '--container-version', + "--container-version", type=str, required=False, - help= - 'The version to use for the generated Docker image. If not specified ' - 'the container version will be chosen automatically based on the ' - 'repository branch.') + help="The version to use for the generated Docker image. If not specified " + "the container version will be chosen automatically based on the " + "repository branch.", + ) parser.add_argument( - '--image', - action='append', + "--image", + action="append", required=False, - help='Use specified Docker image to generate Docker image. Specified as ' + help="Use specified Docker image to generate Docker image. Specified as " ',. can be "min", "gpu-min" ' 'or "full". Both "min" and "full" need to be specified at the same time.' 'This will override "--container-version". 
"gpu-min" is needed for ' - 'CPU-only container to copy TensorFlow and PyTorch deps.') - parser.add_argument('--enable-gpu', - nargs='?', - type=lambda x: (str(x).lower() == 'true'), - const=True, - default=True, - required=False, - help=argparse.SUPPRESS) + "CPU-only container to copy TensorFlow and PyTorch deps.", + ) + parser.add_argument( + "--enable-gpu", + nargs="?", + type=lambda x: (str(x).lower() == "true"), + const=True, + default=True, + required=False, + help=argparse.SUPPRESS, + ) parser.add_argument( - '--backend', - action='append', + "--backend", + action="append", required=False, - help= - 'Include in the generated Docker image. The flag may be ' - 'specified multiple times.') + help="Include in the generated Docker image. The flag may be " + "specified multiple times.", + ) parser.add_argument( - '--repoagent', - action='append', + "--repoagent", + action="append", required=False, - help= - 'Include in the generated Docker image. The flag may ' - 'be specified multiple times.') + help="Include in the generated Docker image. The flag may " + "be specified multiple times.", + ) parser.add_argument( - '--cache', - action='append', + "--cache", + action="append", required=False, - help='Include in the generated Docker image. The flag may ' - 'be specified multiple times.') + help="Include in the generated Docker image. The flag may " + "be specified multiple times.", + ) parser.add_argument( - '--skip-pull', - action='store_true', + "--skip-pull", + action="store_true", required=False, - help='Do not pull the required docker images. The user is responsible ' - 'for pulling the upstream images needed to compose the image.') + help="Do not pull the required docker images. The user is responsible " + "for pulling the upstream images needed to compose the image.", + ) parser.add_argument( - '--dry-run', + "--dry-run", action="store_true", required=False, - help='Only creates Dockerfile.compose, does not build the Docker image.' 
+ help="Only creates Dockerfile.compose, does not build the Docker image.", ) FLAGS = parser.parse_args() @@ -396,7 +457,7 @@ def create_argmap(images, skip_pull): if FLAGS.output_name is None: FLAGS.output_name = "tritonserver" - dockerfile_name = 'Dockerfile.compose' + dockerfile_name = "Dockerfile.compose" if FLAGS.backend is None: FLAGS.backend = [] @@ -409,54 +470,56 @@ def create_argmap(images, skip_pull): images = {} if FLAGS.image: for img in FLAGS.image: - parts = img.split(',') + parts = img.split(",") fail_if( len(parts) != 2, - '--image must specific ,') + "--image must specific ,", + ) fail_if( - parts[0] not in ['min', 'full', 'gpu-min'], - 'unsupported image-name \'{}\' for --image'.format(parts[0])) + parts[0] not in ["min", "full", "gpu-min"], + "unsupported image-name '{}' for --image".format(parts[0]), + ) log('image "{}": "{}"'.format(parts[0], parts[1])) images[parts[0]] = parts[1] else: get_container_version_if_not_specified() if FLAGS.enable_gpu: images = { - "full": - "nvcr.io/nvidia/tritonserver:{}-py3".format( - FLAGS.container_version), - "min": - "nvcr.io/nvidia/tritonserver:{}-py3-min".format( - FLAGS.container_version) + "full": "nvcr.io/nvidia/tritonserver:{}-py3".format( + FLAGS.container_version + ), + "min": "nvcr.io/nvidia/tritonserver:{}-py3-min".format( + FLAGS.container_version + ), } else: images = { - "full": - "nvcr.io/nvidia/tritonserver:{}-cpu-only-py3".format( - FLAGS.container_version), - "min": - "ubuntu:22.04" + "full": "nvcr.io/nvidia/tritonserver:{}-cpu-only-py3".format( + FLAGS.container_version + ), + "min": "ubuntu:22.04", } - fail_if( - len(images) < 2, - "Need to specify both 'full' and 'min' images if at all") + fail_if(len(images) < 2, "Need to specify both 'full' and 'min' images if at all") # For CPU-only image we need to copy some cuda libraries and dependencies # since we are using PyTorch, TensorFlow 1, TensorFlow 2 containers that # are not CPU-only. - if (('pytorch' in FLAGS.backend) or ('tensorflow1' in FLAGS.backend) or - ('tensorflow2' in FLAGS.backend)) and ('gpu-min' not in images): + if ( + ("pytorch" in FLAGS.backend) + or ("tensorflow1" in FLAGS.backend) + or ("tensorflow2" in FLAGS.backend) + ) and ("gpu-min" not in images): images["gpu-min"] = "nvcr.io/nvidia/tritonserver:{}-py3-min".format( - FLAGS.container_version) + FLAGS.container_version + ) argmap = create_argmap(images, FLAGS.skip_pull) - start_dockerfile(FLAGS.work_dir, images, argmap, dockerfile_name, - FLAGS.backend) + start_dockerfile(FLAGS.work_dir, images, argmap, dockerfile_name, FLAGS.backend) add_requested_backends(FLAGS.work_dir, dockerfile_name, FLAGS.backend) add_requested_repoagents(FLAGS.work_dir, dockerfile_name, FLAGS.repoagent) add_requested_caches(FLAGS.work_dir, dockerfile_name, FLAGS.cache) end_dockerfile(FLAGS.work_dir, dockerfile_name, argmap) - if (not FLAGS.dry_run): + if not FLAGS.dry_run: build_docker_image(FLAGS.work_dir, dockerfile_name, FLAGS.output_name) diff --git a/deploy/alibaba-cloud/README.md b/deploy/alibaba-cloud/README.md index 1dea4ede11..0521eb704f 100644 --- a/deploy/alibaba-cloud/README.md +++ b/deploy/alibaba-cloud/README.md @@ -26,7 +26,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
--> -# Deploy Triton Inference Server on PAI-EAS +# Deploy Triton Inference Server on PAI-EAS * Table Of Contents - [Description](https://yuque.alibaba-inc.com/pai/blade/mtptqc#Description) - [Prerequisites](https://yuque.alibaba-inc.com/pai/blade/mtptqc#Prerequisites) @@ -57,11 +57,11 @@ Download the tensorflow inception model via [fetch_model.sh](https://github.com/ The following is the json we use when creating a Triton Server on EAS. ``` { - "name": "", + "name": "", "processor": "triton", "processor_params": [ - "--model-repository=oss://triton-model-repo/models", - "--allow-grpc=true", + "--model-repository=oss://triton-model-repo/models", + "--allow-grpc=true", "--allow-http=true" ], "metadata": { diff --git a/deploy/aws/README.md b/deploy/aws/README.md index 600f8c953f..cbde5610ce 100644 --- a/deploy/aws/README.md +++ b/deploy/aws/README.md @@ -39,10 +39,10 @@ This guide assumes you already have a functional Kubernetes cluster and helm installed (see below for instructions on installing helm). Note the following requirements: -* The helm chart deploys Prometheus and Grafana to collect and display Triton metrics. To use this helm chart you must install Prpmetheus and Grafana in your cluster as described below and your cluster must contain sufficient CPU resourses to support these services. +* The helm chart deploys Prometheus and Grafana to collect and display Triton metrics. To use this helm chart you must install Prpmetheus and Grafana in your cluster as described below and your cluster must contain sufficient CPU resources to support these services. * If you want Triton Server to use GPUs for inferencing, your cluster -must be configured to contain the desired number of GPU nodes (EC2 G4 instances recommended) +must be configured to contain the desired number of GPU nodes (EC2 G4 instances recommended) with support for the NVIDIA driver and CUDA version required by the version of the inference server you are using. @@ -67,7 +67,7 @@ please see the [official migration guide](https://helm.sh/docs/topics/v2_v3_migr > **NOTE**: Moving forward this chart will only be tested and maintained for Helm v3. -Below are example instructions for installing Helm v2. +Below are example instructions for installing Helm v2. ``` $ curl https://raw.githubusercontent.com/helm/helm/master/scripts/get | bash diff --git a/deploy/aws/templates/deployment.yaml b/deploy/aws/templates/deployment.yaml index 24f3f65380..48ef82160d 100644 --- a/deploy/aws/templates/deployment.yaml +++ b/deploy/aws/templates/deployment.yaml @@ -56,7 +56,7 @@ spec: limits: nvidia.com/gpu: {{ .Values.image.numGpus }} - args: ["tritonserver", "--model-store={{ .Values.image.modelRepositoryPath }}", + args: ["tritonserver", "--model-store={{ .Values.image.modelRepositoryPath }}", "--model-control-mode=poll", "--repository-poll-secs=5"] @@ -94,7 +94,7 @@ spec: httpGet: path: /v2/health/ready port: http - + securityContext: runAsUser: 1000 fsGroup: 1000 diff --git a/deploy/fleetcommand/README.md b/deploy/fleetcommand/README.md index 88a05af34b..996b7598cc 100644 --- a/deploy/fleetcommand/README.md +++ b/deploy/fleetcommand/README.md @@ -87,7 +87,7 @@ echo -n 'AWS_SESSION_TOKEN' | base64 Deploy the Triton Inference Server to your Location in Fleet Command by creating a Deployment. You can specify configuration parameters to override the default -[values.yaml](values.yaml) in the Application Configuration section. +[values.yaml](values.yaml) in the Application Configuration section. 
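For illustration only, the same kind of override can be expressed with Helm's `--set` syntax when installing a comparable Triton chart directly; in Fleet Command the equivalent keys are entered as YAML in the Application Configuration section instead. This is a sketch rather than the chart's documented interface: the key names below are borrowed from the Helm charts touched elsewhere in this patch, and the bucket path is a placeholder, so adjust both to match your deployment.

```bash
# Sketch: value overrides for a direct Helm install of a Triton chart.
# In Fleet Command, supply the same keys as YAML in Application Configuration.
helm install triton-inference-server . \
  --set image.modelRepositoryPath=s3://<your-bucket>/model_repository \
  --set serviceMonitor.enabled=true
```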
*Note:* You _must_ provide a `--model-repository` parameter with a path to your prepared model repository in your S3 bucket. Otherwise, the Triton will not @@ -114,7 +114,7 @@ for more info. If you have `prometheus-operator` deployed, you can enable the ServiceMonitor for the Triton Inference Server by setting `serviceMonitor.enabled: true` in Application Configuration. This will also deploy a Grafana dashboard for Triton -as a ConfigMap. +as a ConfigMap. Otherwise, metrics can be scraped by pointing an external Prometheus instance at the `metricsNodePort` in the values. diff --git a/deploy/gcp/README.md b/deploy/gcp/README.md index 0530df412e..b1ed1d2d91 100644 --- a/deploy/gcp/README.md +++ b/deploy/gcp/README.md @@ -72,7 +72,7 @@ please see the [official migration guide](https://helm.sh/docs/topics/v2_v3_migr > **NOTE**: Moving forward this chart will only be tested and maintained for Helm v3. -Below are example instructions for installing Helm v2. +Below are example instructions for installing Helm v2. ``` $ curl https://raw.githubusercontent.com/helm/helm/master/scripts/get | bash diff --git a/deploy/gke-marketplace-app/README.md b/deploy/gke-marketplace-app/README.md index 1d805c68d2..e99b9efbae 100644 --- a/deploy/gke-marketplace-app/README.md +++ b/deploy/gke-marketplace-app/README.md @@ -38,23 +38,23 @@ ## Description -This repository contains Google Kubernetes Engine(GKE) Marketplace Application for NVIDIA Triton Inference Server deployer. +This repository contains Google Kubernetes Engine(GKE) Marketplace Application for NVIDIA Triton Inference Server deployer. - Triton GKE deployer is a helm chart deployer recommended by GKE Marketplace - Triton GKE deployer deploys a GKE ingress which accepts public inference requests - Triton GKE deployer includes a horizontal pod autoscaler(HPA) which relies on [stack driver custom metrics adaptor](https://github.com/GoogleCloudPlatform/k8s-stackdriver/tree/master/custom-metrics-stackdriver-adapter) to monitor GPU duty cycle, and auto scale GPU nodes. - - This repo also contains a sample to generate BERT model with TensorRT and use Locust to experiment with GPU node autoscaling and monitor client latency/throughput. + - This repo also contains a sample to generate BERT model with TensorRT and use Locust to experiment with GPU node autoscaling and monitor client latency/throughput. ![Cloud Architecture Diagram](diagram.png) ## Prerequisites - - [Install Google Cloud SDK on your laptop/client workstation](https://cloud.google.com/sdk/docs/install), so that `gcloud` SDK cli interface could be run on the client and sign in with your GCP credentials. + - [Install Google Cloud SDK on your laptop/client workstation](https://cloud.google.com/sdk/docs/install), so that `gcloud` SDK cli interface could be run on the client and sign in with your GCP credentials. - In addition, user could leverage [Google Cloud shell](https://cloud.google.com/shell/docs/launching-cloud-shell). ## Demo Instruction -First, install this Triton GKE app to an existing GKE cluster with GPU node pool, Google Cloud Marketplace currently doesn't support auto creation of GPU clusters. User has to run following command to create a compatible cluster (gke version >=1.18.7) with GPU node pools, we recommend user to select T4 or A100(MIG) instances type and choose CPU ratio based on profiling of actual inference workflow. 
+First, install this Triton GKE app to an existing GKE cluster with GPU node pool, Google Cloud Marketplace currently doesn't support auto creation of GPU clusters. User has to run following command to create a compatible cluster (gke version >=1.18.7) with GPU node pools, we recommend user to select T4 or A100(MIG) instances type and choose CPU ratio based on profiling of actual inference workflow. Users need to follow these [instructions](https://cloud.google.com/kubernetes-engine/docs/how-to/kubernetes-service-accounts#creating_a_kubernetes_service_account) to create a kubernetes service account. In this example, we use `gke-test@k80-exploration.iam.gserviceaccount.com`. Make sure it has access to artifact registry and monitoring viewer. For example, to grant access to custom metrics which is required for HPA to work: ``` @@ -65,7 +65,7 @@ gcloud iam service-accounts add-iam-policy-binding --role \ kubectl annotate serviceaccount --namespace custom-metrics \ custom-metrics-stackdriver-adapter \ - iam.gke.io/gcp-service-account=@.iam.gserviceaccount.com + iam.gke.io/gcp-service-account=@.iam.gserviceaccount.com ``` Currently, GKE >= 1.18.7 only supported in GKE rapid channel, to find the latest version, please visit [GKE release notes](https://cloud.google.com/kubernetes-engine/docs/release-notes). @@ -104,10 +104,10 @@ gcloud container node-pools create accel \ --verbosity error # so that you can run kubectl locally to the cluster -gcloud container clusters get-credentials ${DEPLOYMENT_NAME} --project ${PROJECT_ID} --zone ${ZONE} +gcloud container clusters get-credentials ${DEPLOYMENT_NAME} --project ${PROJECT_ID} --zone ${ZONE} # deploy NVIDIA device plugin for GKE to prepare GPU nodes for driver install -kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml +kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml # make sure you can run kubectl locally to access the cluster kubectl create clusterrolebinding cluster-admin-binding --clusterrole cluster-admin --user "$(gcloud config get-value account)" @@ -119,7 +119,7 @@ kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/k8s-stack gcloud compute addresses create ingress-triton --global ``` -Creating a cluster and adding GPU nodes could take up-to 10 minutes. Please be patient after executing this command. GPU resources in GCP could be fully utilized, so please try a different zone in case compute resource cannot be allocated. After GKE cluster is running, run `kubectl get pods --all-namespaces` to make sure the client can access the cluster correctly: +Creating a cluster and adding GPU nodes could take up-to 10 minutes. Please be patient after executing this command. GPU resources in GCP could be fully utilized, so please try a different zone in case compute resource cannot be allocated. After GKE cluster is running, run `kubectl get pods --all-namespaces` to make sure the client can access the cluster correctly: If user would like to experiment with A100 MIG partitioned GPU in GKE, please create node pool with following command: ``` @@ -137,14 +137,14 @@ gcloud beta container node-pools create accel \ --verbosity error ``` -Please note that A100 MIG in GKE does not support GPU metrics yet, also Triton GPU Metrics is not compatiable with A100 MIG. 
Hence, please disable GPU metrics by unselect allowGPUMetrics while deploy Triton GKE app. Also for the same reason, this deployer doesn't support inference workfload auto-scaling on A100 MIG as well. +Please note that A100 MIG in GKE does not support GPU metrics yet, also Triton GPU Metrics is not compatible with A100 MIG. Hence, please disable GPU metrics by unselect allowGPUMetrics while deploy Triton GKE app. Also for the same reason, this deployer doesn't support inference workfload auto-scaling on A100 MIG as well. -Second, go to this [GKE Marketplace link](https://console.cloud.google.com/marketplace/details/nvidia-ngc-public/triton-inference-server) to deploy Triton application. +Second, go to this [GKE Marketplace link](https://console.cloud.google.com/marketplace/details/nvidia-ngc-public/triton-inference-server) to deploy Triton application. Users can leave everything as default if their models have already been tested/validated with Triton. They can provide a GCS path pointing to the model repository containing their models. By default, we provide a BERT large model optimized by TensorRT in a public demo GCS bucket that is compatible with the `xx.yy` release of Triton Server in `gs://triton_sample_models/xx_yy`. However, please take note of the following about this demo bucket: -- The TensorRT engine provided in the demo bucket is only compatible with Tesla T4 GPUs. +- The TensorRT engine provided in the demo bucket is only compatible with Tesla T4 GPUs. - This bucket is located in `us-central1`, so loading from this bucket into Triton in other regions may be affected. -- The first deployment of this Triton GKE application will be slower than consecutive runs because the image needs to be pulled into the GKE cluster. +- The first deployment of this Triton GKE application will be slower than consecutive runs because the image needs to be pulled into the GKE cluster. - You can find an example of how this model is generated and uploaded [here](trt-engine/README.md). Where is the version of NGC Triton container needed. @@ -167,7 +167,7 @@ If User selected deploy Triton to accept HTTP request, please launch [Locust](ht locust -f locustfile_bert.py -H http://${INGRESS_HOST}:${INGRESS_PORT} ``` -The client example push about ~650 QPS(Query per second) to Triton Server, and will trigger a auto scale of T4 GPU nodes (We recommend to use T4 and A100[MIG] for inference). From locust UI, we will observer a drop of latency mean and variance for the requests. At the end, after autoscaling, we see the latency stablized at ~200 ms, end to end from US client to europe server, which is excellent for a model that has 345 million parameters. Since for each node, we use 1T4 + n1-standard-4 instance, and it can handle ~450 QPS, with on-demand price, it is ($0.35+$0.19)=$0.54/hr, that translate to 3 million inference per dollar for BERT large model at batch size 1. Further more, with 3 year commitment price, hr rate is ($0.16+$0.08)=$0.24/hr, that translate to 6.75 million inference per dollar. +The client example push about ~650 QPS(Query per second) to Triton Server, and will trigger a auto scale of T4 GPU nodes (We recommend to use T4 and A100[MIG] for inference). From locust UI, we will observer a drop of latency mean and variance for the requests. At the end, after autoscaling, we see the latency stablized at ~200 ms, end to end from US client to europe server, which is excellent for a model that has 345 million parameters. 
Since for each node, we use 1T4 + n1-standard-4 instance, and it can handle ~450 QPS, with on-demand price, it is ($0.35+$0.19)=$0.54/hr, that translate to 3 million inference per dollar for BERT large model at batch size 1. Further more, with 3 year commitment price, hr rate is ($0.16+$0.08)=$0.24/hr, that translate to 6.75 million inference per dollar. ![Locust Client Chart](client.png) @@ -197,5 +197,5 @@ See the following resources to learn more about NVIDIA Triton Inference Server a ## Known Issues -- GKE one click cluster creation doesn't support GPU node pools at the moment, users have to mannually create a compatible (>=1.18.7) cluster and attach node pool (T4 and A100 MIG recommended) +- GKE one click cluster creation doesn't support GPU node pools at the moment, users have to manually create a compatible (>=1.18.7) cluster and attach node pool (T4 and A100 MIG recommended) - When Horizontal Pod Autoscaler(HPA) expand and all GPU node pool already utilized, GKE will request new GPU node and it can take between 4-7 minutes, it could be a long wait plus GPU driver install and image pulling. We recommend user to leverage multi-tier model serving and Triton's priority feature to create cushion for latency critical models, and allocate active standby GPU node for spike of requests. diff --git a/deploy/gke-marketplace-app/benchmark/README.md b/deploy/gke-marketplace-app/benchmark/README.md index c350b931dc..c9c502e1b0 100644 --- a/deploy/gke-marketplace-app/benchmark/README.md +++ b/deploy/gke-marketplace-app/benchmark/README.md @@ -49,30 +49,30 @@ We the place the model into a GCS with following structure, `config.pbtxt` was p ├── bert_base_trt_gpu_seqlen128 │ ├── 1 │ │ └── model.plan - │ └── config.pbtxt + │ └── config.pbtxt ├── bert_base_tf_gpu │ ├── 1 │ │ └── model.savedmodel - │ └── config.pbtxt + │ └── config.pbtxt ├── bert_base_tf_cpu │ ├── 1 │ │ └── model.savedmodel │ └── config.pbtxt - ├── bert_distill_tf_gpu + ├── bert_distill_tf_gpu │ ├── 1 │ │ └── model.savedmodel │ └── config.pbtxt └── bert_distill_tf_cpu ├── 1 │ └── model.savedmodel - └── config.pbtxt + └── config.pbtxt ``` -When deploy Triton GKE application, point the model repository to directory contains the structure above with actual models. +When deploy Triton GKE application, point the model repository to directory contains the structure above with actual models. ## Performance -We use perf analyzer of Triton to benchmark the performance of each model, the perf analyzer reside in another pod of the GKE cluster. +We use perf analyzer of Triton to benchmark the performance of each model, the perf analyzer reside in another pod of the GKE cluster. ```bash export INGRESS_HOST=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.status.loadBalancer.ingress[0].ip}') export INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.spec.ports[?(@.name=="http2")].port}') @@ -91,6 +91,5 @@ GPU TensorRT BERT BASE: latency: 50ms, throughput: 465 qps With n1-standard-96 priced at $4.56/hr and n1-standard-4 at $0.19/hr and T4 at $0.35/hr totaling $0.54/hr. While achieving a much lower latency, the TCO of BERT inference with TensorRT on T4 is over 163 times that of Distill BERT inference on n1-standard-96. 
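To reproduce the numbers above, a benchmarking run can be driven with `perf_analyzer` against the ingress exported earlier. The invocation below is a minimal sketch under those assumptions (model name taken from the repository layout shown above, default HTTP protocol, concurrency swept from 1 to 8); tune the batch size and concurrency range to your own latency budget.

```bash
# Sketch: benchmark the TensorRT BERT model through the GKE ingress.
# Assumes INGRESS_HOST/INGRESS_PORT are exported as shown earlier and that
# perf_analyzer is available (it ships in the tritonserver *-py3-sdk image).
perf_analyzer \
  -m bert_base_trt_gpu_seqlen128 \
  -u ${INGRESS_HOST}:${INGRESS_PORT} \
  -b 1 \
  --concurrency-range 1:8:2
```

As a rough sanity check on the cost figures quoted above, about 465 QPS on a $0.54/hr T4 node works out to 465 × 3600 / 0.54 ≈ 3.1 million inferences per dollar, consistent with the TCO comparison in this section.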
- - \ No newline at end of file + diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_gpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_gpu/config.pbtxt index b46aa21f5e..f369db917f 100644 --- a/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_gpu/config.pbtxt +++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_gpu/config.pbtxt @@ -24,7 +24,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -max_batch_size: 4 +max_batch_size: 4 dynamic_batching { preferred_batch_size: 4 max_queue_delay_microseconds: 200000 diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu/config.pbtxt index 9cc4dd4551..f3b83d5725 100644 --- a/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu/config.pbtxt +++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu/config.pbtxt @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. platform: "tensorrt_plan" -max_batch_size: 4 +max_batch_size: 4 dynamic_batching { preferred_batch_size: 4 max_queue_delay_microseconds: 200000 diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_cpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_cpu/config.pbtxt index 9b236c9092..3bfccb5c45 100644 --- a/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_cpu/config.pbtxt +++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_cpu/config.pbtxt @@ -24,7 +24,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -max_batch_size: 4 +max_batch_size: 4 dynamic_batching { preferred_batch_size: 1 max_queue_delay_microseconds: 2000000 diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_gpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_gpu/config.pbtxt index b46aa21f5e..f369db917f 100644 --- a/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_gpu/config.pbtxt +++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_gpu/config.pbtxt @@ -24,7 +24,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-max_batch_size: 4 +max_batch_size: 4 dynamic_batching { preferred_batch_size: 4 max_queue_delay_microseconds: 200000 diff --git a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/perf_query.sh b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/perf_query.sh old mode 100644 new mode 100755 diff --git a/deploy/gke-marketplace-app/client-sample/bert_request.json b/deploy/gke-marketplace-app/client-sample/bert_request.json index b918815147..ce4b956db6 100644 --- a/deploy/gke-marketplace-app/client-sample/bert_request.json +++ b/deploy/gke-marketplace-app/client-sample/bert_request.json @@ -4,19 +4,19 @@ "shape": [1, 128], "datatype": "INT32", "parameters": {}, - "data": [101, 2054, 2003, 23435, 5339, 1029, 102, 23435, 5339, 2003, 1037, 2152, 2836, 2784, 4083, 28937, 4132, 2008, 18058, 2659, 2397, 9407, 1998, 2152, 2083, 18780, 2005, 18726, 2107, 2004, 16755, 2545, 1010, 4613, 1998, 3746, 1013, 2678, 2006, 1050, 17258, 2401, 14246, 2271, 1012, 2009, 2950, 11968, 8043, 2015, 2000, 12324, 4275, 1010, 1998, 13354, 7076, 2000, 2490, 3117, 23092, 1998, 9014, 2077, 11243, 20600, 2015, 2005, 28937, 1012, 2651, 1050, 17258, 2401, 2003, 2330, 1011, 14768, 6129, 11968, 8043, 2015, 1998, 13354, 7076, 1999, 23435, 5339, 2061, 2008, 1996, 2784, 4083, 2451, 2064, 7661, 4697, 1998, 7949, 2122, 6177, 2000, 2202, 5056, 1997, 3928, 23435, 5339, 20600, 2015, 2005, 2115, 18726, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + "data": [101, 2054, 2003, 23435, 5339, 1029, 102, 23435, 5339, 2003, 1037, 2152, 2836, 2784, 4083, 28937, 4132, 2008, 18058, 2659, 2397, 9407, 1998, 2152, 2083, 18780, 2005, 18726, 2107, 2004, 16755, 2545, 1010, 4613, 1998, 3746, 1013, 2678, 2006, 1050, 17258, 2401, 14246, 2271, 1012, 2009, 2950, 11968, 8043, 2015, 2000, 12324, 4275, 1010, 1998, 13354, 7076, 2000, 2490, 3117, 23092, 1998, 9014, 2077, 11243, 20600, 2015, 2005, 28937, 1012, 2651, 1050, 17258, 2401, 2003, 2330, 1011, 14768, 6129, 11968, 8043, 2015, 1998, 13354, 7076, 1999, 23435, 5339, 2061, 2008, 1996, 2784, 4083, 2451, 2064, 7661, 4697, 1998, 7949, 2122, 6177, 2000, 2202, 5056, 1997, 3928, 23435, 5339, 20600, 2015, 2005, 2115, 18726, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] }, { "name": "input_mask", "shape": [1, 128], "datatype": "INT32", "parameters": {}, - "data": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + "data": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] }, { "name": "segment_ids", "shape": [1, 128], "datatype": "INT32", "parameters": {}, - "data": [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0] + "data": [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] }], "outputs": [{ "name": "cls_squad_logits", diff --git a/deploy/gke-marketplace-app/client-sample/locustfile_bert.py b/deploy/gke-marketplace-app/client-sample/locustfile_bert.py old mode 100644 new mode 100755 index 2e2ac2f721..aae8c69f43 --- a/deploy/gke-marketplace-app/client-sample/locustfile_bert.py +++ b/deploy/gke-marketplace-app/client-sample/locustfile_bert.py @@ -1,4 +1,6 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,18 +26,18 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from locust import HttpUser, task, between -from locust import LoadTestShape import json +from locust import HttpUser, LoadTestShape, between, task + class ProfileLoad(LoadTestShape): - ''' + """ This load profile starts at 0 and steps up by step_users increments every tick, up to target_users. After reaching target_user level, load will stay at target_user level until time_limit is reached. - ''' + """ target_users = 1000 step_users = 50 # ramp users each step @@ -63,8 +65,7 @@ def bert(self): response = self.client.post(self.url1, data=json.dumps(self.data)) def on_start(self): - with open('bert_request.json') as f: + with open("bert_request.json") as f: self.data = json.load(f) - self.url1 = '{}/v2/models/{}/infer'.format(self.environment.host, - 'bert') + self.url1 = "{}/v2/models/{}/infer".format(self.environment.host, "bert") diff --git a/deploy/gke-marketplace-app/client-sample/perf_analyzer_grpc.sh b/deploy/gke-marketplace-app/client-sample/perf_analyzer_grpc.sh old mode 100644 new mode 100755 diff --git a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh old mode 100644 new mode 100755 index 6cf5319b8a..64292409c8 --- a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh +++ b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh @@ -1,4 +1,5 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/bin/bash +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/application.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/application.yaml index 2f08cf07d7..5658aea801 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/application.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/application.yaml @@ -41,14 +41,14 @@ spec: type: Triton version: "{{ .Values.publishedVersion }}" description: |- - Triton Inference Server provides a cloud and edge inferencing solution - optimized for both CPUs and GPUs. Triton supports an HTTP/REST and GRPC - protocol that allows remote clients to request inferencing for any model + Triton Inference Server provides a cloud and edge inferencing solution + optimized for both CPUs and GPUs. Triton supports an HTTP/REST and GRPC + protocol that allows remote clients to request inferencing for any model being managed by the server. notes: |- - Send request to Triton server by using IP address "ingress-triton", + Send request to Triton server by using IP address "ingress-triton", send to IP:80/v2/models/{}/infer Links: diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/deployment.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/deployment.yaml index 6a0b77b4ea..8bf21d9684 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/deployment.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/deployment.yaml @@ -33,7 +33,7 @@ metadata: app: {{ template "triton-inference-server.name" . }} chart: {{ template "triton-inference-server.chart" . }} release: {{ .Release.Name }} - heritage: {{ .Release.Service }} + heritage: {{ .Release.Service }} spec: replicas: {{ .Values.initReplicaCount }} selector: diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/service.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/service.yaml index b919c55f1f..5562fa76b5 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/service.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/service.yaml @@ -35,11 +35,11 @@ metadata: app: {{ template "triton-inference-server.name" . }} chart: {{ template "triton-inference-server.chart" . 
}} release: {{ .Release.Name }} - heritage: {{ .Release.Service }} + heritage: {{ .Release.Service }} spec: type: {{ .Values.service.type }} ports: - - port: 8000 + - port: 8000 targetPort: http name: http-inference-server - port: 8001 diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml index 2413b17a82..6a7dc39772 100644 --- a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml +++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml @@ -49,7 +49,7 @@ image: allowGPUMetrics: True service: - type: NodePort + type: NodePort deployment: livenessProbe: diff --git a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml index 08086cc2d7..1a51f17a8f 100644 --- a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml @@ -66,26 +66,26 @@ properties: type: string x-google-marketplace: type: NAMESPACE - initReplicaCount: + initReplicaCount: title: Initial number of Triton pod instances to deploy. type: integer default: 1 - minReplicaCount: + minReplicaCount: title: Minimum number of Triton pod instances in the deployment for autoscaling. type: integer default: 1 - maxReplicaCount: + maxReplicaCount: title: Maximum number of Triton pod instances in the deployment for autoscaling. type: integer default: 3 - tritonProtocol: + tritonProtocol: title: Request protocol to send data to Triton, choose from gRPC and HTTP. type: string default: HTTP - HPATargetAverageValue: - title: HPA autoscaling target, GKE currently support Duty Cycle which is GPU utilization, when target is reached, Triton Server service will create another pod instance. We ask user to analyze model inference to associate appropriate GPU metric target based on latency requirement. We also recommend to leave some room to mitigate transient load effect. For user interested in customizing autoscaling metrics, we recommends GPU Power (Percentage of Power), Queue time or SLA measurements such as latency. + HPATargetAverageValue: + title: HPA autoscaling target, GKE currently support Duty Cycle which is GPU utilization, when target is reached, Triton Server service will create another pod instance. We ask user to analyze model inference to associate appropriate GPU metric target based on latency requirement. We also recommend to leave some room to mitigate transient load effect. For user interested in customizing autoscaling metrics, we recommends GPU Power (Percentage of Power), Queue time or SLA measurements such as latency. type: integer - default: 85 + default: 85 modelRepositoryPath: type: string title: Bucket where models are stored. Please make sure the user/service account to create the GKE app has permission to this GCS bucket. Read Triton documentation on configs and formatting details, supporting TensorRT, TensorFlow, Pytorch, Onnx ... etc. @@ -97,7 +97,7 @@ properties: image.logVerboseLevel: type: integer title: Set verbose logging level. Zero (0) disables verbose logging and values >= 1 enable verbose logging, this is helpful when user unsure if the model is compatible with Triton or for general debug. - default: 0 + default: 0 image.strictModelConfig: type: boolean title: Leave this unchecked by default. 
When strictModelConfig is not checked(False), Triton will try to infer the config file from model file, when checked(True), user need to provide config.pbtxt in model repository. @@ -105,14 +105,14 @@ properties: image.allowGPUMetrics: type: boolean title: Select by default. When use A100 MIG, unselect to disable GPU Memory metrics reported by Triton, as current GPU metrics not support on A100 MIG. - default: True + default: True istioEnabled: type: boolean x-google-marketplace: type: ISTIO_ENABLED default: True - + required: - name - namespace diff --git a/deploy/gke-marketplace-app/server-deployer/schema.yaml b/deploy/gke-marketplace-app/server-deployer/schema.yaml index 4d9c95c2da..4da79a389a 100644 --- a/deploy/gke-marketplace-app/server-deployer/schema.yaml +++ b/deploy/gke-marketplace-app/server-deployer/schema.yaml @@ -66,26 +66,26 @@ properties: type: string x-google-marketplace: type: NAMESPACE - initReplicaCount: + initReplicaCount: title: Initial number of Triton pod instances to deploy. type: integer default: 1 - minReplicaCount: + minReplicaCount: title: Minimum number of Triton pod instances in the deployment for autoscaling. type: integer default: 1 - maxReplicaCount: + maxReplicaCount: title: Maximum number of Triton pod instances in the deployment for autoscaling. type: integer default: 3 - tritonProtocol: + tritonProtocol: title: Request protocol to send data to Triton, choose from gRPC and HTTP. type: string default: HTTP - HPATargetAverageValue: - title: HPA autoscaling target, GKE currently support Duty Cycle which is GPU utilization, when target is reached, Triton Server service will create another pod instance. We ask user to analyze model inference to associate appropriate GPU metric target based on latency requirement. We also recommend to leave some room to mitigate transient load effect. For user interested in customizing autoscaling metrics, we recommends GPU Power (Percentage of Power), Queue time or SLA measurements such as latency. + HPATargetAverageValue: + title: HPA autoscaling target, GKE currently support Duty Cycle which is GPU utilization, when target is reached, Triton Server service will create another pod instance. We ask user to analyze model inference to associate appropriate GPU metric target based on latency requirement. We also recommend to leave some room to mitigate transient load effect. For user interested in customizing autoscaling metrics, we recommends GPU Power (Percentage of Power), Queue time or SLA measurements such as latency. type: integer - default: 85 + default: 85 modelRepositoryPath: type: string title: Bucket where models are stored. Please make sure the user/service account to create the GKE app has permission to this GCS bucket. Read Triton documentation on configs and formatting details, supporting TensorRT, TensorFlow, Pytorch, Onnx ... etc. @@ -97,7 +97,7 @@ properties: image.logVerboseLevel: type: integer title: Set verbose logging level. Zero (0) disables verbose logging and values >= 1 enable verbose logging, this is helpful when user unsure if the model is compatible with Triton or for general debug. - default: 0 + default: 0 image.strictModelConfig: type: boolean title: Leave this unchecked by default. When strictModelConfig is not checked(False), Triton will try to infer the config file from model file, when checked(True), user need to provide config.pbtxt in model repository. @@ -105,14 +105,14 @@ properties: image.allowGPUMetrics: type: boolean title: Select by default. 
When use A100 MIG, unselect to disable GPU Memory metrics reported by Triton, as current GPU metrics not support on A100 MIG. - default: True + default: True istioEnabled: type: boolean x-google-marketplace: type: ISTIO_ENABLED default: True - + required: - name - namespace diff --git a/deploy/gke-marketplace-app/trt-engine/README.md b/deploy/gke-marketplace-app/trt-engine/README.md index b4bade1e6b..2a67879f51 100644 --- a/deploy/gke-marketplace-app/trt-engine/README.md +++ b/deploy/gke-marketplace-app/trt-engine/README.md @@ -33,13 +33,13 @@ ``` docker run --gpus all -it --network host \ --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \ - -v ~:/scripts nvcr.io/nvidia/tensorrt:23.06-py3 + -v ~:/scripts nvcr.io/nvidia/tensorrt:23.06-py3 -pip install onnx six torch tf2onnx tensorflow +pip install onnx six torch tf2onnx tensorflow git clone -b main https://github.com/NVIDIA/TensorRT.git cd TensorRT -git submodule update --init --recursive +git submodule update --init --recursive export TRT_OSSPATH=/workspace/TensorRT export TRT_LIBPATH=/lib/x86_64-linux-gnu @@ -49,15 +49,15 @@ pushd /usr/local/bin && wget https://ngc.nvidia.com/downloads/ngccli_cat_linux.z popd cd /workspace/TensorRT/demo/BERT -bash ./scripts/download_squad.sh +bash ./scripts/download_squad.sh bash ./scripts/download_model.sh large 128 # bash ./scripts/download_model.sh large 384 mkdir -p engines -python3 builder.py -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt -o engines/bert_large_int8_bs1_s128.engine -b 1 -s 128 -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt --int8 --fp16 --strict --calib-num 1 -iln -imh +python3 builder.py -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt -o engines/bert_large_int8_bs1_s128.engine -b 1 -s 128 -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt --int8 --fp16 --strict --calib-num 1 -iln -imh gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/23_02/bert/1/model.plan ``` -For each Triton upgrade, container version used to genrate the model, and the model path in GCS `gs://triton_sample_models/23_02/` should be updated accordingly with the correct version. +For each Triton upgrade, container version used to generate the model, and the model path in GCS `gs://triton_sample_models/23_02/` should be updated accordingly with the correct version. diff --git a/deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/config.pbtxt b/deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/config.pbtxt old mode 100755 new mode 100644 diff --git a/deploy/mlflow-triton-plugin/mlflow_triton/__init__.py b/deploy/mlflow-triton-plugin/mlflow_triton/__init__.py old mode 100644 new mode 100755 index 6eff4167d0..0b73b537d4 --- a/deploy/mlflow-triton-plugin/mlflow_triton/__init__.py +++ b/deploy/mlflow-triton-plugin/mlflow_triton/__init__.py @@ -1,4 +1,6 @@ -# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -22,4 +24,4 @@ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/deploy/mlflow-triton-plugin/mlflow_triton/config.py b/deploy/mlflow-triton-plugin/mlflow_triton/config.py old mode 100644 new mode 100755 index 484b026227..0a381fd407 --- a/deploy/mlflow-triton-plugin/mlflow_triton/config.py +++ b/deploy/mlflow-triton-plugin/mlflow_triton/config.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,38 +28,40 @@ import os import re from collections import namedtuple + from mlflow.exceptions import MlflowException class Config(dict): - def __init__(self): super().__init__() - self['triton_url'] = os.environ.get('TRITON_URL') - self['triton_model_repo'] = os.environ.get('TRITON_MODEL_REPO') + self["triton_url"] = os.environ.get("TRITON_URL") + self["triton_model_repo"] = os.environ.get("TRITON_MODEL_REPO") - if self['triton_model_repo'].startswith('s3://'): + if self["triton_model_repo"].startswith("s3://"): self.s3_regex = re.compile( - 's3://(http://|https://|)([0-9a-zA-Z\\-.]+):([0-9]+)/' - '([0-9a-z.\\-]+)(((/[0-9a-zA-Z.\\-_]+)*)?)') + "s3://(http://|https://|)([0-9a-zA-Z\\-.]+):([0-9]+)/" + "([0-9a-z.\\-]+)(((/[0-9a-zA-Z.\\-_]+)*)?)" + ) - uri = self.parse_path(self['triton_model_repo']) + uri = self.parse_path(self["triton_model_repo"]) if uri.protocol == "https://": protocol = "https://" else: protocol = "http://" endpoint_url = None if uri.host_name != "" and uri.host_port != "": - endpoint_url = '{}{}:{}'.format(protocol, uri.host_name, - uri.host_port) + endpoint_url = "{}{}:{}".format(protocol, uri.host_name, uri.host_port) import boto3 + # boto3 handles AWS credentials - self['s3'] = boto3.client('s3', endpoint_url=endpoint_url) - self['s3_bucket'] = uri.bucket - self['s3_prefix'] = uri.prefix - self['triton_model_repo'] = 's3://{}'.format( - os.path.join(uri.bucket, uri.prefix)) + self["s3"] = boto3.client("s3", endpoint_url=endpoint_url) + self["s3_bucket"] = uri.bucket + self["s3_prefix"] = uri.prefix + self["triton_model_repo"] = "s3://{}".format( + os.path.join(uri.bucket, uri.prefix) + ) def parse_path(self, path): # Cleanup extra slashes @@ -66,10 +70,11 @@ def parse_path(self, path): # Get the bucket name and the object path. 
Return error if path is malformed match = self.s3_regex.fullmatch(clean_path) S3URI = namedtuple( - "S3URI", ["protocol", "host_name", "host_port", "bucket", "prefix"]) + "S3URI", ["protocol", "host_name", "host_port", "bucket", "prefix"] + ) if match: uri = S3URI(*match.group(1, 2, 3, 4, 5)) - if uri.prefix and uri.prefix[0] == '/': + if uri.prefix and uri.prefix[0] == "/": uri = uri._replace(prefix=uri.prefix[1:]) else: bucket_start = clean_path.find("s3://") + len("s3://") @@ -78,7 +83,7 @@ def parse_path(self, path): # If there isn't a slash, the address has only the bucket if bucket_end > bucket_start: bucket = clean_path[bucket_start:bucket_end] - prefix = clean_path[bucket_end + 1:] + prefix = clean_path[bucket_end + 1 :] else: bucket = clean_path[bucket_start:] prefix = "" @@ -94,8 +99,8 @@ def clean_path(self, s3_path): start = s3_path.find("s3://") path = "" if start != -1: - path = s3_path[start + len("s3://"):] - clean_path = ("s3://") + path = s3_path[start + len("s3://") :] + clean_path = "s3://" else: path = s3_path clean_path = "" @@ -103,29 +108,29 @@ def clean_path(self, s3_path): # Must handle paths with https:// or http:// prefix https_start = path.find("https://") if https_start != -1: - path = path[https_start + len("https://"):] + path = path[https_start + len("https://") :] clean_path += "https://" else: http_start = path.find("http://") if http_start != -1: - path = path[http_start + len("http://"):] + path = path[http_start + len("http://") :] clean_path += "http://" # Remove trailing slashes - rtrim_length = len(path.rstrip('/')) + rtrim_length = len(path.rstrip("/")) if rtrim_length == 0: raise MlflowException("Invalid bucket name: '" + path + "'") # Remove leading slashes - ltrim_length = len(path) - len(path.lstrip('/')) + ltrim_length = len(path) - len(path.lstrip("/")) if ltrim_length == len(path): raise MlflowException("Invalid bucket name: '" + path + "'") # Remove extra internal slashes - true_path = path[ltrim_length:rtrim_length + 1] + true_path = path[ltrim_length : rtrim_length + 1] previous_slash = False for i in range(len(true_path)): - if true_path[i] == '/': + if true_path[i] == "/": if not previous_slash: clean_path += true_path[i] previous_slash = True diff --git a/deploy/mlflow-triton-plugin/mlflow_triton/deployments.py b/deploy/mlflow-triton-plugin/mlflow_triton/deployments.py old mode 100644 new mode 100755 index 0a22ba6c88..fb8e72c286 --- a/deploy/mlflow-triton-plugin/mlflow_triton/deployments.py +++ b/deploy/mlflow-triton-plugin/mlflow_triton/deployments.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -23,25 +25,27 @@ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
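Not part of the patch: a small standalone demonstration of what the S3 regular expression reformatted in `mlflow_triton/config.py` above is meant to extract from a `TRITON_MODEL_REPO` value. The endpoint, port, bucket, and prefix below are made-up examples; in the plugin itself, `clean_path()` first normalizes extra slashes and the leading `/` is later stripped from the prefix.

```python
# Demonstration only: the same pattern and namedtuple used by
# mlflow_triton.config.Config.parse_path(), applied to a made-up URI.
import re
from collections import namedtuple

S3_REGEX = re.compile(
    "s3://(http://|https://|)([0-9a-zA-Z\\-.]+):([0-9]+)/"
    "([0-9a-z.\\-]+)(((/[0-9a-zA-Z.\\-_]+)*)?)"
)
S3URI = namedtuple("S3URI", ["protocol", "host_name", "host_port", "bucket", "prefix"])

match = S3_REGEX.fullmatch("s3://https://minio.example.com:9000/triton-models/bert")
uri = S3URI(*match.group(1, 2, 3, 4, 5))
print(uri)
# S3URI(protocol='https://', host_name='minio.example.com',
#       host_port='9000', bucket='triton-models', prefix='/bert')
```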
-import os import ast -import shutil +import glob +import json import logging +import os +import shutil from pathlib import Path -from mlflow_triton.config import Config +import numpy as np +import pandas as pd import tritonclient.http as tritonhttpclient -from tritonclient.utils import InferenceServerException, np_to_triton_dtype, triton_to_np_dtype - from mlflow.deployments import BaseDeploymentClient -from mlflow.tracking.artifact_utils import _download_artifact_from_uri from mlflow.exceptions import MlflowException from mlflow.models import Model - -import glob -import json -import pandas as pd -import numpy as np +from mlflow.tracking.artifact_utils import _download_artifact_from_uri +from mlflow_triton.config import Config +from tritonclient.utils import ( + InferenceServerException, + np_to_triton_dtype, + triton_to_np_dtype, +) logger = logging.getLogger(__name__) @@ -49,7 +53,6 @@ class TritonPlugin(BaseDeploymentClient): - def __init__(self, uri): """ Initializes the deployment plugin, sets the triton model repo @@ -58,16 +61,17 @@ def __init__(self, uri): self.server_config = Config() triton_url, self.triton_model_repo = self._get_triton_server_config() # need to add other flavors - self.supported_flavors = ['triton', 'onnx'] + self.supported_flavors = ["triton", "onnx"] # URL cleaning for constructing Triton client ssl = False if triton_url.startswith("http://"): - triton_url = triton_url[len("http://"):] + triton_url = triton_url[len("http://") :] elif triton_url.startswith("https://"): - triton_url = triton_url[len("https://"):] + triton_url = triton_url[len("https://") :] ssl = True self.triton_client = tritonhttpclient.InferenceServerClient( - url=triton_url, ssl=ssl) + url=triton_url, ssl=ssl + ) def _get_triton_server_config(self): triton_url = "localhost:8000" @@ -76,8 +80,7 @@ def _get_triton_server_config(self): logger.info("Triton url = {}".format(triton_url)) if not self.server_config["triton_model_repo"]: - raise Exception( - "Check that environment variable TRITON_MODEL_REPO is set") + raise Exception("Check that environment variable TRITON_MODEL_REPO is set") triton_model_repo = self.server_config["triton_model_repo"] logger.info("Triton model repo = {}".format(triton_model_repo)) @@ -100,7 +103,8 @@ def create_deployment(self, name, model_uri, flavor=None, config=None): if self._model_exists(name): raise Exception( "Unable to create deployment for name %s because it already exists." - % (name)) + % (name) + ) # Get the path of the artifact path = Path(_download_artifact_from_uri(model_uri)) @@ -126,7 +130,8 @@ def delete_deployment(self, name): if not self._model_exists(name): raise Exception( "Unable to delete deployment for name %s because it does not exist." - % (name)) + % (name) + ) try: self.triton_client.unload_model(name) @@ -156,7 +161,8 @@ def update_deployment(self, name, model_uri=None, flavor=None, config=None): if not self._model_exists(name): raise Exception( "Unable to update deployment for name %s because it does not exist." 
- % (name)) + % (name) + ) self.get_deployment(name) @@ -183,25 +189,32 @@ def list_deployments(self): resp = self.triton_client.get_model_repository_index() actives = [] for d in resp: - if 'state' in d and d['state'] == 'READY': - mlflow_meta_path = os.path.join(self.triton_model_repo, - d['name'], - _MLFLOW_META_FILENAME) - if 's3' in self.server_config: + if "state" in d and d["state"] == "READY": + mlflow_meta_path = os.path.join( + self.triton_model_repo, d["name"], _MLFLOW_META_FILENAME + ) + if "s3" in self.server_config: meta_dict = ast.literal_eval( - self.server_config['s3'].get_object( - Bucket=self.server_config['s3_bucket'], - Key=os.path.join(self.server_config['s3_prefix'], - d['name'], _MLFLOW_META_FILENAME), - )['Body'].read().decode('utf-8')) + self.server_config["s3"] + .get_object( + Bucket=self.server_config["s3_bucket"], + Key=os.path.join( + self.server_config["s3_prefix"], + d["name"], + _MLFLOW_META_FILENAME, + ), + )["Body"] + .read() + .decode("utf-8") + ) elif os.path.isfile(mlflow_meta_path): - meta_dict = self._get_mlflow_meta_dict(d['name']) + meta_dict = self._get_mlflow_meta_dict(d["name"]) else: continue - d['triton_model_path'] = meta_dict['triton_model_path'] - d['mlflow_model_uri'] = meta_dict['mlflow_model_uri'] - d['flavor'] = meta_dict['flavor'] + d["triton_model_path"] = meta_dict["triton_model_path"] + d["mlflow_model_uri"] = meta_dict["mlflow_model_uri"] + d["flavor"] = meta_dict["flavor"] actives.append(d) return actives @@ -217,9 +230,9 @@ def get_deployment(self, name): """ deployments = self.list_deployments() for d in deployments: - if d['name'] == name: + if d["name"] == name: return d - raise ValueError(f'Unable to get deployment with name {name}') + raise ValueError(f"Unable to get deployment with name {name}") def predict(self, deployment_name, df): single_input_np = None @@ -231,16 +244,13 @@ def predict(self, deployment_name, df): raise MlflowException("Unnamed input is not currently supported") else: if isinstance(df, pd.DataFrame): - model_metadata = self.triton_client.get_model_metadata( - deployment_name) + model_metadata = self.triton_client.get_model_metadata(deployment_name) input_dtype = {} for input in model_metadata["inputs"]: - input_dtype[input["name"]] = triton_to_np_dtype( - input["datatype"]) + input_dtype[input["name"]] = triton_to_np_dtype(input["datatype"]) # Sanity check if len(df.columns) != 1: - raise MlflowException( - "Expect Pandas DataFrame has only 1 column") + raise MlflowException("Expect Pandas DataFrame has only 1 column") col = df.columns[0] for row in df.index: val = df[col][row] @@ -249,21 +259,24 @@ def predict(self, deployment_name, df): val = np.array(val, dtype=input_dtype[row]) inputs.append( tritonhttpclient.InferInput( - row, val.shape, np_to_triton_dtype(val.dtype))) + row, val.shape, np_to_triton_dtype(val.dtype) + ) + ) inputs[-1].set_data_from_numpy(val) else: for key, val in df.items(): inputs.append( tritonhttpclient.InferInput( - key, val.shape, np_to_triton_dtype(val.dtype))) + key, val.shape, np_to_triton_dtype(val.dtype) + ) + ) inputs[-1].set_data_from_numpy(val) try: - resp = self.triton_client.infer(model_name=deployment_name, - inputs=inputs) + resp = self.triton_client.infer(model_name=deployment_name, inputs=inputs) res = {} - for output in resp.get_response()['outputs']: - res[output['name']] = resp.as_numpy(output['name']) + for output in resp.get_response()["outputs"]: + res[output["name"]] = resp.as_numpy(output["name"]) return pd.DataFrame.from_dict({"outputs": res}) except 
InferenceServerException as ex: raise MlflowException(str(ex)) @@ -271,99 +284,105 @@ def predict(self, deployment_name, df): def _generate_mlflow_meta_file(self, name, flavor, model_uri): triton_deployment_dir = os.path.join(self.triton_model_repo, name) meta_dict = { - 'name': name, - 'triton_model_path': triton_deployment_dir, - 'mlflow_model_uri': model_uri, - 'flavor': flavor + "name": name, + "triton_model_path": triton_deployment_dir, + "mlflow_model_uri": model_uri, + "flavor": flavor, } - if 's3' in self.server_config: - self.server_config['s3'].put_object( - Body=json.dumps(meta_dict, indent=4).encode('utf-8'), + if "s3" in self.server_config: + self.server_config["s3"].put_object( + Body=json.dumps(meta_dict, indent=4).encode("utf-8"), Bucket=self.server_config["s3_bucket"], - Key=os.path.join(self.server_config['s3_prefix'], name, - _MLFLOW_META_FILENAME), + Key=os.path.join( + self.server_config["s3_prefix"], name, _MLFLOW_META_FILENAME + ), ) else: with open( - os.path.join(triton_deployment_dir, _MLFLOW_META_FILENAME), - "w") as outfile: + os.path.join(triton_deployment_dir, _MLFLOW_META_FILENAME), "w" + ) as outfile: json.dump(meta_dict, outfile, indent=4) print("Saved", _MLFLOW_META_FILENAME, "to", triton_deployment_dir) def _get_mlflow_meta_dict(self, name): - mlflow_meta_path = os.path.join(self.triton_model_repo, name, - _MLFLOW_META_FILENAME) + mlflow_meta_path = os.path.join( + self.triton_model_repo, name, _MLFLOW_META_FILENAME + ) - if 's3' in self.server_config: + if "s3" in self.server_config: mlflow_meta_dict = ast.literal_eval( - self.server_config['s3'].get_object( - Bucket=self.server_config['s3_bucket'], - Key=os.path.join(self.server_config['s3_prefix'], name, - _MLFLOW_META_FILENAME), - )['Body'].read().decode('utf-8')) + self.server_config["s3"] + .get_object( + Bucket=self.server_config["s3_bucket"], + Key=os.path.join( + self.server_config["s3_prefix"], name, _MLFLOW_META_FILENAME + ), + )["Body"] + .read() + .decode("utf-8") + ) else: - with open(mlflow_meta_path, 'r') as metafile: + with open(mlflow_meta_path, "r") as metafile: mlflow_meta_dict = json.load(metafile) return mlflow_meta_dict def _get_copy_paths(self, artifact_path, name, flavor): copy_paths = {} - copy_paths['model_path'] = {} + copy_paths["model_path"] = {} triton_deployment_dir = os.path.join(self.triton_model_repo, name) if flavor == "triton": # When flavor is 'triton', the model is assumed to be preconfigured # with proper model versions and version strategy, which may differ from # the versioning in MLFlow for file in artifact_path.iterdir(): - if file.name not in ['MLmodel', 'conda.yaml']: - copy_paths['model_path']['from'] = file - copy_paths['model_path']['to'] = triton_deployment_dir + if file.name not in ["MLmodel", "conda.yaml"]: + copy_paths["model_path"]["from"] = file + copy_paths["model_path"]["to"] = triton_deployment_dir elif flavor == "onnx": # Look for model file via MLModel metadata or iterating dir model_file = None config_file = None for file in artifact_path.iterdir(): - if file.name == 'MLmodel': + if file.name == "MLmodel": mlmodel = Model.load(file) onnx_meta_data = mlmodel.flavors.get("onnx", None) if onnx_meta_data is not None: - model_file = onnx_meta_data.get('data', None) - elif file.name == 'config.pbtxt': + model_file = onnx_meta_data.get("data", None) + elif file.name == "config.pbtxt": config_file = file.name - copy_paths['config_path'] = {} - elif file.suffix == '.txt' and file.stem != 'requirements': - copy_paths[file.stem] = { - 'from': file, - 'to': 
triton_deployment_dir - } + copy_paths["config_path"] = {} + elif file.suffix == ".txt" and file.stem != "requirements": + copy_paths[file.stem] = {"from": file, "to": triton_deployment_dir} if model_file is None: for file in artifact_path.iterdir(): - if file.suffix == '.onnx': + if file.suffix == ".onnx": model_file = file.name break - copy_paths['model_path']['from'] = os.path.join( - artifact_path, model_file) - copy_paths['model_path']['to'] = os.path.join( - triton_deployment_dir, "1") + copy_paths["model_path"]["from"] = os.path.join(artifact_path, model_file) + copy_paths["model_path"]["to"] = os.path.join(triton_deployment_dir, "1") if config_file is not None: - copy_paths['config_path']['from'] = os.path.join( - artifact_path, config_file) - copy_paths['config_path']['to'] = triton_deployment_dir + copy_paths["config_path"]["from"] = os.path.join( + artifact_path, config_file + ) + copy_paths["config_path"]["to"] = triton_deployment_dir else: # Make sure the directory has been created for config.pbtxt os.makedirs(triton_deployment_dir, exist_ok=True) # Provide a minimum config file so Triton knows what backend # should be performing the auto-completion - config = ''' + config = """ backend: "onnxruntime" default_model_filename: "{}" -'''.format(model_file) - with open(os.path.join(triton_deployment_dir, "config.pbtxt"), - "w") as cfile: +""".format( + model_file + ) + with open( + os.path.join(triton_deployment_dir, "config.pbtxt"), "w" + ) as cfile: cfile.write(config) return copy_paths @@ -379,52 +398,51 @@ def _walk(self, path): elif os.path.isdir(path): return list(os.walk(path)) else: - raise Exception( - f'path: {path} is not a valid path to a file or dir.') + raise Exception(f"path: {path} is not a valid path to a file or dir.") def _copy_files_to_triton_repo(self, artifact_path, name, flavor): copy_paths = self._get_copy_paths(artifact_path, name, flavor) for key in copy_paths: - if 's3' in self.server_config: + if "s3" in self.server_config: # copy model dir to s3 recursively - for root, dirs, files in self._walk(copy_paths[key]['from']): + for root, dirs, files in self._walk(copy_paths[key]["from"]): for filename in files: local_path = os.path.join(root, filename) if flavor == "onnx": s3_path = os.path.join( - self.server_config['s3_prefix'], - copy_paths[key]['to'].replace( - self.server_config['triton_model_repo'], - '').strip('/'), + self.server_config["s3_prefix"], + copy_paths[key]["to"] + .replace(self.server_config["triton_model_repo"], "") + .strip("/"), filename, ) elif flavor == "triton": rel_path = os.path.relpath( local_path, - copy_paths[key]['from'], + copy_paths[key]["from"], ) s3_path = os.path.join( - self.server_config['s3_prefix'], name, rel_path) + self.server_config["s3_prefix"], name, rel_path + ) - self.server_config['s3'].upload_file( + self.server_config["s3"].upload_file( local_path, - self.server_config['s3_bucket'], + self.server_config["s3_bucket"], s3_path, ) else: - if os.path.isdir(copy_paths[key]['from']): - if os.path.isdir(copy_paths[key]['to']): - shutil.rmtree(copy_paths[key]['to']) - shutil.copytree(copy_paths[key]['from'], - copy_paths[key]['to']) + if os.path.isdir(copy_paths[key]["from"]): + if os.path.isdir(copy_paths[key]["to"]): + shutil.rmtree(copy_paths[key]["to"]) + shutil.copytree(copy_paths[key]["from"], copy_paths[key]["to"]) else: - if not os.path.isdir(copy_paths[key]['to']): - os.makedirs(copy_paths[key]['to']) - shutil.copy(copy_paths[key]['from'], copy_paths[key]['to']) + if not os.path.isdir(copy_paths[key]["to"]): 
+ os.makedirs(copy_paths[key]["to"]) + shutil.copy(copy_paths[key]["from"], copy_paths[key]["to"]) - if 's3' not in self.server_config: + if "s3" not in self.server_config: triton_deployment_dir = os.path.join(self.triton_model_repo, name) version_folder = os.path.join(triton_deployment_dir, "1") os.makedirs(version_folder, exist_ok=True) @@ -432,40 +450,41 @@ def _copy_files_to_triton_repo(self, artifact_path, name, flavor): return copy_paths def _delete_mlflow_meta(self, filepath): - if 's3' in self.server_config: - self.server_config['s3'].delete_object( - Bucket=self.server_config['s3_bucket'], + if "s3" in self.server_config: + self.server_config["s3"].delete_object( + Bucket=self.server_config["s3_bucket"], Key=filepath, ) elif os.path.isfile(filepath): os.remove(filepath) def _delete_deployment_files(self, name): - triton_deployment_dir = os.path.join(self.triton_model_repo, name) - if 's3' in self.server_config: - objs = self.server_config['s3'].list_objects( - Bucket=self.server_config['s3_bucket'], - Prefix=os.path.join(self.server_config['s3_prefix'], name), + if "s3" in self.server_config: + objs = self.server_config["s3"].list_objects( + Bucket=self.server_config["s3_bucket"], + Prefix=os.path.join(self.server_config["s3_prefix"], name), ) - for key in objs['Contents']: - key = key['Key'] + for key in objs["Contents"]: + key = key["Key"] try: - self.server_config['s3'].delete_object( - Bucket=self.server_config['s3_bucket'], + self.server_config["s3"].delete_object( + Bucket=self.server_config["s3_bucket"], Key=key, ) except Exception as e: - raise Exception(f'Could not delete {key}: {e}') + raise Exception(f"Could not delete {key}: {e}") else: # Check if the deployment directory exists if not os.path.isdir(triton_deployment_dir): raise Exception( - "A deployment does not exist for this model in directory {} for model name {}" - .format(triton_deployment_dir, name)) + "A deployment does not exist for this model in directory {} for model name {}".format( + triton_deployment_dir, name + ) + ) model_file = glob.glob("{}/model*".format(triton_deployment_dir)) for file in model_file: @@ -474,28 +493,30 @@ def _delete_deployment_files(self, name): print("Model directory removed: {}".format(file)) # Delete mlflow meta file - mlflow_meta_path = os.path.join(self.triton_model_repo, name, - _MLFLOW_META_FILENAME) + mlflow_meta_path = os.path.join( + self.triton_model_repo, name, _MLFLOW_META_FILENAME + ) self._delete_mlflow_meta(mlflow_meta_path) def _validate_config_args(self, config): - if not config['version']: + if not config["version"]: raise Exception("Please provide the version as a config argument") - if not config['version'].isdigit(): + if not config["version"].isdigit(): raise ValueError( "Please make sure version is a number. version = {}".format( - config['version'])) + config["version"] + ) + ) def _validate_flavor(self, flavor): if flavor not in self.supported_flavors: - raise Exception( - "{} model flavor not supported by Triton".format(flavor)) + raise Exception("{} model flavor not supported by Triton".format(flavor)) def _model_exists(self, name): deploys = self.list_deployments() exists = False for d in deploys: - if d['name'] == name: + if d["name"] == name: exists = True return exists @@ -508,7 +529,7 @@ def target_help(): help_msg = ( "\nmlflow-triton plugin integrates the Triton Inference Server to the mlflow deployment pipeline. 
\n\n " "Example command: \n\n" - " mlflow deployments create -t triton --name mymodel --flavor onnx -m models:/mymodel/Production -C \"version=1\" \n\n" + ' mlflow deployments create -t triton --name mymodel --flavor onnx -m models:/mymodel/Production -C "version=1" \n\n' "The environment variable TRITON_MODEL_REPO must be set to the location that the Triton" "Inference Server is storing its models\n\n" "export TRITON_MODEL_REPO = /path/to/triton/model/repo\n\n" diff --git a/deploy/mlflow-triton-plugin/scripts/publish_model_to_mlflow.py b/deploy/mlflow-triton-plugin/scripts/publish_model_to_mlflow.py old mode 100644 new mode 100755 index 5343e0da63..779d393020 --- a/deploy/mlflow-triton-plugin/scripts/publish_model_to_mlflow.py +++ b/deploy/mlflow-triton-plugin/scripts/publish_model_to_mlflow.py @@ -1,4 +1,6 @@ -# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -23,10 +25,10 @@ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import mlflow import os -import click +import click +import mlflow import triton_flavor @@ -35,18 +37,20 @@ "--model_name", help="Model name", ) -@click.option("--model_directory", - type=click.Path(exists=True, readable=True), - required=True, - help="Model filepath") +@click.option( + "--model_directory", + type=click.Path(exists=True, readable=True), + required=True, + help="Model filepath", +) @click.option( "--flavor", - type=click.Choice(['triton'], case_sensitive=True), + type=click.Choice(["triton"], case_sensitive=True), required=True, help="Model flavor", ) def publish_to_mlflow(model_name, model_directory, flavor): - mlflow_tracking_uri = os.environ['MLFLOW_TRACKING_URI'] + mlflow_tracking_uri = os.environ["MLFLOW_TRACKING_URI"] artifact_path = "triton" mlflow.set_tracking_uri(uri=mlflow_tracking_uri) diff --git a/deploy/mlflow-triton-plugin/scripts/triton_flavor.py b/deploy/mlflow-triton-plugin/scripts/triton_flavor.py old mode 100644 new mode 100755 index eaafdea7c7..7b0f61630d --- a/deploy/mlflow-triton-plugin/scripts/triton_flavor.py +++ b/deploy/mlflow-triton-plugin/scripts/triton_flavor.py @@ -1,4 +1,6 @@ -# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,7 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ The ``triton`` module provides APIs for logging and loading Triton-recognized -models in the MLflow Model format. This module exports MLflow Models with the following +models in the MLflow Model format. 
This module exports MLflow Models with the following flavors: Triton format @@ -36,12 +38,12 @@ import shutil import sys +from mlflow.exceptions import MlflowException from mlflow.models import Model from mlflow.models.model import MLMODEL_FILE_NAME -from mlflow.exceptions import MlflowException from mlflow.protos.databricks_pb2 import RESOURCE_ALREADY_EXISTS -from mlflow.utils.annotations import experimental from mlflow.tracking._model_registry import DEFAULT_AWAIT_MAX_SLEEP_SECONDS +from mlflow.utils.annotations import experimental FLAVOR_NAME = "triton" @@ -63,8 +65,10 @@ def save_model( path = os.path.abspath(path) if os.path.exists(path): - raise MlflowException(message="Path '{}' already exists".format(path), - error_code=RESOURCE_ALREADY_EXISTS) + raise MlflowException( + message="Path '{}' already exists".format(path), + error_code=RESOURCE_ALREADY_EXISTS, + ) os.makedirs(path) triton_model_path = os.path.normpath(triton_model_path) model_data_subpath = os.path.basename(triton_model_path) diff --git a/deploy/mlflow-triton-plugin/setup.py b/deploy/mlflow-triton-plugin/setup.py old mode 100644 new mode 100755 index 6e5c2baa53..65b8e0df1e --- a/deploy/mlflow-triton-plugin/setup.py +++ b/deploy/mlflow-triton-plugin/setup.py @@ -1,4 +1,6 @@ -# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -23,7 +25,7 @@ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from setuptools import setup, find_packages +from setuptools import find_packages, setup setup( name="mlflow-triton", diff --git a/docker/cpu_only/entrypoint.d/12-banner.sh b/docker/cpu_only/entrypoint.d/12-banner.sh old mode 100644 new mode 100755 diff --git a/docker/cpu_only/entrypoint.d/50-gpu-driver-check2.sh b/docker/cpu_only/entrypoint.d/50-gpu-driver-check2.sh old mode 100644 new mode 100755 diff --git a/docker/entrypoint.d/50-gpu-driver-check2.sh b/docker/entrypoint.d/50-gpu-driver-check2.sh old mode 100644 new mode 100755 diff --git a/docker/entrypoint.d/56-network-driver-version-check.sh b/docker/entrypoint.d/56-network-driver-version-check.sh old mode 100644 new mode 100755 index 8b13789179..a9bf588e2f --- a/docker/entrypoint.d/56-network-driver-version-check.sh +++ b/docker/entrypoint.d/56-network-driver-version-check.sh @@ -1 +1 @@ - +#!/bin/bash diff --git a/docker/entrypoint.d/70-shm-check.sh b/docker/entrypoint.d/70-shm-check.sh old mode 100644 new mode 100755 index 8b13789179..a9bf588e2f --- a/docker/entrypoint.d/70-shm-check.sh +++ b/docker/entrypoint.d/70-shm-check.sh @@ -1 +1 @@ - +#!/bin/bash diff --git a/docker/entrypoint.d/99-check-run-aip-mode.sh b/docker/entrypoint.d/99-check-run-aip-mode.sh old mode 100644 new mode 100755 diff --git a/docker/sagemaker/serve b/docker/sagemaker/serve index 8f98010c95..268f1f0f68 100755 --- a/docker/sagemaker/serve +++ b/docker/sagemaker/serve @@ -32,12 +32,12 @@ SAGEMAKER_SINGLE_MODEL_REPO=/opt/ml/model/ if [ -n "$SAGEMAKER_TRITON_OVERRIDE_PING_MODE" ]; then SAGEMAKER_TRITON_PING_MODE=${SAGEMAKER_TRITON_OVERRIDE_PING_MODE} else - SAGEMAKER_TRITON_PING_MODE="ready" + SAGEMAKER_TRITON_PING_MODE="ready" fi # Note: in Triton on SageMaker, each model url is registered as a separate repository # e.g., /opt/ml/models//model. Specifying MME model repo path as /opt/ml/models causes Triton -# to treat it as an additional empty repository and changes +# to treat it as an additional empty repository and changes # the state of all models to be UNAVAILABLE in the model repository # https://github.com/triton-inference-server/core/blob/main/src/model_repository_manager.cc#L914,L922 # On Triton, this path will be a dummy path as it's mandatory to specify a model repo when starting triton @@ -53,10 +53,10 @@ if [ -n "$SAGEMAKER_MULTI_MODEL" ]; then if [ -n "$SAGEMAKER_TRITON_OVERRIDE_PING_MODE" ]; then SAGEMAKER_TRITON_PING_MODE=${SAGEMAKER_TRITON_OVERRIDE_PING_MODE} else - SAGEMAKER_TRITON_PING_MODE="live" + SAGEMAKER_TRITON_PING_MODE="live" fi is_mme_mode=true - echo -e "Triton is running in SageMaker MME mode. Using Triton ping mode: \"${SAGEMAKER_TRITON_PING_MODE}\"" + echo -e "Triton is running in SageMaker MME mode. Using Triton ping mode: \"${SAGEMAKER_TRITON_PING_MODE}\"" fi fi diff --git a/docs/Makefile b/docs/Makefile index 9a2abe880c..98271dfb29 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -49,5 +49,5 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
-%: +%: @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md index ed4751c188..f6117c8168 100644 --- a/docs/README.md +++ b/docs/README.md @@ -29,11 +29,11 @@ # **Triton Inference Server Documentation** | [Installation](README.md#installation) | [Getting Started](README.md#getting-started) | [User Guide](README.md#user-guide) | [API Guide](protocol/README.md) | [Additional Resources](README.md#resources) | [Customization Guide](README.md#customization-guide) | -| ------------ | --------------- | --------------- | ------------ | --------------- | --------------- | +| ------------ | --------------- | --------------- | ------------ | --------------- | --------------- | -**New to Triton Inference Server?** Make use of +**New to Triton Inference Server?** Make use of [these tutorials](https://github.com/triton-inference-server/tutorials) - to begin your Triton journey! + to begin your Triton journey! ## **Installation** Before you can use the Triton Docker image you must install @@ -58,14 +58,14 @@ This guide covers the simplest possible workflow for deploying a model using a T - [Launch Triton](getting_started/quickstart.md#launch-triton) - [Send an Inference Request](getting_started/quickstart.md#send-an-inference-request) -Triton Inference Server has a considerable list versatile and powerful features. All new users are recommended to explore the [User Guide](README.md#user-guide) and the [additional resources](README.md#resources) sections for features most relevant to their use case. +Triton Inference Server has a considerable list versatile and powerful features. All new users are recommended to explore the [User Guide](README.md#user-guide) and the [additional resources](README.md#resources) sections for features most relevant to their use case. ## **User Guide** The User Guide describes how to configure Triton, organize and configure your models, use the C++ and Python clients, etc. This guide includes the following: * Creating a Model Repository [[Overview](README.md#model-repository) || [Details](user_guide/model_repository.md)] * Writing a Model Configuration [[Overview](README.md#model-configuration) || [Details](user_guide/model_configuration.md)] * Buillding a Model Pipeline [[Overview](README.md#model-pipeline)] -* Managing Model Availablity [[Overview](README.md#model-management) || [Details](user_guide/model_management.md)] +* Managing Model Availability [[Overview](README.md#model-management) || [Details](user_guide/model_management.md)] * Collecting Server Metrics [[Overview](README.md#metrics) || [Details](user_guide/metrics.md)] * Supporting Custom Ops/layers [[Overview](README.md#framework-custom-operations) || [Details](user_guide/custom_operations.md)] * Using the Client API [[Overview](README.md#client-libraries-and-examples) || [Details](https://github.com/triton-inference-server/client)] @@ -73,14 +73,14 @@ The User Guide describes how to configure Triton, organize and configure your mo * Deploying on edge (Jetson) [[Overview](README.md#jetson-and-jetpack)] -### Model Repository +### Model Repository [Model Repositories](user_guide/model_repository.md) are the organizational hub for using Triton. All models, configuration files, and additional resources needed to serve the models are housed inside a model repository. 
- [Cloud Storage](user_guide/model_repository.md#model-repository-locations) - [File Organization](user_guide/model_repository.md#model-files) - [Model Versioning](user_guide/model_repository.md#model-versions) ### Model Configuration -A [Model Configuration](user_guide/model_configuration.md) file is where you set the model-level options, such as output tensor reshaping and dynamic batch sizing. +A [Model Configuration](user_guide/model_configuration.md) file is where you set the model-level options, such as output tensor reshaping and dynamic batch sizing. #### Required Model Configuration @@ -112,7 +112,7 @@ The Model Configuration ModelOptimizationPolicy property is used to specify opti #### Scheduling and Batching -Triton supports batching individual inference requests to improve compute resource utilization. This is extremely important as individual requests typically will not saturate GPU resources thus not leveraging the parallelism provided by GPUs to its extent. Learn more about Triton's [Batcher and Scheduler](user_guide/model_configuration.md#scheduling-and-batching). +Triton supports batching individual inference requests to improve compute resource utilization. This is extremely important as individual requests typically will not saturate GPU resources thus not leveraging the parallelism provided by GPUs to its extent. Learn more about Triton's [Batcher and Scheduler](user_guide/model_configuration.md#scheduling-and-batching). - [Default Scheduler - Non-Batching](user_guide/model_configuration.md#default-scheduler) - [Dynamic Batcher](user_guide/model_configuration.md#dynamic-batcher) - [How to Configure Dynamic Batcher](user_guide/model_configuration.md#recommended-configuration-process) @@ -134,21 +134,21 @@ Triton supports batching individual inference requests to improve compute resour Rate limiter manages the rate at which requests are scheduled on model instances by Triton. The rate limiter operates across all models loaded in Triton to allow cross-model prioritization. [Learn more](user_guide/rate_limiter.md). #### Model Warmup -For a few of the Backends (check [Additional Resources](README.md#resources)) some or all of intialization is deffered till the first inference request is received, the benefit is resource conservation but comes with the downside of the initial requests getting processed slower than expected. Users can pre-"warm up" the model by instructing Triton to intialize the model. [Learn more](user_guide/model_configuration.md#model-warmup). +For a few of the Backends (check [Additional Resources](README.md#resources)) some or all of initialization is deferred until the first inference request is received, the benefit is resource conservation but comes with the downside of the initial requests getting processed slower than expected. Users can pre-"warm up" the model by instructing Triton to initialize the model. [Learn more](user_guide/model_configuration.md#model-warmup). #### Inference Request/Response Cache Triton has a feature which allows inference responses to get cached. [Learn More](user_guide/response_cache.md). ### Model Pipeline -Building ensembles is as easy as adding an addition configuration file which outlines the specific flow of tensors from one model to another. Any additional changes required by the model ensemble can be made in existing (individual) model configurations. +Building ensembles is as easy as adding an addition configuration file which outlines the specific flow of tensors from one model to another. 
Any additional changes required by the model ensemble can be made in existing (individual) model configurations. - [Model Ensemble](user_guide/architecture.md#ensemble-models) - [Business Logic Scripting (BLS)](https://github.com/triton-inference-server/python_backend#business-logic-scripting) ### Model Management -Users can specify policies in the model configuration for loading and unloading of models. This [section](user_guide/model_management.md) covers user selectable policy details. +Users can specify policies in the model configuration for loading and unloading of models. This [section](user_guide/model_management.md) covers user selectable policy details. - [Explicit Model Loading and Unloading](user_guide/model_management.md#model-control-mode-explicit) - [Modifying the Model Repository](user_guide/model_management.md#modifying-the-model-repository) ### Metrics -Triton provides Prometheus metrics like GPU Utilization, Memory Usage, Latency and more. Learn about [availble metrics](user_guide/metrics.md). +Triton provides Prometheus metrics like GPU Utilization, Memory Usage, Latency and more. Learn about [available metrics](user_guide/metrics.md). ### Framework Custom Operations Some frameworks provide the option of building custom layers/operations. These can be added to specific Triton Backends for the those frameworks. [Learn more](user_guide/custom_operations.md) - [TensorRT](user_guide/custom_operations.md#tensorrt) @@ -164,9 +164,9 @@ Use the [Triton Client](https://github.com/triton-inference-server/client) API t - [go](https://github.com/triton-inference-server/client/tree/main/src/grpc_generated/go) - [Java/Scala](https://github.com/triton-inference-server/client/tree/main/src/grpc_generated/java) - [Javascript](https://github.com/triton-inference-server/client/tree/main/src/grpc_generated/javascript) -- [Shared Memory Extention](protocol/extension_shared_memory.md) +- [Shared Memory Extension](protocol/extension_shared_memory.md) ### Performance Analysis -Understanding Inference perfomance is key to better resource utilization. Use Triton's Tools to costomize your deployment. +Understanding Inference performance is key to better resource utilization. Use Triton's Tools to costomize your deployment. - [Performance Tuning Guide](user_guide/performance_tuning.md) - [Optimization](user_guide/optimization.md) - [Model Analyzer](user_guide/model_analyzer.md) @@ -189,7 +189,7 @@ The following resources are recommended to explore the full suite of Triton Infe - [Model Navigator](https://github.com/triton-inference-server/model_navigator): The Triton Model Navigator is a tool that provides the ability to automate the process of moving model from source to optimal format and configuration for deployment on Triton Inference Server. The tool supports export model from source to all possible formats and applies the Triton Inference Server backend optimizations. -- **Backends**: Triton has suports a wide varity of frameworks used to run models. Users can extend this functionality by creating custom backends. +- **Backends**: Triton has supports a wide variety of frameworks used to run models. Users can extend this functionality by creating custom backends. 
- [PyTorch](https://github.com/triton-inference-server/pytorch_backend): Widely used Open Source DL Framework - [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend): Widely used Open Source DL Framework - [TensorRT](https://github.com/triton-inference-server/tensorrt_backend): NVIDIA [TensorRT](https://developer.nvidia.com/tensorrt) is an inference acceleration SDK that provide a with range of graph optimizations, kernel optimization, use of lower precision, and more. diff --git a/docs/_static/custom.css b/docs/_static/custom.css index 798df3d541..a8c37ced01 100644 --- a/docs/_static/custom.css +++ b/docs/_static/custom.css @@ -114,14 +114,14 @@ font-family: NVIDIA Sans, Helvetica, Arial, Sans-serif; font-size: 0.85em; } -/* colors +/* colors nv green 118,185,0 black 0, 0, 0 light gray 205, 205, 205 medium gray 140, 140, 140 dark gray 94, 94, 94 -emerald 0, 133, 100 +emerald 0, 133, 100 emerald #008564 amethyst 92, 22, 130 amethyst #5C1682 diff --git a/docs/conf.py b/docs/conf.py old mode 100644 new mode 100755 index 98000f2227..9378329752 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -46,9 +48,9 @@ # -- Project information ----------------------------------------------------- -project = 'NVIDIA Triton Inference Server' -copyright = '2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved' -author = 'NVIDIA' +project = "NVIDIA Triton Inference Server" +copyright = "2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved" +author = "NVIDIA" # The full version, including alpha/beta/rc tags # Env only set during riva-release process, otherwise keep as dev for all internal builds @@ -69,7 +71,7 @@ "sphinx_copybutton", "sphinx_design", "sphinx-prompt", - #"sphinxcontrib.bibtex", + # "sphinxcontrib.bibtex", "sphinx_tabs.tabs", "sphinx_sitemap", ] @@ -79,7 +81,9 @@ numfig = True # final location of docs for seo/sitemap -html_baseurl = 'https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/' +html_baseurl = ( + "https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/" +) myst_enable_extensions = [ "dollarmath", @@ -96,7 +100,7 @@ myst_heading_anchors = 5 # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -121,7 +125,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". 
-html_static_path = ['_static'] +html_static_path = ["_static"] html_css_files = ["custom.css"] html_theme_options = { @@ -150,11 +154,10 @@ deploy_ngc_org = "nvidia" deploy_ngc_team = "triton" myst_substitutions = { - "VersionNum": - version_short, - "deploy_ngc_org_team": - f"{deploy_ngc_org}/{deploy_ngc_team}" - if deploy_ngc_team else deploy_ngc_org, + "VersionNum": version_short, + "deploy_ngc_org_team": f"{deploy_ngc_org}/{deploy_ngc_team}" + if deploy_ngc_team + else deploy_ngc_org, } @@ -167,31 +170,31 @@ def ultimateReplace(app, docname, source): # this is a necessary hack to allow us to fill in variables that exist in code blocks ultimate_replacements = { - "{VersionNum}": - version_short, - "{SamplesVersionNum}": - version_short, - "{NgcOrgTeam}": - f"{deploy_ngc_org}/{deploy_ngc_team}" - if deploy_ngc_team else deploy_ngc_org, + "{VersionNum}": version_short, + "{SamplesVersionNum}": version_short, + "{NgcOrgTeam}": f"{deploy_ngc_org}/{deploy_ngc_team}" + if deploy_ngc_team + else deploy_ngc_org, } -#bibtex_bibfiles = ["references.bib"] +# bibtex_bibfiles = ["references.bib"] # To test that style looks good with common bibtex config -#bibtex_reference_style = "author_year" -#bibtex_default_style = "plain" +# bibtex_reference_style = "author_year" +# bibtex_default_style = "plain" -### We currrently use Myst: https://myst-nb.readthedocs.io/en/latest/use/execute.html +### We currently use Myst: https://myst-nb.readthedocs.io/en/latest/use/execute.html jupyter_execute_notebooks = "off" # Global execution disable # execution_excludepatterns = ['tutorials/tts-python-basics.ipynb'] # Individual notebook disable def setup(app): - app.add_config_value('ultimate_replacements', {}, True) - app.connect('source-read', ultimateReplace) + app.add_config_value("ultimate_replacements", {}, True) + app.connect("source-read", ultimateReplace) app.add_js_file("https://js.hcaptcha.com/1/api.js") - visitor_script = "//assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js" + visitor_script = ( + "//assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js" + ) if visitor_script: app.add_js_file(visitor_script) @@ -213,8 +216,9 @@ def setup(app): # Patch for sphinx.search stemming short terms (i.e. 
tts -> tt) # https://github.com/sphinx-doc/sphinx/blob/4.5.x/sphinx/search/__init__.py#L380 -def sphinxSearchIndexFeed(self, docname: str, filename: str, title: str, - doctree: nodes.document): +def sphinxSearchIndexFeed( + self, docname: str, filename: str, title: str, doctree: nodes.document +): """Feed a doctree to the index.""" self._titles[docname] = title self._filenames[docname] = filename @@ -242,11 +246,9 @@ def stem(word: str) -> str: for word in visitor.found_words: stemmed_word = stem(word) # again, stemmer must not remove words from search index - if len(stemmed_word) <= 3 or not _filter(stemmed_word) and _filter( - word): + if len(stemmed_word) <= 3 or not _filter(stemmed_word) and _filter(word): stemmed_word = word.lower() - already_indexed = docname in self._title_mapping.get( - stemmed_word, set()) + already_indexed = docname in self._title_mapping.get(stemmed_word, set()) if _filter(stemmed_word) and not already_indexed: self._mapping.setdefault(stemmed_word, set()).add(docname) diff --git a/docs/customization_guide/build.md b/docs/customization_guide/build.md index ddfc931c19..fca514ab64 100644 --- a/docs/customization_guide/build.md +++ b/docs/customization_guide/build.md @@ -47,9 +47,9 @@ The Triton source is distributed across multiple GitHub repositories that together can be built and installed to create a complete Triton installation. Triton server is built using CMake and (optionally) Docker. To simplify the build process, Triton provides a -[build.py](https://github.com/triton-inference-server/server/blob/main/build.py) script. -The build.py script will generate the CMake and Docker build steps required to -build Triton, and will optionally invoke those steps or leave the invocation to +[build.py](https://github.com/triton-inference-server/server/blob/main/build.py) script. +The build.py script will generate the CMake and Docker build steps required to +build Triton, and will optionally invoke those steps or leave the invocation to you, as described below. The build.py script currently supports building Triton for the @@ -197,9 +197,9 @@ To include the TensorFlow2 backend in your CPU-only build, you must provide this additional flag to build.py: `--extra-backend-cmake-arg=tensorflow2:TRITON_TENSORFLOW_INSTALL_EXTRA_DEPS=ON`. -CPU-only builds of the TensorFlow and PyTorch backends require some CUDA stubs -and runtime dependencies that are not present in the CPU-only base container. -These are retrieved from a GPU base container, which can be changed with the +CPU-only builds of the TensorFlow and PyTorch backends require some CUDA stubs +and runtime dependencies that are not present in the CPU-only base container. +These are retrieved from a GPU base container, which can be changed with the `--image=gpu-base,nvcr.io/nvidia/tritonserver:-py3-min` flag. ### Building Without Docker diff --git a/docs/customization_guide/inference_protocols.md b/docs/customization_guide/inference_protocols.md index 97a505d720..6110b739cd 100644 --- a/docs/customization_guide/inference_protocols.md +++ b/docs/customization_guide/inference_protocols.md @@ -149,7 +149,7 @@ protocol types mentioned above: request to the protocol is received. The completed header will be in the form of `triton-grpc-protocol-` -* `restricted-value` : The value of the header to be matched in order to preceed +* `restricted-value` : The value of the header to be matched in order to proceed in the process of the specified protocols. #### Example @@ -177,8 +177,8 @@ tritonserver.dll. 
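Not part of the patch: the restricted-protocol paragraphs above describe the `triton-grpc-protocol-` header prefix and the `restricted-value` that must accompany it, but the example section itself is not included in this hunk. The sketch below shows how a Python gRPC client could pass such a header, assuming a purely hypothetical restricted key `admin-key` with value `admin-value`; substitute whatever key and value the server was actually configured with.

```python
# Hypothetical sketch: calling a restricted gRPC protocol by supplying the
# matching header. The key/value pair here is made up for illustration.
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

# Header name is the triton-grpc-protocol- prefix plus the restricted key.
auth_headers = {"triton-grpc-protocol-admin-key": "admin-value"}

# Without the header, a restricted health/metadata call would be rejected.
print(client.is_server_live(headers=auth_headers))
```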
In the Triton Docker image the shared library is found in /opt/tritonserver/lib. The header file that defines and documents the Server API is [tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h). -[Java bindings for In-Process Triton Server API](#java-bindings-for-in-process-triton-server-api) -are built on top of `tritonserver.h` and can be used for Java applications that +[Java bindings for In-Process Triton Server API](#java-bindings-for-in-process-triton-server-api) +are built on top of `tritonserver.h` and can be used for Java applications that need to use Tritonserver in-process. All capabilities of Triton server are encapsulated in the shared @@ -206,7 +206,7 @@ When you link the Triton shared library into your application you are *not* spawning a separate Triton process, instead, you are including the Triton core logic directly in your application. The Triton HTTP/REST or GRPC protocols are not used to communicate with this -Triton core logic, instead all communication between your appliation +Triton core logic, instead all communication between your application and the Triton core logic must take place via the [Server API](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h). @@ -384,7 +384,7 @@ of Triton. The primary source files for the endpoints are The Triton Inference Server uses [Java CPP](https://github.com/bytedeco/javacpp) to create bindings around Tritonserver to create Java API. -The API is documented in +The API is documented in [tritonserver.java](https://github.com/bytedeco/javacpp-presets/blob/master/tritonserver/src/gen/java/org/bytedeco/tritonserver/global/tritonserver.java). Alternatively, the user can refer to the web version [API docs](http://bytedeco.org/javacpp-presets/tritonserver/apidocs/) generated from `tritonserver.java`. @@ -393,8 +393,8 @@ and the bindings for `C-API Wrapper`. More information about the [developer_tool A simple example using the Java API can be found in [Samples folder](https://github.com/bytedeco/javacpp-presets/tree/master/tritonserver/samples) -which includes `Simple.java` which is similar to -[`simple.cc`](https://github.com/triton-inference-server/server/blob/main/src/simple.cc). +which includes `Simple.java` which is similar to +[`simple.cc`](https://github.com/triton-inference-server/server/blob/main/src/simple.cc). Please refer to [sample usage documentation](https://github.com/bytedeco/javacpp-presets/tree/master/tritonserver#sample-usage) to learn about how to build and run `Simple.java`. @@ -438,7 +438,7 @@ After ensuring that Tritonserver and dependencies are installed, you can run you Java program with the Java bindings with the following steps: 1. Place Java bindings into your environment. You can do this by either: - + a. Building Java API bindings with provided build script: ```bash # Clone Triton client repo. Recommended client repo tag is: main @@ -451,7 +451,7 @@ Java program with the Java bindings with the following steps: $ source clientrepo/src/java-api-bindings/scripts/install_dependencies_and_build.sh --enable-developer-tools-server` ``` This will install the Java bindings to `/workspace/install/java-api-bindings/tritonserver-java-bindings.jar` - + *or* b. 
Copying "Uber Jar" from Triton SDK container to your environment @@ -459,7 +459,7 @@ Java program with the Java bindings with the following steps: $ id=$(docker run -dit nvcr.io/nvidia/tritonserver:-py3-sdk bash) $ docker cp ${id}:/workspace/install/java-api-bindings/tritonserver-java-bindings.jar /tritonserver-java-bindings.jar $ docker stop ${id} - ``` + ``` **Note:** `tritonserver-java-bindings.jar` only includes the `In-Process Java Bindings`. To use the `C-API Wrapper Java Bindings`, please use the build script. 2. Use the built "Uber Jar" that contains the Java bindings ```bash @@ -474,7 +474,7 @@ bindings Jar](#run-java-program-with-java-bindings-jar) to also build the jar yourself without any modifications to the Tritonserver bindings in JavaCPP-presets. You can do this using the following steps: -1. Create the JNI binaries in your local repository (`/root/.m2/repository`) +1. Create the JNI binaries in your local repository (`/root/.m2/repository`) with [`javacpp-presets/tritonserver`](https://github.com/bytedeco/javacpp-presets/tree/master/tritonserver) ```bash $ git clone https://github.com/bytedeco/javacpp-presets.git @@ -482,8 +482,8 @@ JavaCPP-presets. You can do this using the following steps: $ mvn clean install --projects .,tritonserver $ mvn clean install -f platform --projects ../tritonserver/platform -Djavacpp.platform=linux-x86_64 ``` -2. Create your custom `*.pom` file for Maven. Please refer to - [samples/simple/pom.xml](https://github.com/bytedeco/javacpp-presets/blob/master/tritonserver/samples/simple/pom.xml) as +2. Create your custom `*.pom` file for Maven. Please refer to + [samples/simple/pom.xml](https://github.com/bytedeco/javacpp-presets/blob/master/tritonserver/samples/simple/pom.xml) as reference for how to create your pom file. 3. After creating your `pom.xml` file you can build your application with: ```bash diff --git a/docs/examples/README.md b/docs/examples/README.md index 085e1ee803..3261bc6a9d 100644 --- a/docs/examples/README.md +++ b/docs/examples/README.md @@ -28,7 +28,7 @@ # Triton Examples -**New to Triton Inference Server?** Make use of [these tutorials](https://github.com/triton-inference-server/tutorials) to begin your Triton journey! +**New to Triton Inference Server?** Make use of [these tutorials](https://github.com/triton-inference-server/tutorials) to begin your Triton journey! This folder contains the following: * jetson: This covers deploying Triton Inference Server on Jetson devices. 
diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/Makefile b/docs/examples/jetson/concurrency_and_dynamic_batching/Makefile index 5b22a63e06..6dcf0d0dc4 100644 --- a/docs/examples/jetson/concurrency_and_dynamic_batching/Makefile +++ b/docs/examples/jetson/concurrency_and_dynamic_batching/Makefile @@ -32,14 +32,14 @@ GCC_PARMS+=-I${HOME}/tritonserver/include/tritonserver -D TRITON_ENABLE_GPU=ON - GCC_LIBS=-L${HOME}/tritonserver/lib -L/usr/lib -L/usr/local/cuda/targets/aarch64-linux/lib GCC_LIBS+=-lpthread -ltritonserver -lopencv_core -lopencv_highgui -lopencv_imgproc -lopencv_imgcodecs -lopencv_dnn -lcudart -all: $(TARGET) +all: $(TARGET) %.o: %.cc $(GCC) $(GCC_PARMS) -c -g -o $@ $^ $(TARGET): $(TARGET).o - $(GCC) $^ $(GCC_LIBS) -o $@ + $(GCC) $^ $(GCC_LIBS) -o $@ clean: rm -f $(TARGET).o $(TARGET) diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/README.md b/docs/examples/jetson/concurrency_and_dynamic_batching/README.md index ad3c473dfb..30cfe196f1 100644 --- a/docs/examples/jetson/concurrency_and_dynamic_batching/README.md +++ b/docs/examples/jetson/concurrency_and_dynamic_batching/README.md @@ -48,7 +48,7 @@ ngc registry model download-version "nvidia/tao/peoplenet:pruned_v2.1" For latter you need to setup the [NGC CLI](https://ngc.nvidia.com/setup). -Having downloaded the model from the NGC, unzip the archive `peoplenet_pruned_v2.1.zip` into `concurrency_and_dynamic_batching/tao/models/peoplenet`. +Having downloaded the model from the NGC, unzip the archive `peoplenet_pruned_v2.1.zip` into `concurrency_and_dynamic_batching/tao/models/peoplenet`. If you have the zip archive in the `concurrency_and_dynamic_batching` directory, the following will automatically place the model to the correct location: @@ -78,10 +78,10 @@ The `tao-converter` tool is available as a compiled release file for different p After you have downloaded `tao-converter`, you might need to execute ```shell -chmod 777 tao-converter -``` +chmod 777 tao-converter +``` -in the directory with the tool. +in the directory with the tool. We provide a conversion script `tao/convert_peoplenet.sh` which expects the model to be present at the location. @@ -139,13 +139,13 @@ To execute from the terminal, run from the `concurrency_and_dynamic_batching` di LD_LIBRARY_PATH=$HOME/tritonserver/lib ./people_detection -m system -v -r $(pwd)/trtis_model_repo_sample_1 -t 6 -s false -p $HOME/tritonserver ``` -The parameter `-t` controlls the number of concurrent inference calls we want to execute. We will be executing the same model on the same sample image with the purpose of demonstrating how setting different concurency options affects the performance. +The parameter `-t` controls the number of concurrent inference calls we want to execute. We will be executing the same model on the same sample image with the purpose of demonstrating how setting different concurrency options affects the performance. You can enable saving detected bounding boxes in the project directory in form of overlays over the original image for each execution thread. You can turn the visualization on by setting the parameter `-s` to `true` upon execution (`-s` is set to `false` by default). ### Expected output -Upon execution, in the terminal log you will see _Model 'peoplenet' Stats_ in json format reflecting the inference performance. We also output _TOTAL INFERENCE TIME_ which simply reflects the elapsed time requred to run the application including data loading, pre-processing and post-processing. 
+Upon execution, in the terminal log you will see _Model 'peoplenet' Stats_ in json format reflecting the inference performance. We also output _TOTAL INFERENCE TIME_ which simply reflects the elapsed time required to run the application including data loading, pre-processing and post-processing. A typical output in the log for _Model 'peoplenet' Stats_ looks as follows: @@ -210,7 +210,7 @@ TOTAL INFERENCE TIME: 174ms To learn about different statistics check out the [documentation](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_statistics.md#statistics-extension). -To see how setting different values for concurrency affects total execution time and its componets reflected in the model stats, you need to modify a single parameter in the model config file. +To see how setting different values for concurrency affects total execution time and its components reflected in the model stats, you need to modify a single parameter in the model config file. To enable concurrent model execution support for a model, corresponding model config file `trtis_model_repo_sample_1/peoplenet/config.pbtxt` includes the following: @@ -223,17 +223,17 @@ instance_group [ ] ``` -You can change the count of allowed inferences for the same model instance and observe how it affects performance in _Model 'peoplenet' Stats_ and _TOTAL INFERENCE TIME_. Note that on Jetson we dont recommend setting values too high: for instance, on a device like a Jetson Xavier AGX we don't recommend setting the number larger than 6. The values in the range 1-3 are optimal. +You can change the count of allowed inferences for the same model instance and observe how it affects performance in _Model 'peoplenet' Stats_ and _TOTAL INFERENCE TIME_. Note that on Jetson we dont recommend setting values too high: for instance, on a device like a Jetson Xavier AGX we don't recommend setting the number larger than 6. The values in the range 1-3 are optimal. While trying out different values, note how it affects total inference time as well as some inference statistics (like queue and compute times) ## Demonstration case 2: Dynamic batching -For models that support batching, Triton implements multiple scheduling and batching algorithms that combine individual inference requests together to improve inference throughput. In this example, we want to demonstrate how enbling automatic dynamic batching affects inference performance. +For models that support batching, Triton implements multiple scheduling and batching algorithms that combine individual inference requests together to improve inference throughput. In this example, we want to demonstrate how enbling automatic dynamic batching affects inference performance. ### Running the sample -To observe the effect of dynamic batching, from the `concurrency_and_dynamic_batching` directory execute: +To observe the effect of dynamic batching, from the `concurrency_and_dynamic_batching` directory execute: ```shell LD_LIBRARY_PATH=$HOME/tritonserver/lib ./people_detection -m system -v -r $(pwd)/trtis_model_repo_sample_2 -t 6 -s false -p $HOME/tritonserver @@ -326,6 +326,6 @@ dynamic_batching { } ``` -To try further options of dynamic batcher see the [documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#dynamic-batcher). +To try further options of dynamic batcher see the [documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#dynamic-batcher). 
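As a quick, hedged illustration of experimenting with these settings, you can rerun the sample above with a different number of concurrent calls (the `-t 2` value below is only an example; the paths match the earlier commands):

```bash
# Rerun the dynamic batching sample with fewer concurrent inference calls
# and compare the queue/compute times reported in the model stats.
LD_LIBRARY_PATH=$HOME/tritonserver/lib ./people_detection -m system -v \
    -r $(pwd)/trtis_model_repo_sample_2 -t 2 -s false -p $HOME/tritonserver
```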
You can also try enabling both concurrent model execution and dynamic batching. \ No newline at end of file diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/tao/convert_peoplenet.sh b/docs/examples/jetson/concurrency_and_dynamic_batching/tao/convert_peoplenet.sh old mode 100644 new mode 100755 diff --git a/docs/examples/model_repository/simple_identity/config.pbtxt b/docs/examples/model_repository/simple_identity/config.pbtxt old mode 100755 new mode 100644 diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 316d33c24c..fa1a8ec690 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -28,10 +28,10 @@ # Quickstart -**New to Triton Inference Server and want do just deploy your model quickly?** -Make use of +**New to Triton Inference Server and want do just deploy your model quickly?** +Make use of [these tutorials](https://github.com/triton-inference-server/tutorials#quick-deploy) - to begin your Triton journey! + to begin your Triton journey! The Triton Inference Server is available as [buildable source code](../customization_guide/build.md), but the easiest way to install and run Triton is to @@ -49,7 +49,7 @@ Launching and maintaining Triton Inference Server revolves around the use of bui The [model repository](../user_guide/model_repository.md) is the directory where you place the models that you want Triton to serve. An example model repository is included in the -[docs/examples/model_repository](../examples/model_repository). +[docs/examples/model_repository](../examples/model_repository). Before using the repository, you must fetch any missing model definition files from their public model zoos via the provided script. diff --git a/docs/index.md b/docs/index.md index e3fcb91338..7ae2b22173 100644 --- a/docs/index.md +++ b/docs/index.md @@ -71,7 +71,7 @@ Major features include: - [Concurrent model execution](user_guide/architecture.md#concurrent-model-execution) - [Dynamic batching](user_guide/model_configuration.md#dynamic-batcher) -- [Sequence batching](user_guide/model_configuration.md#sequence-batcher) and +- [Sequence batching](user_guide/model_configuration.md#sequence-batcher) and [implicit state management](user_guide/architecture.md#implicit-state-management) for stateful models - Provides [Backend API](https://github.com/triton-inference-server/backend) that @@ -90,8 +90,8 @@ Major features include: - [Metrics](user_guide/metrics.md) indicating GPU utilization, server throughput, server latency, and more -Join the [Triton and TensorRT community](https://www.nvidia.com/en-us/deep-learning-ai/triton-tensorrt-newsletter/) and stay current on the latest product updates, bug fixes, content, best -practices, and more. Need enterprise support? NVIDIA global support is available +Join the [Triton and TensorRT community](https://www.nvidia.com/en-us/deep-learning-ai/triton-tensorrt-newsletter/) and stay current on the latest product updates, bug fixes, content, best +practices, and more. Need enterprise support? NVIDIA global support is available for Triton Inference Server with the [NVIDIA AI Enterprise software suite](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/). -See the [Lastest Release Notes](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-23-05.html#rel-23-05) for updates on the newest features and bug fixes. 
+See the [Latest Release Notes](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-23-05.html#rel-23-05) for updates on the newest features and bug fixes. diff --git a/docs/protocol/extension_logging.md b/docs/protocol/extension_logging.md index 2b31863f0f..d5b770d5d4 100644 --- a/docs/protocol/extension_logging.md +++ b/docs/protocol/extension_logging.md @@ -29,7 +29,7 @@ # Logging Extension This document describes Triton's logging extension. The logging extension enables -the client to configure log settings during a Triton run. Triton reports "logging" +the client to configure log settings during a Triton run. Triton reports "logging" in the extensions field of its Server Metadata. ## HTTP/REST @@ -41,7 +41,7 @@ indicates an optional JSON field. Triton exposes the logging endpoint at the following URL. The client may use HTTP GET request to retrieve the current log settings. A HTTP POST request will modify the log settings, and the endpoint will return the updated log -settings on success or an error in the case of failure. +settings on success or an error in the case of failure. ``` GET v2/logging @@ -65,22 +65,22 @@ $log_setting = $string : $string | $boolean | $number ``` Each `$log_setting` JSON describes a “name”/”value” pair, where the “name” is -the `$string` representation of the log setting and the “value” is a `$string`, -`$bool`, or `$number` representation of the setting value. Currently, the +the `$string` representation of the log setting and the “value” is a `$string`, +`$bool`, or `$number` representation of the setting value. Currently, the following log settings are defined: - "log_file" : a `$string` parameter defining the file where the log outputs will be saved. If an empty string is specified, log outputs will stream to the console. -- "log_info" : a `$boolean` parameter that controls whether the Triton server logs INFO level messages. +- "log_info" : a `$boolean` parameter that controls whether the Triton server logs INFO level messages. -- "log_warning" : a `$boolean` parameter that controls whether the Triton server logs WARNING level messages. +- "log_warning" : a `$boolean` parameter that controls whether the Triton server logs WARNING level messages. -- "log_error" : a `$boolean` parameter that controls whether the Triton server logs ERROR level messages. +- "log_error" : a `$boolean` parameter that controls whether the Triton server logs ERROR level messages. - "log_verbose_level" : a `$number` parameter that controls whether the Triton server outputs verbose messages -of varying degrees. This value can be any integer >= 0. If "log_verbose_level" is 0, verbose logging will be disabled, and +of varying degrees. This value can be any integer >= 0. If "log_verbose_level" is 0, verbose logging will be disabled, and no verbose messages will be output by the Triton server. If "log_verbose_level" is 1, level 1 verbose messages will be output -by the Triton server. If "log_verbose_level" is 2, the Triton server will output all verbose messages of +by the Triton server. If "log_verbose_level" is 2, the Triton server will output all verbose messages of level <= 2, etc. Attempting to set "log_verbose_level" to a number < 0 will result in an error. - "log_format" : a `$string` parameter that controls the format of Triton server log messages. There are currently @@ -121,7 +121,7 @@ When a `$log_setting` JSON is received (defined above), only the specified settings will be updated. 
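Before modifying anything, you can also read back the current configuration with a plain GET request; a minimal sketch, assuming a Triton server at `localhost:8000` as in the example below:

```bash
# Retrieve the current log settings as a $log_setting_response JSON object.
curl -s -w '\n%{http_code}\n' -X GET localhost:8000/v2/logging
```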
### Example Usage -The logging protocol extension can be invoked using the curl library in the following manner (assuming +The logging protocol extension can be invoked using the curl library in the following manner (assuming a Triton server is running at `localhost:8000`): ``` curl -s -w '\n%{http_code}\n' -d '{"log_verbose_level":1}' -X POST localhost:8000/v2/logging @@ -131,7 +131,7 @@ This command should return a `$log_setting_response` JSON object with the follow {"log_file":"","log_info":true,"log_warnings":true,"log_errors":true,"log_verbose_level":1,"log_format":"default"} 200 ``` -Note that the current values for all parameter fields are returned even though `log_verbose_level` +Note that the current values for all parameter fields are returned even though `log_verbose_level` was the only parameter that was modified. ## GRPC diff --git a/docs/protocol/extension_model_configuration.md b/docs/protocol/extension_model_configuration.md index 07ecc63e94..a9baaa58d7 100644 --- a/docs/protocol/extension_model_configuration.md +++ b/docs/protocol/extension_model_configuration.md @@ -39,7 +39,7 @@ In all JSON schemas shown in this document `$number`, `$string`, `$boolean`, `$object` and `$array` refer to the fundamental JSON types. #optional indicates an optional JSON field. -Triton exposes the model configuation endpoint at the following +Triton exposes the model configuration endpoint at the following URL. The versions portion of the URL is optional; if not provided Triton will return model configuration for the highest-numbered version of the model. diff --git a/docs/protocol/extension_parameters.md b/docs/protocol/extension_parameters.md index f75f069862..4cdb60cf38 100644 --- a/docs/protocol/extension_parameters.md +++ b/docs/protocol/extension_parameters.md @@ -89,12 +89,12 @@ ModelInferRequest message can be used to send custom parameters. ## Forwarding HTTP/GRPC Headers as Parameters -Triton can forward HTTP/GRPC headers as inference request parameters. By +Triton can forward HTTP/GRPC headers as inference request parameters. By specifying a regular expression in `--http-header-forward-pattern` and `--grpc-header-forward-pattern`, Triton will add the headers that match with the regular expression as request parameters. All the forwarded headers will be added as a parameter with string -value. For example to forward all the headers that start with 'PREFIX_' from +value. For example to forward all the headers that start with 'PREFIX_' from both HTTP and GRPC, you should add `--http-header-forward-pattern PREFIX_.* --grpc-header-forward-pattern PREFIX_.*` to your `tritonserver` command. diff --git a/docs/protocol/extension_schedule_policy.md b/docs/protocol/extension_schedule_policy.md index 25c63e2d1b..c3c57a63c7 100644 --- a/docs/protocol/extension_schedule_policy.md +++ b/docs/protocol/extension_schedule_policy.md @@ -36,7 +36,7 @@ request. 
Because this extension is supported, Triton reports Note the policies are specific to [dynamic batcher](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#dynamic-batcher) and only experimental support to [sequence -batcher](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#sequence-batcher) +batcher](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#sequence-batcher) with the [direct](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#direct) scheduling strategy. diff --git a/docs/protocol/extension_sequence.md b/docs/protocol/extension_sequence.md index f7ebdf9c7d..51c99fc3cf 100644 --- a/docs/protocol/extension_sequence.md +++ b/docs/protocol/extension_sequence.md @@ -50,13 +50,13 @@ if the "sequence_id" parameter supports string types. - "sequence_start" : boolean value if set to true in a request indicates that the request is the first in a sequence. If not set, or set to false the request is not the first in a sequence. If set - the "sequence_id" parameter must be set to a non-zero or non-empty string + the "sequence_id" parameter must be set to a non-zero or non-empty string value. - "sequence_end" : boolean value if set to true in a request indicates that the request is the last in a sequence. If not set, or set to false the request is not the last in a sequence. If set the - "sequence_id" parameter must be set to a non-zero or non-empty string + "sequence_id" parameter must be set to a non-zero or non-empty string value. ## HTTP/REST diff --git a/docs/protocol/extension_statistics.md b/docs/protocol/extension_statistics.md index 6e82e971ba..46e1a92322 100644 --- a/docs/protocol/extension_statistics.md +++ b/docs/protocol/extension_statistics.md @@ -227,7 +227,7 @@ $duration_stat = - “ns” : The total duration for the statistic in nanoseconds. ``` -$memory_usage = +$memory_usage = { "type" : $string, "id" : $number, @@ -375,7 +375,7 @@ message InferStatistics StatisticDuration fail = 2; // The count and cumulative duration that inference requests wait in - // scheduling or other queues. The "queue" count and cumulative + // scheduling or other queues. The "queue" count and cumulative // duration includes cache hits. StatisticDuration queue = 3; @@ -405,7 +405,7 @@ message InferStatistics // and extract output tensor data from the Response Cache on a cache // hit. For example, this duration should include the time to copy // output tensor data from the Response Cache to the response object. - // On cache hits, triton does not need to go to the model/backend + // On cache hits, triton does not need to go to the model/backend // for the output tensor data, so the "compute_input", "compute_infer", // and "compute_output" fields are not updated. Assuming the response // cache is enabled for a given model, a cache hit occurs for a @@ -419,7 +419,7 @@ message InferStatistics // The count of response cache misses and cumulative duration to lookup // and insert output tensor data from the computed response to the cache // For example, this duration should include the time to copy - // output tensor data from the resposne object to the Response Cache. + // output tensor data from the response object to the Response Cache. 
// Assuming the response cache is enabled for a given model, a cache // miss occurs for a request to that model when the request metadata // does NOT hash to an existing entry in the cache. See the response @@ -452,7 +452,7 @@ message InferBatchStatistics } // Memory usage. -message MemoryUsage +message MemoryUsage { // The type of memory, the value can be "CPU", "CPU_PINNED", "GPU". string type = 1; diff --git a/docs/protocol/extension_trace.md b/docs/protocol/extension_trace.md index 35905b6bef..6472e1db24 100644 --- a/docs/protocol/extension_trace.md +++ b/docs/protocol/extension_trace.md @@ -78,7 +78,7 @@ see trace setting "log_frequency" below for detail. - "trace_level" : the trace level. "OFF" to disable tracing, "TIMESTAMPS" to trace timestamps, "TENSORS" to trace tensors. This value is an array of string where user may specify multiple levels to -trace multiple informations. +trace multiple information. - "trace_rate" : the trace sampling rate. The value represents how many requests will one trace be sampled from. For example, if the trace rate is "1000", 1 trace will be sampled for every 1000 requests. diff --git a/docs/user_guide/architecture.md b/docs/user_guide/architecture.md index 973cb98f9d..b343842014 100644 --- a/docs/user_guide/architecture.md +++ b/docs/user_guide/architecture.md @@ -312,7 +312,7 @@ description of the model contains variable-sized dimensions, Triton will use *1* for every variable-sized dimension for the starting request. For other non-starting requests in the sequence, the input state is the output state of the previous request in the sequence. For an example ONNX model that uses -implicit state you can refer to this onnx model generated from the +implicit state you can refer to this onnx model generated from the `create_onnx_modelfile_wo_initial_state()` [from this generation script](https://github.com/triton-inference-server/server/blob/main/qa/common/gen_qa_implicit_models.py). This is a simple accumulator model that stores the partial sum of the requests @@ -321,8 +321,8 @@ request is starting, the model sets the "OUTPUT\_STATE" to be equal to the "INPUT" tensor. For non-starting requests, it sets the "OUTPUT\_STATE" tensor to the sum of "INPUT" and "INPUT\_STATE" tensors. -In addition to the default state initilization discussed above, Triton provides -two other mechanisms for initilizing state. +In addition to the default state initialization discussed above, Triton provides +two other mechanisms for initializing state. ###### Initializing State from Zero. @@ -354,7 +354,7 @@ converted to fixed size dimensions. For initializing state from file, you need to create a directory named "initial\_state" under the model directory. The file that contains the initial -state under this directory needs to be provided in the *data_file* field. +state under this directory needs to be provided in the *data_file* field. The data stored in this file will be used in row-major order as the initial state. Below is an example state description initializing state from file. @@ -522,7 +522,7 @@ model. Over time the following happens: the sequence scheduler sees them both available in their respective batch slots. The scheduler immediately schedules the model instance to perform a batch-size 2 inference and uses START and READY to show - that both slots have an inference request avaiable but that only + that both slots have an inference request available but that only slot1 is the start of a new sequence. 
* The processing continues in a similar manner for the other inference @@ -799,7 +799,7 @@ scheduler will: #### Additional Resources You can find additional end-to-end ensemble examples in the links below: -* [This guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_5-Model_Ensembles) +* [This guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_5-Model_Ensembles) explores the concept of ensembles with a running example. * [Preprocessing in Python Backend Using Ensemble](https://github.com/triton-inference-server/python_backend#preprocessing) diff --git a/docs/user_guide/custom_operations.md b/docs/user_guide/custom_operations.md index 34e0288b8d..c3748647c4 100644 --- a/docs/user_guide/custom_operations.md +++ b/docs/user_guide/custom_operations.md @@ -72,7 +72,7 @@ container. TensorFlow allows users to [add custom operations](https://www.tensorflow.org/guide/create_op) which can then be used in TensorFlow models. You can load custom TensorFlow operations -into Triton in two ways: +into Triton in two ways: * At model load time, by listing them in the model configuration. * At server launch time, by using LD_PRELOAD. @@ -181,7 +181,7 @@ example](https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/test/s from the [microsoft/onnxruntime](https://github.com/microsoft/onnxruntime) repository and your ONNXRuntime custom operations are compiled into -libonnxcustom.so, adding the following to the model configuraion of +libonnxcustom.so, adding the following to the model configuration of your model makes those operations available to that specific ONNX model. diff --git a/docs/user_guide/decoupled_models.md b/docs/user_guide/decoupled_models.md index 4f5c70d3e2..fbe6f4c298 100644 --- a/docs/user_guide/decoupled_models.md +++ b/docs/user_guide/decoupled_models.md @@ -56,7 +56,7 @@ TRITONBACKEND_ModelInstanceExecute until that instance is ready to handle another set of requests. If not designed properly the backend can be easily over-subscribed. This can also cause under-utilization of features like [Dynamic Batching](model_configuration.md#dynamic-batcher) -as it leads to eager batching. +as it leads to eager batching. ### Python model using Python Backend @@ -91,20 +91,20 @@ for more details. The [decoupled_test.py](../../qa/L0_decoupled/decoupled_test.p how the gRPC streaming can be used to infer decoupled models. If using [Triton's in-process C API](../customization_guide/inference_protocols.md#in-process-triton-server-api), -your application should be cognizant that the callback function you registered with +your application should be cognizant that the callback function you registered with `TRITONSERVER_InferenceRequestSetResponseCallback` can be invoked any number of times, each time with a new response. You can take a look at [grpc_server.cc](https://github.com/triton-inference-server/server/blob/main/src/grpc/grpc_server.cc) ### Knowing When a Decoupled Inference Request is Complete An inference request is considered complete when a response containing the -`TRITONSERVER_RESPONSE_COMPLETE_FINAL` flag is received from a model/backend. +`TRITONSERVER_RESPONSE_COMPLETE_FINAL` flag is received from a model/backend. 1. Client applications using streaming GRPC can access this information by checking the response parameters for the `"triton_final_response"` parameter. Decoupled models may not send a response for each request depending on how the model/backend is designed. 
In these cases where no response is sent by - the backend, the streaming GRPC client can opt-in to receive an empty final + the backend, the streaming GRPC client can opt-in to receive an empty final response for each request. By default, empty final responses are not sent to save on network traffic. diff --git a/docs/user_guide/faq.md b/docs/user_guide/faq.md index 455692dbb3..518f2cc161 100644 --- a/docs/user_guide/faq.md +++ b/docs/user_guide/faq.md @@ -70,7 +70,7 @@ documentation and using [grpc_service.proto](https://github.com/triton-inference-server/common/blob/main/protobuf/grpc_service.proto) you can generate language bindings for all the languages supported by GRPC. We provide three examples of this for -[Go](https://github.com/triton-inference-server/client/blob/main/src/grpc_generated/go), +[Go](https://github.com/triton-inference-server/client/blob/main/src/grpc_generated/go), [Python](https://github.com/triton-inference-server/client/blob/main/src/python/examples/grpc_client.py) and [Java](https://github.com/triton-inference-server/client/blob/main/src/grpc_generated/java). @@ -154,7 +154,7 @@ available Triton instances. ## If the server segfaults, how can I debug it? -The NGC build is a Release build and does not contain Debug symbols. +The NGC build is a Release build and does not contain Debug symbols. The build.py as well defaults to a Release build. Refer to the instructions in [build.md](../customization_guide/build.md#building-with-debug-symbols) to create a Debug build of Triton. This will help find the cause of the segmentation fault when diff --git a/docs/user_guide/jetson.md b/docs/user_guide/jetson.md index b5e1dcf46d..79e97f5166 100644 --- a/docs/user_guide/jetson.md +++ b/docs/user_guide/jetson.md @@ -144,7 +144,7 @@ apt update && apt install -y gpg wget && \ echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | \ tee /etc/apt/sources.list.d/kitware.list >/dev/null && \ apt-get update && \ - apt-get install -y --no-install-recommends cmake cmake-data + apt-get install -y --no-install-recommends cmake cmake-data ``` ### Runtime Dependencies for Triton @@ -178,7 +178,7 @@ pip3 install --upgrade wheel setuptools && \ pip3 install --upgrade grpcio-tools numpy attrdict pillow ``` -The PyTorch runtime depenencies are the same as the build dependencies listed above. +The PyTorch runtime dependencies are the same as the build dependencies listed above. ### Usage diff --git a/docs/user_guide/metrics.md b/docs/user_guide/metrics.md index 7f84e30706..6e4f01bcd2 100644 --- a/docs/user_guide/metrics.md +++ b/docs/user_guide/metrics.md @@ -45,13 +45,13 @@ all metric reporting, while the `--allow-gpu-metrics=false` and metrics respectively. The `--metrics-port` option can be used to select a different port. By default, -Triton reuses the `--http-address` option for the metrics endpoint and binds the +Triton reuses the `--http-address` option for the metrics endpoint and binds the http and metrics endpoints to the same specific address when http service is enabled. If http service is not enabled, the metric address will bind to `0.0.0.0` by default. To uniquely specify the metric endpoint, `--metrics-address` option can be used. See the `tritonserver --help` output for more info on these CLI options. -To change the interval at whichs metrics are polled/updated, see the `--metrics-interval-ms` flag. Metrics that are updated "Per Request" are unaffected by this interval setting. 
This interval only applies to metrics that are designated as "Per Interval" in the tables of each section below: +To change the interval at which's metrics are polled/updated, see the `--metrics-interval-ms` flag. Metrics that are updated "Per Request" are unaffected by this interval setting. This interval only applies to metrics that are designated as "Per Interval" in the tables of each section below: - [Inference Request Metrics](#inference-request-metrics) - [GPU Metrics](#gpu-metrics) @@ -105,7 +105,7 @@ that are published through the `--metrics-config` CLI options. #### Counters -By default, the following +By default, the following [Counter](https://prometheus.io/docs/concepts/metric_types/#counter) metrics are used for latencies: @@ -129,7 +129,7 @@ To disable these metrics specifically, you can set `--metrics-config counter_lat To get configurable quantiles over a sliding time window, Triton supports a set a [Summary](https://prometheus.io/docs/concepts/metric_types/#summary) metrics for latencies as well. These metrics are disabled by default, but can -be enabled by setting `--metrics-config summary_latencies=true`. +be enabled by setting `--metrics-config summary_latencies=true`. For more information on how the quantiles are calculated, see [this explanation](https://grafana.com/blog/2022/03/01/how-summary-metrics-work-in-prometheus/). @@ -146,7 +146,7 @@ The following summary metrics are available: Each summary above is actually composed of several sub-metrics. For each metric, there is a set of `quantile` metrics tracking the latency for each -quantile. Additionaly, there are `_count` and `_sum` metrics that aggregate +quantile. Additionally, there are `_count` and `_sum` metrics that aggregate the count and observed values for each. For example, see the following information exposed by the Inference Queue Summary metrics: ``` @@ -187,8 +187,8 @@ To better understand the setting of error values for computing each quantile, se ## GPU Metrics -GPU metrics are collected through the use of [DCGM](https://developer.nvidia.com/dcgm). -Collection of GPU metrics can be toggled with the `--allow-gpu-metrics` CLI flag. +GPU metrics are collected through the use of [DCGM](https://developer.nvidia.com/dcgm). +Collection of GPU metrics can be toggled with the `--allow-gpu-metrics` CLI flag. If building Triton locally, the `TRITON_ENABLE_METRICS_GPU` CMake build flag can be used to toggle building the relevant code entirely. |Category |Metric |Metric Name |Description |Granularity|Frequency | @@ -203,7 +203,7 @@ If building Triton locally, the `TRITON_ENABLE_METRICS_GPU` CMake build flag can ## CPU Metrics -Collection of CPU metrics can be toggled with the `--allow-cpu-metrics` CLI flag. +Collection of CPU metrics can be toggled with the `--allow-cpu-metrics` CLI flag. If building Triton locally, the `TRITON_ENABLE_METRICS_CPU` CMake build flag can be used to toggle building the relevant code entirely. > **Note** @@ -225,15 +225,15 @@ Cache metrics can be reported in two ways: by Triton directly, such as the cache hit/miss counts and durations described below. -2. As of 23.03, additional cache metrics may be reported depending on the -[cache implementation](response_cache.md#cache-implementations) +2. As of 23.03, additional cache metrics may be reported depending on the +[cache implementation](response_cache.md#cache-implementations) being used through Triton's [Metrics API](#custom-metrics). 
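Putting the flags from this section together, here is a hedged sketch of enabling summary latency metrics, disabling GPU metrics, and scraping the endpoint (the default metrics port of `8002` and the `nv_inference` metric-name prefix are assumptions here; adjust with `--metrics-address`/`--metrics-port` as needed):

```bash
# Start Triton with summary latencies enabled and GPU metrics disabled.
tritonserver --model-repository=/models \
    --metrics-config summary_latencies=true \
    --allow-gpu-metrics=false &

# Once the server is up, scrape the Prometheus text endpoint and keep
# the inference-related metrics.
curl -s localhost:8002/metrics | grep "^nv_inference"
```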
### Triton-reported Response Cache Metrics -Compute latency metrics in the -[Inference Request Metrics table](#inference-request-metrics) above are -calculated for the time spent in model inference backends. If the response +Compute latency metrics in the +[Inference Request Metrics table](#inference-request-metrics) above are +calculated for the time spent in model inference backends. If the response cache is enabled for a given model (see [Response Cache](response_cache.md) docs for more info), total inference times may be affected by response cache lookup times. @@ -243,7 +243,7 @@ response, and "Compute Input Time" / "Compute Time" / "Compute Output Time" are not recorded. On cache misses, "Cache Miss Time" indicates the time spent looking up -the request hash and inserting the computed output tensor data into the cache. +the request hash and inserting the computed output tensor data into the cache. Otherwise, "Compute Input Time" / "Compute Time" / "Compute Output Time" will be recorded as usual. @@ -271,7 +271,7 @@ custom metrics with the existing Triton metrics endpoint. The user takes the ownership of the custom metrics created through the APIs and must manage their lifetime following the API documentation. -The +The [identity_backend](https://github.com/triton-inference-server/identity_backend/blob/main/README.md#custom-metric-example) demonstrates a practical example of adding a custom metric to a backend. diff --git a/docs/user_guide/model_analyzer.md b/docs/user_guide/model_analyzer.md index bc6c67fc8b..663a8a277a 100644 --- a/docs/user_guide/model_analyzer.md +++ b/docs/user_guide/model_analyzer.md @@ -36,7 +36,7 @@ utilization. The Model Analyzer is specifically useful for characterizing the GPU memory requirements for your model under different batching and model instance configurations. Once you have this GPU memory usage information you can more intelligently decide on how to combine multiple models on the same GPU -while remaining within the memory capacity of the GPU. +while remaining within the memory capacity of the GPU. For more detailed examples and explanations of using Model Analyzer, see: - [Model Analyzer Conceptual Guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_3-optimizing_triton_configuration) diff --git a/docs/user_guide/model_configuration.md b/docs/user_guide/model_configuration.md index 8e4f53844e..9e8ba6e5a0 100644 --- a/docs/user_guide/model_configuration.md +++ b/docs/user_guide/model_configuration.md @@ -28,10 +28,10 @@ # Model Configuration -**Is this your first time writing a config file?** Check out +**Is this your first time writing a config file?** Check out [this guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_1-model_deployment#model-configuration) - or this -[example](https://github.com/triton-inference-server/tutorials/tree/main/HuggingFace#examples)! + or this +[example](https://github.com/triton-inference-server/tutorials/tree/main/HuggingFace#examples)! Each model in a [model repository](model_repository.md) must include a model configuration that provides required and optional information @@ -39,7 +39,7 @@ about the model. Typically, this configuration is provided in a config.pbtxt file specified as [ModelConfig protobuf](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto). 
In some cases, discussed in [Auto-Generated Model -Configuraton](#auto-generated-model-configuration), the model +Configuration](#auto-generated-model-configuration), the model configuration can be generated automatically by Triton and so does not need to be provided explicitly. @@ -135,7 +135,7 @@ expected by the model. #### Special Conventions for PyTorch Backend -**Naming Convention:** +**Naming Convention:** Due to the absence of sufficient metadata for inputs/outputs in TorchScript model files, the "name" attribute of inputs/outputs in the configuration must @@ -147,7 +147,7 @@ the forward function in the model's definition. For example, if the forward function for the Torchscript model was defined as `forward(self, input0, input1)`, the first and second inputs should be named -"input0" and "input1" respectively. +"input0" and "input1" respectively. 2. `__`: Where \ can be any string and \ is an integer index that refers to the position of the corresponding input/output. @@ -158,9 +158,9 @@ can be named "OUTPUT__0" and "OUTPUT__1" respectively. 3. If all inputs (or outputs) do not follow the same naming convention, then we enforce strict ordering from the model configuration i.e. we assume the order of -inputs (or outputs) in the configuartion is the true ordering of these inputs. +inputs (or outputs) in the configuration is the true ordering of these inputs. -***Dictionary of Tensors as Input:*** +***Dictionary of Tensors as Input:*** The PyTorch backend supports passing of inputs to the model in the form of a Dictionary of Tensors. This is only supported when there is a *single* input to @@ -290,7 +290,7 @@ function can be implemented in Python backend to provide and [`output`](#inputs-and-outputs) properties using `set_max_batch_size`, `add_input`, and `add_output` functions. These properties will allow Triton to load the Python model with [Minimal Model Configuration](#minimal-model-configuration) -in absence of a configuration file. +in absence of a configuration file. All other model types *must* provide a model configuration file. When developing a custom backend, you can populate required settings @@ -298,7 +298,7 @@ in the configuration and call `TRITONBACKEND_ModelSetConfig` API to update completed configuration with Triton core. You can take a look at [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend) and [Onnxruntime](https://github.com/triton-inference-server/onnxruntime_backend) -backends as examples of how to acheive this. Currently, only +backends as examples of how to achieve this. Currently, only [inputs, outputs](#inputs-and-outputs), [max_batch_size](#maximum-batch-size) and [dynamic batching](#dynamic-batcher) settings can be populated by backend. For custom backends, your config.pbtxt file must @@ -323,25 +323,25 @@ config.pbtxt file. ### Default Max Batch Size and Dynamic Batcher -When a model is using the auto-complete feature, a default maximum -batch size may be set by using the `--backend-config=default-max-batch-size=` +When a model is using the auto-complete feature, a default maximum +batch size may be set by using the `--backend-config=default-max-batch-size=` command line argument. This allows all models which are capable of batching and which make use of [Auto Generated Model Configuration](#auto-generated-model-configuration) -to have a default maximum batch size. This value is set to 4 by +to have a default maximum batch size. This value is set to 4 by default. 
Backend developers may make use of this default-max-batch-size by obtaining it from the TRITONBACKEND_BackendConfig api. Currently, the -following backends which utilize these default batch values and turn on +following backends which utilize these default batch values and turn on dynamic batching in their generated model configurations are: 1. [TensorFlow backend](https://github.com/triton-inference-server/tensorflow_backend) 2. [Onnxruntime backend](https://github.com/triton-inference-server/onnxruntime_backend) 3. [TensorRT backend](https://github.com/triton-inference-server/tensorrt_backend) 1. TensorRT models store the maximum batch size explicitly and do not make use - of the default-max-batch-size parameter. However, if max_batch_size > 1 + of the default-max-batch-size parameter. However, if max_batch_size > 1 and no [scheduler](model_configuration.md#scheduling-and-batching) is provided, the dynamic batch scheduler will be enabled. - -If a value greater than 1 for the maximum batch size is set for the + +If a value greater than 1 for the maximum batch size is set for the model, the [dynamic_batching](#dynamic-batcher) config will be set if no scheduler is provided in the configuration file. @@ -731,21 +731,21 @@ requirements and run on the same device as them. [Ensemble models](architecture.md#ensemble-models) are an abstraction Triton uses to execute a user-defined pipeline of models. -Since there is no physical instance associated with an ensemble model, the +Since there is no physical instance associated with an ensemble model, the `instance_group` field can not be specified for it. -However, each composing model that makes up an ensemble can specify +However, each composing model that makes up an ensemble can specify `instance_group` in its config file and individually support parallel execution as described above when the ensemble receives multiple requests. ## CUDA Compute Capability -Similar to the `default_model_filename` field, you can optionally specify the +Similar to the `default_model_filename` field, you can optionally specify the `cc_model_filenames` field to map the GPU's -[CUDA Compute Capability](https://developer.nvidia.com/cuda-gpus) -to a correspoding model filename at model load time. This is particularly -useful for TensorRT models, since they are generally tied to a specific -compute capability. +[CUDA Compute Capability](https://developer.nvidia.com/cuda-gpus) +to a corresponding model filename at model load time. This is particularly +useful for TensorRT models, since they are generally tied to a specific +compute capability. ``` cc_model_filenames [ @@ -798,7 +798,7 @@ configuration. These settings control the preferred size(s) of the dynamically created batches, the maximum time that requests can be delayed in the scheduler to allow other requests to join the dynamic batch, and queue properties such a queue size, priorities, and -time-outs. Refer to +time-outs. Refer to [this guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_2-improving_resource_utilization#what-is-dynamic-batching) for a more detailed example of dynamic batching. @@ -849,7 +849,7 @@ dynamic batcher should attempt to create. For most models, [Recommended Configuration Process](#recommended-configuration-process). An exception is TensorRT models that specify multiple optimization profiles for different batch -sizes. In this case, bacause some optimization profiles may give +sizes. 
In this case, because some optimization profiles may give significant performance improvement compared to others, it may make sense to use *preferred_batch_size* for the batch sizes supported by those higher-performance optimization profiles. @@ -942,10 +942,10 @@ timeout. #### Custom Batching You can set custom batching rules that work _in addition to_ the specified behavior of the dynamic batcher. -To do so, you would implement five functions in [tritonbackend.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonbackend.h) +To do so, you would implement five functions in [tritonbackend.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonbackend.h) and create a shared library. These functions are described below. -| Function | Description| +| Function | Description| | :-- | :-- | | TRITONBACKEND_ModelBatchIncludeRequest | Determines whether a request should be included in the current batch | | TRITONBACKEND_ModelBatchInitialize | Initializes a record-keeping data structure for a new batch | @@ -953,10 +953,10 @@ and create a shared library. These functions are described below. | TRITONBACKEND_ModelBatcherInitialize | Initializes a read-only data structure for use with all batches | | TRITONBACKEND_ModelBatcherFinalize | Deallocates the read-only data structure after the model is unloaded | -The path to the shared library can be passed into the model configuration via the parameter -`TRITON_BATCH_STRATEGY_PATH`. If not provided, the dynamic batcher will look for a custom -batching strategy named batchstrategy.so in the model version, model, and backend directories, -in that order. If found, it will load it. This lets you easily share a custom batching strategy +The path to the shared library can be passed into the model configuration via the parameter +`TRITON_BATCH_STRATEGY_PATH`. If not provided, the dynamic batcher will look for a custom +batching strategy named batchstrategy.so in the model version, model, and backend directories, +in that order. If found, it will load it. This lets you easily share a custom batching strategy among all models using the same backend. For a tutorial of how to create and use a custom batching library, please see the @@ -1036,7 +1036,7 @@ for examples on specifying different variants of warmup samples. ## Response Cache The model configuration `response_cache` section has an `enable` boolean used to -enable the Response Cache for this model. +enable the Response Cache for this model. ``` response_cache { @@ -1045,6 +1045,6 @@ response_cache { ``` In addition to enabling the cache in the model config, a `--cache-config` must -be specified when starting the server to enable caching on the server-side. See +be specified when starting the server to enable caching on the server-side. See the [Response Cache](response_cache.md) doc for more details on enabling server-side caching. diff --git a/docs/user_guide/model_management.md b/docs/user_guide/model_management.md index ae1c24da20..dc323a087c 100644 --- a/docs/user_guide/model_management.md +++ b/docs/user_guide/model_management.md @@ -55,8 +55,8 @@ Repository](#modifying-the-model-repository). ## Model Control Mode EXPLICIT At startup, Triton loads only those models specified explicitly with the -`--load-model` command-line option. To load ALL models at startup, specify -`--load-model=*` as the ONLY `--load-model` argument. Specifying +`--load-model` command-line option. 
To load ALL models at startup, specify +`--load-model=*` as the ONLY `--load-model` argument. Specifying `--load-model=*` in conjunction with another `--load-model` argument will result in error. If `--load-model` is not specified then no models are loaded at startup. Models that Triton is not able to load will be marked as @@ -226,7 +226,7 @@ configuration, so its presence in the model directory may be detected as a new f and cause the model to fully reload when only an update is expected. * If a sequence model is updated with in-flight sequence(s), Triton does not -guarentee any remaining request(s) from the in-flight sequence(s) will be routed +guarantee any remaining request(s) from the in-flight sequence(s) will be routed to the same model instance for processing. It is currently the responsibility of the user to ensure any in-flight sequence(s) is complete before updating a sequence model. @@ -239,7 +239,7 @@ performance requirements, the optimal amount of resources dedicated to loading models may differ. Triton exposes a `--model-load-thread-count` option to configure the number of threads dedicated to loading models, which defaults to 4. -To set this parameter with the C API, refer to -`TRITONSERVER_ServerOptionsSetModelLoadThreadCount` in +To set this parameter with the C API, refer to +`TRITONSERVER_ServerOptionsSetModelLoadThreadCount` in [tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h). diff --git a/docs/user_guide/model_repository.md b/docs/user_guide/model_repository.md index a16633b75e..a96a1fb768 100644 --- a/docs/user_guide/model_repository.md +++ b/docs/user_guide/model_repository.md @@ -28,9 +28,9 @@ # Model Repository -**Is this your first time setting up a model repository?** Check out +**Is this your first time setting up a model repository?** Check out [these tutorials](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_1-model_deployment#setting-up-the-model-repository) - to begin your Triton journey! + to begin your Triton journey! The Triton Inference Server serves models from one or more model repositories that are specified when the server is started. While @@ -80,7 +80,7 @@ corresponding model. The config.pbtxt file describes the [model configuration](model_configuration.md) for the model. For some models, config.pbtxt is required while for others it is optional. See [Auto-Generated Model -Configuration](model_configuration.md#auto-generated-model-configuration) +Configuration](model_configuration.md#auto-generated-model-configuration) for more information. Each directory must have at least one numeric @@ -126,7 +126,7 @@ environment variable should be set and contains the location of a credential JSON file. If no credential is provided, Triton will use credentials from the [attached service account](https://cloud.google.com/docs/authentication/application-default-credentials#attached-sa) providing a value for the -[Authorization HTTP header](https://googleapis.dev/cpp/google-cloud-storage/1.42.0/classgoogle_1_1cloud_1_1storage_1_1oauth2_1_1ComputeEngineCredentials.html#a8c3a5d405366523e2f4df06554f0a676) +[Authorization HTTP header](https://googleapis.dev/cpp/google-cloud-storage/1.42.0/classgoogle_1_1cloud_1_1storage_1_1oauth2_1_1ComputeEngineCredentials.html#a8c3a5d405366523e2f4df06554f0a676) can be obtained. If not obtainable, anonymous credential will be used. 
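For example, a minimal sketch of the credential flow described above (the bucket name and key path are placeholders, and the `gs://` repository path is assumed to follow the same pattern as the other cloud-storage paths in this document):

```bash
# Point Triton at a GCS-hosted model repository using an explicit
# service-account key file.
export GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
tritonserver --model-repository=gs://my-bucket/path/to/model/repository
```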
To access buckets with anonymous credential (also known as public bucket), the @@ -159,9 +159,9 @@ subsequently the bucket path. $ tritonserver --model-repository=s3://host:port/bucket/path/to/model/repository ... ``` -By default, Triton uses HTTP to communicate with your instance of S3. If +By default, Triton uses HTTP to communicate with your instance of S3. If your instance of S3 supports HTTPS and you wish for Triton to use the HTTPS -protocol to communicate with it, you can specify the same in the model +protocol to communicate with it, you can specify the same in the model repository path by prefixing the host name with https://. ```bash @@ -201,8 +201,8 @@ $ export AZURE_STORAGE_KEY=$(az storage account keys list -n $AZURE_STORAGE_ACCO *This feature is currently in beta and may be subject to change.* -To group the credentials into a single file for Triton, you may set the -`TRITON_CLOUD_CREDENTIAL_PATH` environment variable to a path pointing to a +To group the credentials into a single file for Triton, you may set the +`TRITON_CLOUD_CREDENTIAL_PATH` environment variable to a path pointing to a JSON file of the following format, residing in the local file system. ``` @@ -254,7 +254,7 @@ This feature is intended for use-cases which multiple credentials are needed for each cloud storage provider. Be sure to replace any credential paths/keys with the actual paths/keys from the example above. -If the `TRITON_CLOUD_CREDENTIAL_PATH` environment variable is not set, the +If the `TRITON_CLOUD_CREDENTIAL_PATH` environment variable is not set, the [Cloud Storage with Environment variables](#cloud-storage-with-environment-variables) will be used. diff --git a/docs/user_guide/optimization.md b/docs/user_guide/optimization.md index 7d2d9c61aa..f842198a90 100644 --- a/docs/user_guide/optimization.md +++ b/docs/user_guide/optimization.md @@ -81,11 +81,11 @@ latency. For most models, the Triton feature that provides the largest performance improvement is [dynamic -batching](model_configuration.md#dynamic-batcher). +batching](model_configuration.md#dynamic-batcher). [This example](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_2-improving_resource_utilization#dynamic-batching--concurrent-model-execution) sheds more light on conceptual details. If your model does not support batching then you can skip ahead to [Model -Instances](#model-instances). +Instances](#model-instances). ### Dynamic Batcher @@ -131,8 +131,8 @@ typically applies when perf_analyzer is running on the same system as Triton. The first rule is that for minimum latency set the request concurrency to 1 and disable the dynamic batcher and use only 1 [model instance](#model-instances). The second rule is that for maximum -throughput set the request concurrency to be -`2 * * `. We will discuss model +throughput set the request concurrency to be +`2 * * `. We will discuss model instances [below](#model-instances), for now we are working with one model instance. So for maximum-batch-size 4 we want to run perf_analyzer with request concurrency of `2 * 4 * 1 = 8`. @@ -219,7 +219,7 @@ settings that best satisfy your throughput and latency requirements. Triton has several optimization settings that apply to only a subset of the supported model frameworks. These optimization settings are controlled by the model configuration [optimization -policy](model_configuration.md#optimization-policy). Visit +policy](model_configuration.md#optimization-policy). 
Visit [this guide](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_4-inference_acceleration) for an end to end discussion. diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md index 9764efcc23..877e4ecfa0 100644 --- a/docs/user_guide/performance_tuning.md +++ b/docs/user_guide/performance_tuning.md @@ -37,7 +37,7 @@ for most use cases. For those who wish to jump right in, skip to the [end-to-end example](#end-to-end-example). -For additional material, see the +For additional material, see the [Triton Conceptual Guide tutorial](https://github.com/triton-inference-server/tutorials/tree/main/Conceptual_Guide/Part_4-inference_acceleration). ## Overview @@ -187,7 +187,7 @@ other frameworks. mkdir -p ./models/densenet_onnx/1 # Download model and place it in model repository -wget -O models/densenet_onnx/1/model.onnx +wget -O models/densenet_onnx/1/model.onnx https://contentmamluswest001.blob.core.windows.net/content/14b2744cf8d6418c87ffddc3f3127242/9502630827244d60a1214f250e3bbca7/08aed7327d694b8dbaee2c97b8d0fcba/densenet121-1.2.onnx ``` @@ -318,7 +318,7 @@ SERVER_PID=`ps | grep tritonserver | awk '{ printf $1 }'` kill ${SERVER_PID} # Install model analyzer -pip install --upgrade pip +pip install --upgrade pip pip install triton-model-analyzer wkhtmltopdf # Profile the model using local (default) mode @@ -369,7 +369,7 @@ your models for your use case. 6. Extract optimal config from Model Analyzer results -In our example above, `densenet_onnx_config_3` was the optimal configuration. +In our example above, `densenet_onnx_config_3` was the optimal configuration. So let's extract that `config.pbtxt` and put it back in our model repository for future use. ```bash diff --git a/docs/user_guide/rate_limiter.md b/docs/user_guide/rate_limiter.md index 2e38327042..69b94fd8b8 100644 --- a/docs/user_guide/rate_limiter.md +++ b/docs/user_guide/rate_limiter.md @@ -42,9 +42,9 @@ frameworks dynamically allocate memory. Running all such models simultaneously may lead to system going out-of-memory. Rate limiter allows to postpone the inference execution on some -model instances such that not all of them runs simultaneously. +model instances such that not all of them runs simultaneously. The model priorities are used to decide which model instance -to schedule next. +to schedule next. ## Using Rate Limiter diff --git a/docs/user_guide/response_cache.md b/docs/user_guide/response_cache.md index b526a3c84e..fbc1233f3b 100644 --- a/docs/user_guide/response_cache.md +++ b/docs/user_guide/response_cache.md @@ -47,18 +47,18 @@ used for the request. When this happens there is no need for Triton to execute the model to produce the inference result. If the hash is not found in the cache, Triton executes the model to produce the inference result, and then records that result in the cache so that subsequent inference requests can -(re)use those results. +(re)use those results. ## Usage In order for caching to be used on a given model, it must be enabled -on both the server-side, and in the model's +on both the server-side, and in the model's [model config](model_configuration.md#response-cache). See the following sections below for more details. ### Enable Caching on Server-side -The response cache is enabled on the server-side by specifying a +The response cache is enabled on the server-side by specifying a `` and corresponding configuration when starting the Triton server. 
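For illustration, here is a minimal client-side sketch of exercising the response cache once it is enabled. It assumes a server started with `--cache-config local,size=1048576`, an `identity_fp32` model whose config.pbtxt contains `response_cache { enable: true }`, and `INPUT0`/`OUTPUT0` tensor names; the model name, cache size, input shape, and tensor names are illustrative assumptions.

```python
import numpy as np
import tritonclient.http as httpclient

# Connect to a locally running tritonserver (HTTP endpoint).
client = httpclient.InferenceServerClient("localhost:8000")

# Use a fixed input so both requests hash to the same cache key.
input_data = np.ones([1, 16], dtype=np.float32)
inputs = [httpclient.InferInput("INPUT0", input_data.shape, "FP32")]
inputs[0].set_data_from_numpy(input_data)

# The first call executes the model and records the result in the cache;
# the second, identical call can then be served from the cache without
# re-executing the model.
for _ in range(2):
    result = client.infer("identity_fp32", inputs)
    print(result.as_numpy("OUTPUT0"))
```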
@@ -75,10 +75,10 @@ This allows users to enable/disable caching globally on server startup. ### Enable Caching for a Model -**By default, no model uses response caching even if the response cache -is enabled globally with the `--cache-config` flag.** +**By default, no model uses response caching even if the response cache +is enabled globally with the `--cache-config` flag.** -For a given model to use response caching, the model must also have +For a given model to use response caching, the model must also have response caching enabled in its model configuration: ``` # config.pbtxt @@ -90,7 +90,7 @@ response_cache { This allows users to enable/disable caching for specific models. -For more information on enabling the response cache for each model, see the +For more information on enabling the response cache for each model, see the [model configuration docs](model_configuration.md#response-cache). ### Cache Implementations @@ -100,7 +100,7 @@ Starting in the 23.03 release, Triton has a set of that are used to communicate with a cache implementation of the user's choice. A cache implementation is a shared library that implements the required -TRITONCACHE APIs and is dynamically loaded on server startup, if enabled. +TRITONCACHE APIs and is dynamically loaded on server startup, if enabled. Triton's most recent [tritonserver release containers](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver) @@ -108,7 +108,7 @@ come with the following cache implementations out of the box: - [local](https://github.com/triton-inference-server/local_cache): `/opt/tritonserver/caches/local/libtritoncache_local.so` - [redis](https://github.com/triton-inference-server/redis_cache): `/opt/tritonserver/caches/redis/libtritoncache_redis.so` -With these TRITONCACHE APIs, `tritonserver` exposes a new `--cache-config` +With these TRITONCACHE APIs, `tritonserver` exposes a new `--cache-config` CLI flag that gives the user flexible customization of which cache implementation to use, and how to configure it. Similar to the `--backend-config` flag, the expected format is `--cache-config ,=` and may @@ -122,31 +122,31 @@ internally before the 23.03 release. For more implementation specific details, see the [local cache implementation](https://github.com/triton-inference-server/local_cache). -When `--cache-config local,size=SIZE` is specified with a non-zero `SIZE`, +When `--cache-config local,size=SIZE` is specified with a non-zero `SIZE`, Triton allocates the requested size in CPU memory and **shares the -cache across all inference requests and across all models**. +cache across all inference requests and across all models**. #### Redis Cache The `redis` cache implementation exposes the ability for Triton to communicate with a Redis server for caching. The `redis_cache` implementation is essentially -a Redis client that acts as an intermediary between Triton and Redis. +a Redis client that acts as an intermediary between Triton and Redis. To list a few benefits of the `redis` cache compared to the `local` cache in the context of Triton: -- The Redis server can be hosted remotely as long as it is accesible by Triton, - so it is not tied directly to the Triton process lifetime. +- The Redis server can be hosted remotely as long as it is accessible by Triton, + so it is not tied directly to the Triton process lifetime. - This means Triton can be restarted and still have access to previously cached entries. 
- This also means that Triton doesn't have to compete with the cache for memory/resource usage. - Multiple Triton instances can share a cache by configuring each Triton instance to communicate with the same Redis server. - The Redis server can be updated/restarted independently of Triton, and - Triton will fallback to operating as it would with no cache access during + Triton will fallback to operating as it would with no cache access during any Redis server downtime, and log appropriate errors. -In general, the Redis server can be configured/deployed as needed for your use -case, and Triton's `redis` cache will simply act as a client of your Redis -deployment. The [Redis docs](https://redis.io/docs/) should be consulted for +In general, the Redis server can be configured/deployed as needed for your use +case, and Triton's `redis` cache will simply act as a client of your Redis +deployment. The [Redis docs](https://redis.io/docs/) should be consulted for questions and details about configuring the Redis server. For Triton-specific `redis` cache implementation details/configuration, see the @@ -157,7 +157,7 @@ For Triton-specific `redis` cache implementation details/configuration, see the With the TRITONCACHE API interface, it is now possible for users to implement their own cache to suit any use-case specific needs. To see the required interface that must be implemented by a cache -developer, see the +developer, see the [TRITONCACHE API header](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritoncache.h). The `local` or `redis` cache implementations may be used as reference. @@ -165,22 +165,22 @@ Upon successfully developing and building a custom cache, the resulting shared library (ex: `libtritoncache_.so`) must be placed in the cache directory similar to where the `local` and `redis` cache implementations live. By default, this directory is `/opt/tritonserver/caches`, but a custom directory may be -specified with `--cache-dir` as needed. +specified with `--cache-dir` as needed. To put this example together, if the custom cache were named "custom" -(this name is arbitrary), by default Triton would expect to find the +(this name is arbitrary), by default Triton would expect to find the cache implementation at `/opt/tritonserver/caches/custom/libtritoncache_custom.so`. ## Deprecation Notes > **Note** > Prior to 23.03, enabling the `local` cache used to be done through setting a non-zero size -> (in bytes) when Triton was launched using the `--response-cache-byte-size` flag. +> (in bytes) when Triton was launched using the `--response-cache-byte-size` flag. > -> Starting in 23.03, the `--response-cache-byte-size` flag is now deprecated and -> `--cache-config` should be used instead. For backwards compatibility, -> `--response-cache-byte-size` will continue to function under the hood by being -> converted to the corresponding `--cache-config` argument, but it will default +> Starting in 23.03, the `--response-cache-byte-size` flag is now deprecated and +> `--cache-config` should be used instead. For backwards compatibility, +> `--response-cache-byte-size` will continue to function under the hood by being +> converted to the corresponding `--cache-config` argument, but it will default > to using the `local` cache implementation. It is not possible to choose other > cache implementations using the `--response-cache-byte-size` flag. 
> @@ -190,10 +190,10 @@ cache implementation at `/opt/tritonserver/caches/custom/libtritoncache_custom.s > **Warning** > -> The `local` cache implementation may fail to initialize for very small values -> of `--cache-config local,size=` or `--response-cache-byte-size` -> (ex: less than 1024 bytes) due to internal memory management requirements. -> If you encounter an initialization error for a relatively small cache size, +> The `local` cache implementation may fail to initialize for very small values +> of `--cache-config local,size=` or `--response-cache-byte-size` +> (ex: less than 1024 bytes) due to internal memory management requirements. +> If you encounter an initialization error for a relatively small cache size, > try increasing it. > > Similarly, the size is upper bounded by the available RAM on the system. @@ -202,14 +202,14 @@ cache implementation at `/opt/tritonserver/caches/custom/libtritoncache_custom.s ## Performance -The response cache is intended to be used for use cases where a significant -number of duplicate requests (cache hits) are expected and therefore would +The response cache is intended to be used for use cases where a significant +number of duplicate requests (cache hits) are expected and therefore would benefit from caching. The term "significant" here is subjective to the use case, but a simple interpretation would be to consider the proportion of expected cache hits/misses, as well as the average time spend computing -a response. +a response. -For cases where cache hits are common and computation is expensive, +For cases where cache hits are common and computation is expensive, the cache can significantly improve overall performance. For cases where most requests are unique (cache misses) or the compute is diff --git a/qa/L0_async_work_queue/test.sh b/qa/L0_async_work_queue/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_backend_config/test.sh b/qa/L0_backend_config/test.sh old mode 100644 new mode 100755 index 2bca7fa529..b898735798 --- a/qa/L0_backend_config/test.sh +++ b/qa/L0_backend_config/test.sh @@ -66,7 +66,7 @@ POSITIVE_TEST_ARGS=("--backend-config=tensorflow,default-max-batch-size=5 $COMMO "--backend-config=default-max-batch-size=7 --backend-config=tensorflow,default-max-batch-size=8 $COMMON_ARGS" \ ) -# These integers correspond to the expected default-max-batch-size which gets set +# These integers correspond to the expected default-max-batch-size which gets set # in the POSITIVE_TEST_ARGS POSITIVE_TEST_ANSWERS=(5 6 8) @@ -86,12 +86,12 @@ else RESULT_LOG_LINE=$(grep -a "Adding default backend config setting:" $SERVER_LOG) if [ "$RESULT_LOG_LINE" != "" ]; then - + # Pick out the logged value of the default-max-batch-size which gets passed into model creation RESOLVED_DEFAULT_MAX_BATCH_SIZE=$(awk -v line="$RESULT_LOG_LINE" 'BEGIN {split(line, a, "]"); split(a[2], b, ": "); split(b[2], c, ","); print c[2]}') if [ "$RESOLVED_DEFAULT_MAX_BATCH_SIZE" != "4" ]; then - echo "*** FAILED: Found default-max-batch-size not equal to the expected default-max-batch-size. Expected: default-max-batch-size,4, Found: $RESOLVED_DEFAULT_MAX_BATCH_SIZE \n" + echo "*** FAILED: Found default-max-batch-size not equal to the expected default-max-batch-size. 
Expected: default-max-batch-size,4, Found: $RESOLVED_DEFAULT_MAX_BATCH_SIZE \n" RET=1 fi else @@ -104,7 +104,7 @@ for ((i=0; i < ${#POSITIVE_TEST_ARGS[@]}; i++)); do SERVER_ARGS=${POSITIVE_TEST_ARGS[$i]} SERVER_LOG=$SERVER_LOG_BASE.backend_config_positive_$i.log run_server - + if [ "$SERVER_PID" == "0" ]; then echo -e "*** FAILED: Server failed to start $SERVER\n" RET=1 @@ -115,12 +115,12 @@ for ((i=0; i < ${#POSITIVE_TEST_ARGS[@]}; i++)); do RESULT_LOG_LINE=$(grep -a "Found overwritten default setting:" $SERVER_LOG) if [ "$RESULT_LOG_LINE" != "" ]; then - + # Pick out the logged value of the default-max-batch-size which gets passed into model creation RESOLVED_DEFAULT_MAX_BATCH_SIZE=$(awk -v line="$RESULT_LOG_LINE" 'BEGIN {split(line, a, "]"); split(a[2], b, ": "); split(b[2], c, ","); print c[2]}') if [ "$RESOLVED_DEFAULT_MAX_BATCH_SIZE" != "${POSITIVE_TEST_ANSWERS[$i]}" ]; then - echo "*** FAILED: Found default-max-batch-size not equal to the expected default-max-batch-size. Expected: ${POSITIVE_TEST_ANSWERS[$i]}, Found: $RESOLVED_DEFAULT_MAX_BATCH_SIZE \n" + echo "*** FAILED: Found default-max-batch-size not equal to the expected default-max-batch-size. Expected: ${POSITIVE_TEST_ANSWERS[$i]}, Found: $RESOLVED_DEFAULT_MAX_BATCH_SIZE \n" RET=1 fi else @@ -152,11 +152,11 @@ done # -# Sepcific backend tests -# +# Specific backend tests +# -# While inference server is running, save the -# config of the 'no_config' model to the TRIAL +# While inference server is running, save the +# config of the 'no_config' model to the TRIAL # file. function save_model_config() { CODE=`curl -s -w %{http_code} -o ./$TRIAL.out localhost:8000/v2/models/no_config/config` @@ -192,13 +192,13 @@ else RET=1 fi - # Assert we are also turning on the dynamic_batcher + # Assert we are also turning on the dynamic_batcher DYNAMIC_BATCHING_LOG_LINE=$(grep -a "Starting dynamic-batcher thread" $SERVER_LOG) if [ "$DYNAMIC_BATCHING_LOG_LINE" == "" ]; then echo "*** FAILED: Expected dynamic batching to be set in model config but was not found\n" RET=1 fi - + kill $SERVER_PID wait $SERVER_PID @@ -225,7 +225,7 @@ else RET=1 fi - # Assert batching disabled + # Assert batching disabled if [ "$(grep -a -E '\"dynamic_batching\": \{}' $SERVER_LOG)" != "" ]; then echo "*** FAILED: Found dynamic batching enabled in configuration when none expected.\n" RET=1 @@ -252,7 +252,7 @@ if [ "$SERVER_PID" == "0" ]; then else save_model_config - + # Assert the max-batch-size is the command line value MAX_BATCH_LOG_LINE=$(grep -a "\"max_batch_size\":5" $TRIAL.out) if [ "$MAX_BATCH_LOG_LINE" == "" ]; then @@ -260,13 +260,13 @@ else RET=1 fi - # Assert we are also turning on the dynamic_batcher + # Assert we are also turning on the dynamic_batcher DYNAMIC_BATCHING_LOG_LINE=$(grep -a "Starting dynamic-batcher thread" $SERVER_LOG) if [ "$DYNAMIC_BATCHING_LOG_LINE" == "" ]; then echo "*** FAILED: Expected dynamic batching to be set in model config but was not found\n" RET=1 fi - + kill $SERVER_PID wait $SERVER_PID fi @@ -296,7 +296,7 @@ else RET=1 fi - # Assert batching disabled + # Assert batching disabled if [ "$(grep -a -E '\"dynamic_batching\": \{}' $SERVER_LOG)" != "" ]; then echo "*** FAILED: Found dynamic batching in configuration when none expected.\n" RET=1 @@ -309,17 +309,17 @@ fi # # General backend tests -# +# -# We want to make sure that backend configurations +# We want to make sure that backend configurations # are not lost. 
For this purpose we are using only onnx backend rm -rf ./models/ mkdir -p ./models/no_config/ cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/onnx_float32_float32_float32/1 ./models/no_config/ -# First getting a baseline for the number of default configs -# added during a server set up +# First getting a baseline for the number of default configs +# added during a server set up SERVER_ARGS="$COMMON_ARGS" SERVER_LOG=$SERVER_LOG_BASE.default_configs.log run_server @@ -345,11 +345,11 @@ fi # Now make sure that when setting specific backend configs # default ones are not lost. # Current logic for backend config resolution reads default configs first, -# then specific configs and overrides defaults if needed. -# We would like to make sure that none of configs are lost and -# defaults are properly overriden. +# then specific configs and overrides defaults if needed. +# We would like to make sure that none of configs are lost and +# defaults are properly overridden. # One of defaultconfigs is `min-compute-capability`. This test -# checks if it is properlly overriden. +# checks if it is properlly overridden. MIN_COMPUTE_CAPABILITY=XX SERVER_ARGS="--backend-config=onnxruntime,min-compute-capability=$MIN_COMPUTE_CAPABILITY $COMMON_ARGS" SERVER_LOG=$SERVER_LOG_BASE.global_configs.log diff --git a/qa/L0_backend_fastertransformer/test.sh b/qa/L0_backend_fastertransformer/test.sh old mode 100644 new mode 100755 index 49d444392e..8e5d20271a --- a/qa/L0_backend_fastertransformer/test.sh +++ b/qa/L0_backend_fastertransformer/test.sh @@ -43,7 +43,7 @@ rm -f $SERVER_LOG* $CLIENT_LOG* RET=0 # install dependencies apt-get update && \ - apt-get install -y --no-install-recommends python3 python3-pip python3-protobuf + apt-get install -y --no-install-recommends python3 python3-pip python3-protobuf python3 -m pip install --upgrade pip && \ pip3 install --upgrade numpy diff --git a/qa/L0_backend_identity/identity_test.py b/qa/L0_backend_identity/identity_test.py old mode 100644 new mode 100755 index e9b3465050..ef0634b95c --- a/qa/L0_backend_identity/identity_test.py +++ b/qa/L0_backend_identity/identity_test.py @@ -27,42 +27,45 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse -import numpy as np import sys -import requests as httpreq from builtins import range + +import numpy as np +import requests as httpreq import tritongrpcclient as grpcclient import tritonhttpclient as httpclient from tritonclientutils import np_to_triton_dtype FLAGS = None -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') parser.add_argument( - '-i', - '--protocol', + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-u", "--url", type=str, required=False, help="Inference server URL." + ) + parser.add_argument( + "-i", + "--protocol", type=str, required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') + default="http", + help='Protocol ("http"/"grpc") used to ' + + 'communicate with inference service. 
Default is "http".', + ) FLAGS = parser.parse_args() if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) + print( + 'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol) + ) exit(1) client_util = httpclient if FLAGS.protocol == "http" else grpcclient @@ -77,17 +80,18 @@ model_name = "identity_uint32" request_parallelism = 4 shape = [2, 2] - with client_util.InferenceServerClient(FLAGS.url, - concurrency=request_parallelism, - verbose=FLAGS.verbose) as client: + with client_util.InferenceServerClient( + FLAGS.url, concurrency=request_parallelism, verbose=FLAGS.verbose + ) as client: input_datas = [] requests = [] for i in range(request_parallelism): input_data = (16384 * np.random.randn(*shape)).astype(np.uint32) input_datas.append(input_data) inputs = [ - client_util.InferInput("INPUT0", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + client_util.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) requests.append(client.async_infer(model_name, inputs)) @@ -104,32 +108,44 @@ sys.exit(1) if not np.array_equal(output_data, input_datas[i]): - print("error: expected output {} to match input {}".format( - output_data, input_datas[i])) + print( + "error: expected output {} to match input {}".format( + output_data, input_datas[i] + ) + ) sys.exit(1) # Make sure the requests ran in parallel. stats = client.get_inference_statistics(model_name) - if (len(stats['model_stats']) != - 1) or (stats['model_stats'][0]['name'] != model_name): + if (len(stats["model_stats"]) != 1) or ( + stats["model_stats"][0]["name"] != model_name + ): print("error: expected statistics for {}".format(model_name)) sys.exit(1) - stat = stats['model_stats'][0] - if (stat['inference_count'] != 8) or (stat['execution_count'] != 1): + stat = stats["model_stats"][0] + if (stat["inference_count"] != 8) or (stat["execution_count"] != 1): print( - "error: expected execution_count == 1 and inference_count == 8, got {} and {}" - .format(stat['execution_count'], stat['inference_count'])) + "error: expected execution_count == 1 and inference_count == 8, got {} and {}".format( + stat["execution_count"], stat["inference_count"] + ) + ) sys.exit(1) # Check metrics to make sure they are reported correctly - metrics = httpreq.get('http://localhost:8002/metrics') + metrics = httpreq.get("http://localhost:8002/metrics") print(metrics.text) - success_str = 'nv_inference_request_success{model="identity_uint32",version="1"}' + success_str = ( + 'nv_inference_request_success{model="identity_uint32",version="1"}' + ) infer_count_str = 'nv_inference_count{model="identity_uint32",version="1"}' - infer_exec_str = 'nv_inference_exec_count{model="identity_uint32",version="1"}' - custom_metric_str = 'input_byte_size_counter{model="identity_uint32",version="1"}' + infer_exec_str = ( + 'nv_inference_exec_count{model="identity_uint32",version="1"}' + ) + custom_metric_str = ( + 'input_byte_size_counter{model="identity_uint32",version="1"}' + ) success_val = None infer_count_val = None @@ -137,36 +153,47 @@ custom_metric_val = None for line in metrics.text.splitlines(): if line.startswith(success_str): - success_val = float(line[len(success_str):]) + success_val = float(line[len(success_str) :]) if line.startswith(infer_count_str): - infer_count_val = float(line[len(infer_count_str):]) + infer_count_val = float(line[len(infer_count_str) :]) if 
line.startswith(infer_exec_str): - infer_exec_val = float(line[len(infer_exec_str):]) + infer_exec_val = float(line[len(infer_exec_str) :]) if line.startswith(custom_metric_str): - custom_metric_val = float(line[len(custom_metric_str):]) + custom_metric_val = float(line[len(custom_metric_str) :]) if success_val != 4: - print("error: expected metric {} == 4, got {}".format( - success_str, success_val)) + print( + "error: expected metric {} == 4, got {}".format( + success_str, success_val + ) + ) sys.exit(1) if infer_count_val != 8: - print("error: expected metric {} == 8, got {}".format( - infer_count_str, infer_count_val)) + print( + "error: expected metric {} == 8, got {}".format( + infer_count_str, infer_count_val + ) + ) sys.exit(1) if infer_exec_val != 1: - print("error: expected metric {} == 1, got {}".format( - infer_exec_str, infer_exec_val)) + print( + "error: expected metric {} == 1, got {}".format( + infer_exec_str, infer_exec_val + ) + ) sys.exit(1) if custom_metric_val != 64: - print("error: expected metric {} == 64, got {}".format( - custom_metric_str, custom_metric_val)) + print( + "error: expected metric {} == 64, got {}".format( + custom_metric_str, custom_metric_val + ) + ) sys.exit(1) # Reuse a single client for all sync tests - with client_util.InferenceServerClient(FLAGS.url, - verbose=FLAGS.verbose) as client: + with client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose) as client: for model_name, np_dtype, shape in ( - # yapf: disable + # yapf: disable ("identity_fp32", np.float32, [1, 0]), ("identity_fp32", np.float32, [1, 5]), ("identity_uint32", np.uint32, [4, 0]), @@ -175,22 +202,20 @@ ("identity_nobatch_int8", np.int8, [7]), ("identity_bytes", object, [1, 1]), ("identity_bf16", np.float32, [1, 0]), - ("identity_bf16", np.float32, [1, 5])): + ("identity_bf16", np.float32, [1, 5]) + ): # yapf: enable if np_dtype != object: input_data = (16384 * np.random.randn(*shape)).astype(np_dtype) else: - in0 = (16384 * np.ones(shape, dtype='int')) - in0n = np.array([str(x) for x in in0.reshape(in0.size)], - dtype=object) + in0 = 16384 * np.ones(shape, dtype="int") + in0n = np.array([str(x) for x in in0.reshape(in0.size)], dtype=object) input_data = in0n.reshape(in0.shape) if model_name != "identity_bf16": triton_type = np_to_triton_dtype(input_data.dtype) else: triton_type = "BF16" - inputs = [ - client_util.InferInput("INPUT0", input_data.shape, triton_type) - ] + inputs = [client_util.InferInput("INPUT0", input_data.shape, triton_type)] inputs[0].set_data_from_numpy(input_data) results = client.infer(model_name, inputs) @@ -201,41 +226,47 @@ if np_dtype == object: output_data = np.array( - [str(x, encoding='utf-8') for x in output_data.flatten()], - dtype=object).reshape(output_data.shape) + [str(x, encoding="utf-8") for x in output_data.flatten()], + dtype=object, + ).reshape(output_data.shape) if output_data is None: print("error: expected 'OUTPUT0'") sys.exit(1) if model_name == "identity_bf16": - if (input_data.shape != output_data.shape): + if input_data.shape != output_data.shape: print( - "error: expected output shape {} to match input shape {}" - .format(output_data.shape, input_data.shape)) + "error: expected output shape {} to match input shape {}".format( + output_data.shape, input_data.shape + ) + ) sys.exit(1) for input, output in zip( - np.nditer(input_data, - flags=["refs_ok", "zerosize_ok"], - order='C'), - np.nditer(output_data, - flags=["refs_ok", "zerosize_ok"], - order='C')): + np.nditer(input_data, flags=["refs_ok", "zerosize_ok"], 
order="C"), + np.nditer(output_data, flags=["refs_ok", "zerosize_ok"], order="C"), + ): if input.tobytes()[2:4] != output.tobytes()[2:4]: print( - "error: expected low-order bits of output {} to match low-order bits of input {}" - .format(output, input)) + "error: expected low-order bits of output {} to match low-order bits of input {}".format( + output, input + ) + ) sys.exit(1) - if output.tobytes()[0:2] != b'\x00\x00': + if output.tobytes()[0:2] != b"\x00\x00": print( - "error: expected output {} to have all-zero high-order bits, got {}" - .format(output, - output.tobytes()[0:2])) + "error: expected output {} to have all-zero high-order bits, got {}".format( + output, output.tobytes()[0:2] + ) + ) sys.exit(1) else: if not np.array_equal(output_data, input_data): - print("error: expected output {} to match input {}".format( - output_data, input_data)) + print( + "error: expected output {} to match input {}".format( + output_data, input_data + ) + ) sys.exit(1) # Make sure response parameters are correct @@ -252,8 +283,7 @@ param2 = params["param2"].bool_param if param0 != "an example string parameter": - print( - "error: expected 'param0' == 'an example string parameter'") + print("error: expected 'param0' == 'an example string parameter'") sys.exit(1) if param1 != 42: print("error: expected 'param1' == 42") diff --git a/qa/L0_backend_python/argument_validation/models/argument_validation/1/model.py b/qa/L0_backend_python/argument_validation/models/argument_validation/1/model.py old mode 100644 new mode 100755 index 8669132b3c..bd5fae1afe --- a/qa/L0_backend_python/argument_validation/models/argument_validation/1/model.py +++ b/qa/L0_backend_python/argument_validation/models/argument_validation/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,18 +26,18 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import numpy as np import unittest + +import numpy as np import triton_python_backend_utils as pb_utils class ArgumentValidationTest(unittest.TestCase): - def test_infer_request_args(self): # Dummy arguments used in the tests. 
- inputs = [pb_utils.Tensor('INPUT0', np.asarray([1, 2], dtype=np.int32))] - model_name = 'my_model' - requested_output_names = ['my_output'] + inputs = [pb_utils.Tensor("INPUT0", np.asarray([1, 2], dtype=np.int32))] + model_name = "my_model" + requested_output_names = ["my_output"] # # inputs field validation @@ -46,21 +48,24 @@ def test_infer_request_args(self): pb_utils.InferenceRequest( inputs=[None], model_name=model_name, - requested_output_names=requested_output_names) + requested_output_names=requested_output_names, + ) # Test None object as list of inputs with self.assertRaises(TypeError) as e: pb_utils.InferenceRequest( inputs=None, model_name=model_name, - requested_output_names=requested_output_names) + requested_output_names=requested_output_names, + ) # model_name validation with self.assertRaises(TypeError) as e: pb_utils.InferenceRequest( model_name=None, inputs=inputs, - requested_output_names=requested_output_names) + requested_output_names=requested_output_names, + ) # # Requested output name validations @@ -68,14 +73,14 @@ def test_infer_request_args(self): # Test list of None objects as requested_output_names with self.assertRaises(TypeError) as e: - pb_utils.InferenceRequest(requested_output_names=[None], - inputs=inputs, - model_name=model_name) + pb_utils.InferenceRequest( + requested_output_names=[None], inputs=inputs, model_name=model_name + ) with self.assertRaises(TypeError) as e: - pb_utils.InferenceRequest(requested_output_names=None, - inputs=inputs, - model_name=model_name) + pb_utils.InferenceRequest( + requested_output_names=None, inputs=inputs, model_name=model_name + ) # Other arguments validation @@ -85,7 +90,8 @@ def test_infer_request_args(self): requested_output_names=requested_output_names, inputs=inputs, model_name=model_name, - correleation_id=None) + correleation_id=None, + ) # request_id set to None with self.assertRaises(TypeError) as e: @@ -93,7 +99,8 @@ def test_infer_request_args(self): requested_output_names=requested_output_names, inputs=inputs, model_name=model_name, - request_id=None) + request_id=None, + ) # model_version set to None with self.assertRaises(TypeError) as e: @@ -101,7 +108,8 @@ def test_infer_request_args(self): requested_output_names=requested_output_names, inputs=inputs, model_name=model_name, - model_version=None) + model_version=None, + ) # flags set to None with self.assertRaises(TypeError) as e: @@ -109,17 +117,16 @@ def test_infer_request_args(self): requested_output_names=requested_output_names, inputs=inputs, model_name=model_name, - flags=None) + flags=None, + ) # Empty lists should not raise an exception - pb_utils.InferenceRequest(requested_output_names=[], - inputs=[], - model_name=model_name) + pb_utils.InferenceRequest( + requested_output_names=[], inputs=[], model_name=model_name + ) def test_infer_response_args(self): - outputs = [ - pb_utils.Tensor('OUTPUT0', np.asarray([1, 2], dtype=np.int32)) - ] + outputs = [pb_utils.Tensor("OUTPUT0", np.asarray([1, 2], dtype=np.int32))] # Test list of None object as output tensor with self.assertRaises(pb_utils.TritonModelException) as e: @@ -195,12 +202,15 @@ def execute(self, requests): responses = [] for _ in requests: # Run the unittest and store the results in InferenceResponse. 
- test = unittest.main('model', exit=False) + test = unittest.main("model", exit=False) responses.append( - pb_utils.InferenceResponse([ - pb_utils.Tensor( - 'OUTPUT0', - np.array([test.result.wasSuccessful()], - dtype=np.float16)) - ])) + pb_utils.InferenceResponse( + [ + pb_utils.Tensor( + "OUTPUT0", + np.array([test.result.wasSuccessful()], dtype=np.float16), + ) + ] + ) + ) return responses diff --git a/qa/L0_backend_python/argument_validation/test.sh b/qa/L0_backend_python/argument_validation/test.sh old mode 100644 new mode 100755 index f80ce3e84b..f47abb8485 --- a/qa/L0_backend_python/argument_validation/test.sh +++ b/qa/L0_backend_python/argument_validation/test.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/qa/L0_backend_python/bls/test.sh b/qa/L0_backend_python/bls/test.sh old mode 100644 new mode 100755 index 3f88df01f3..3d87cf7b65 --- a/qa/L0_backend_python/bls/test.sh +++ b/qa/L0_backend_python/bls/test.sh @@ -115,7 +115,7 @@ for TRIAL in non_decoupled decoupled ; do set +e export MODEL_NAME='bls' - python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 + python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then echo -e "\n***\n*** 'bls' $BLS_KIND test FAILED. \n***" cat $CLIENT_LOG @@ -130,7 +130,7 @@ for TRIAL in non_decoupled decoupled ; do fi export MODEL_NAME='bls_memory' - python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 + python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then echo -e "\n***\n*** 'bls_memory' $BLS_KIND test FAILED. \n***" cat $CLIENT_LOG @@ -145,7 +145,7 @@ for TRIAL in non_decoupled decoupled ; do fi export MODEL_NAME='bls_memory_async' - python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 + python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then echo -e "\n***\n*** 'bls_async_memory' $BLS_KIND test FAILED. \n***" cat $CLIENT_LOG @@ -160,7 +160,7 @@ for TRIAL in non_decoupled decoupled ; do fi export MODEL_NAME='bls_async' - python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 + python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then echo -e "\n***\n*** 'bls_async' $BLS_KIND test FAILED. \n***" cat $CLIENT_LOG diff --git a/qa/L0_backend_python/common.sh b/qa/L0_backend_python/common.sh old mode 100644 new mode 100755 index 074ad26da0..6030849fc9 --- a/qa/L0_backend_python/common.sh +++ b/qa/L0_backend_python/common.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -52,7 +53,7 @@ install_build_deps() { echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | \ tee /etc/apt/sources.list.d/kitware.list >/dev/null && \ apt-get update && \ - apt-get install -y --no-install-recommends cmake cmake-data + apt-get install -y --no-install-recommends cmake cmake-data } create_conda_env() { diff --git a/qa/L0_backend_python/custom_metrics/test.sh b/qa/L0_backend_python/custom_metrics/test.sh old mode 100644 new mode 100755 index 8842fa4ecf..149f5e5d56 --- a/qa/L0_backend_python/custom_metrics/test.sh +++ b/qa/L0_backend_python/custom_metrics/test.sh @@ -54,7 +54,7 @@ fi set +e export MODEL_NAME='custom_metrics' -python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 +python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then echo -e "\n***\n*** 'Custom Metrics' test FAILED. 
\n***" cat $CLIENT_LOG diff --git a/qa/L0_backend_python/decoupled/decoupled_test.py b/qa/L0_backend_python/decoupled/decoupled_test.py old mode 100644 new mode 100755 index 98b19b1cd2..4a4b77c661 --- a/qa/L0_backend_python/decoupled/decoupled_test.py +++ b/qa/L0_backend_python/decoupled/decoupled_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,18 +30,18 @@ sys.path.append("../../common") -import test_util as tu +import queue import time -import tritonclient.grpc as grpcclient -from tritonclient.utils import * -import numpy as np import unittest from functools import partial -import queue +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +from tritonclient.utils import * -class UserData: +class UserData: def __init__(self): self._completed_requests = queue.Queue() @@ -52,10 +54,9 @@ def callback(user_data, result, error): class DecoupledTest(tu.TestResultCollector): - def test_decoupled_execute_error(self): # The decoupled_execute_error model returns an error for the first - # request and sucessfully processes the second request. This is making + # request and successfully processes the second request. This is making # sure that an error in a single request does not completely fail the # batch. @@ -63,8 +64,7 @@ def test_decoupled_execute_error(self): shape = [2, 2] number_of_requests = 2 user_data = UserData() - with grpcclient.InferenceServerClient( - "localhost:8001") as triton_client: + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: triton_client.start_stream(callback=partial(callback, user_data)) input_datas = [] @@ -72,12 +72,12 @@ def test_decoupled_execute_error(self): input_data = np.random.randn(*shape).astype(np.float32) input_datas.append(input_data) inputs = [ - grpcclient.InferInput("IN", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) - triton_client.async_stream_infer(model_name=model_name, - inputs=inputs) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) for i in range(number_of_requests): result = user_data._completed_requests.get() @@ -91,27 +91,28 @@ def test_decoupled_execute_error(self): self.assertTrue( np.array_equal(output_data, input_datas[i]), "error: expected output {} to match input {}".format( - output_data, input_datas[i])) + output_data, input_datas[i] + ), + ) def test_decoupled_bls(self): # Test combinations of BLS and decoupled API in Python backend. 
model_name = "decoupled_bls" shape = [1, 2] user_data = UserData() - with grpcclient.InferenceServerClient( - "localhost:8001") as triton_client: + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: triton_client.start_stream(callback=partial(callback, user_data)) input_datas = [] input_data = np.random.randn(*shape).astype(np.float32) input_datas.append(input_data) inputs = [ - grpcclient.InferInput("IN", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) - triton_client.async_stream_infer(model_name=model_name, - inputs=inputs) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) # Check the results of the decoupled model using BLS def check_result(result): @@ -123,7 +124,9 @@ def check_result(result): self.assertTrue( np.array_equal(output_data, input_data), "error: expected output {} to match input {}".format( - output_data, input_data)) + output_data, input_data + ), + ) result = user_data._completed_requests.get() check_result(result) @@ -134,19 +137,19 @@ def test_decoupled_bls_stream(self): in_values = [4, 2, 0, 1] shape = [1] user_data = UserData() - with grpcclient.InferenceServerClient( - "localhost:8001") as triton_client: + with grpcclient.InferenceServerClient("localhost:8001") as triton_client: triton_client.start_stream(callback=partial(callback, user_data)) for i in range(len(in_values)): input_data = np.array([in_values[i]], dtype=np.int32) inputs = [ - grpcclient.InferInput("IN", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) - triton_client.async_stream_infer(model_name=model_name, - inputs=inputs, - request_id=str(i)) + triton_client.async_stream_infer( + model_name=model_name, inputs=inputs, request_id=str(i) + ) # Retrieve results... 
recv_count = 0 @@ -172,23 +175,27 @@ def test_decoupled_bls_stream(self): if in_values[i] != 0: self.assertTrue( is_received, - "response for request id {} not received".format( - this_id)) + "response for request id {} not received".format(this_id), + ) self.assertEqual(len(result_dict[this_id]), in_values[i]) result_list = result_dict[this_id] expected_data = np.array([in_values[i]], dtype=np.int32) for j in range(len(result_list)): - this_data = result_list[j][1].as_numpy('OUT') + this_data = result_list[j][1].as_numpy("OUT") self.assertTrue( np.array_equal(expected_data, this_data), "error: incorrect data: expected {}, got {}".format( - expected_data, this_data)) + expected_data, this_data + ), + ) else: self.assertFalse( is_received, "received unexpected response for request id {}".format( - this_id)) + this_id + ), + ) def test_decoupled_return_response_error(self): model_name = "decoupled_return_response_error" @@ -199,10 +206,12 @@ def test_decoupled_return_response_error(self): input_data_0 = np.random.random(shape).astype(np.float32) input_data_1 = np.random.random(shape).astype(np.float32) inputs = [ - grpcclient.InferInput("INPUT0", input_data_0.shape, - np_to_triton_dtype(input_data_0.dtype)), - grpcclient.InferInput("INPUT1", input_data_1.shape, - np_to_triton_dtype(input_data_1.dtype)) + grpcclient.InferInput( + "INPUT0", input_data_0.shape, np_to_triton_dtype(input_data_0.dtype) + ), + grpcclient.InferInput( + "INPUT1", input_data_1.shape, np_to_triton_dtype(input_data_1.dtype) + ), ] inputs[0].set_data_from_numpy(input_data_0) inputs[1].set_data_from_numpy(input_data_1) @@ -213,7 +222,9 @@ def test_decoupled_return_response_error(self): data_item.message(), "Python model 'decoupled_return_response_error_0' is using " "the decoupled mode and the execute function must return " - "None.", "Exception message didn't match.") + "None.", + "Exception message didn't match.", + ) def test_decoupled_send_after_close_error(self): model_name = "decoupled_send_after_close_error" @@ -224,10 +235,12 @@ def test_decoupled_send_after_close_error(self): input_data_0 = np.random.random(shape).astype(np.float32) input_data_1 = np.random.random(shape).astype(np.float32) inputs = [ - grpcclient.InferInput("INPUT0", input_data_0.shape, - np_to_triton_dtype(input_data_0.dtype)), - grpcclient.InferInput("INPUT1", input_data_1.shape, - np_to_triton_dtype(input_data_1.dtype)) + grpcclient.InferInput( + "INPUT0", input_data_0.shape, np_to_triton_dtype(input_data_0.dtype) + ), + grpcclient.InferInput( + "INPUT1", input_data_1.shape, np_to_triton_dtype(input_data_1.dtype) + ), ] inputs[0].set_data_from_numpy(input_data_0) inputs[1].set_data_from_numpy(input_data_1) @@ -237,9 +250,12 @@ def test_decoupled_send_after_close_error(self): # way to deliver the error message to the client. The error # will be logged on the server side. 
time.sleep(4) - self.assertEqual(user_data._completed_requests.qsize(), 0, - "The completed request size must be zero.") + self.assertEqual( + user_data._completed_requests.qsize(), + 0, + "The completed request size must be zero.", + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/decoupled/models/decoupled_bls/1/model.py b/qa/L0_backend_python/decoupled/models/decoupled_bls/1/model.py old mode 100644 new mode 100755 index 84e43eccf9..901e4c46b7 --- a/qa/L0_backend_python/decoupled/models/decoupled_bls/1/model.py +++ b/qa/L0_backend_python/decoupled/models/decoupled_bls/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,19 +26,19 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import triton_python_backend_utils as pb_utils import json +import sys import threading import time + import numpy as np import torch +import triton_python_backend_utils as pb_utils from torch.utils.dlpack import from_dlpack, to_dlpack -import sys class TritonPythonModel: - """ This model sends an error message with the first request. - """ + """This model sends an error message with the first request.""" def initialize(self, args): logger = pb_utils.Logger @@ -45,22 +47,25 @@ def initialize(self, args): logger.log_warn("Initialize-Warning Msg!") logger.log_error("Initialize-Error Msg!") # You must parse model_config. JSON string is not parsed here - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) using_decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) + model_config + ) if not using_decoupled: raise pb_utils.TritonModelException( """the model `{}` can generate any number of responses per request, enable decoupled transaction policy in model configuration to - serve this model""".format(args['model_name'])) + serve this model""".format( + args["model_name"] + ) + ) # Get OUT configuration out_config = pb_utils.get_output_config_by_name(model_config, "OUT") # Convert Triton types to numpy types - self.out_dtype = pb_utils.triton_string_to_numpy( - out_config['data_type']) + self.out_dtype = pb_utils.triton_string_to_numpy(out_config["data_type"]) self.inflight_thread_count = 0 self.inflight_thread_count_lck = threading.Lock() @@ -71,8 +76,7 @@ def initialize(self, args): logger.log_error("Initialize-Error Msg!") def execute(self, requests): - """ This function is called on inference request. 
- """ + """This function is called on inference request.""" logger = pb_utils.Logger logger.log("Execute-Specific Msg!", logger.INFO) logger.log_info("Execute-Info Msg!") @@ -80,30 +84,33 @@ def execute(self, requests): logger.log_error("Execute-Error Msg!") # Only generate the error for the first request for i, request in enumerate(requests): - request_input = pb_utils.get_input_tensor_by_name(request, 'IN') + request_input = pb_utils.get_input_tensor_by_name(request, "IN") # Sync BLS request infer_request = pb_utils.InferenceRequest( - model_name='identity_fp32', + model_name="identity_fp32", requested_output_names=["OUTPUT0"], - inputs=[pb_utils.Tensor('INPUT0', request_input.as_numpy())]) + inputs=[pb_utils.Tensor("INPUT0", request_input.as_numpy())], + ) infer_response = infer_request.exec() if infer_response.has_error(): raise pb_utils.TritonModelException( f"BLS Response has an error: {infer_response.error().message()}" ) - output0 = pb_utils.get_output_tensor_by_name( - infer_response, "OUTPUT0") + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") if np.any(output0.as_numpy() != request_input.as_numpy()): raise pb_utils.TritonModelException( f"BLS Request input and BLS response output do not match. {request_input.as_numpy()} != {output0.as_numpy()}" ) - thread1 = threading.Thread(target=self.response_thread, - args=(request.get_response_sender(), - pb_utils.get_input_tensor_by_name( - request, 'IN').as_numpy())) + thread1 = threading.Thread( + target=self.response_thread, + args=( + request.get_response_sender(), + pb_utils.get_input_tensor_by_name(request, "IN").as_numpy(), + ), + ) thread1.daemon = True with self.inflight_thread_count_lck: self.inflight_thread_count += 1 @@ -131,15 +138,16 @@ def _get_gpu_bls_outputs(self, input0_pb, input1_pb): logger.log_error("_get_gpu_bls_outputs-Error Msg!") infer_request = pb_utils.InferenceRequest( - model_name='dlpack_add_sub', + model_name="dlpack_add_sub", inputs=[input0_pb, input1_pb], - requested_output_names=['OUTPUT0', 'OUTPUT1']) + requested_output_names=["OUTPUT0", "OUTPUT1"], + ) infer_response = infer_request.exec() if infer_response.has_error(): return False - output0 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0') - output1 = pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT1') + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + output1 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT1") if output0 is None or output1 is None: return False @@ -193,30 +201,32 @@ def _test_gpu_bls_add_sub(self, is_input0_gpu, is_input1_gpu): input1 = torch.rand(16) if is_input0_gpu: - input0 = input0.to('cuda') + input0 = input0.to("cuda") if is_input1_gpu: - input1 = input1.to('cuda') + input1 = input1.to("cuda") - input0_pb = pb_utils.Tensor.from_dlpack('INPUT0', to_dlpack(input0)) - input1_pb = pb_utils.Tensor.from_dlpack('INPUT1', to_dlpack(input1)) + input0_pb = pb_utils.Tensor.from_dlpack("INPUT0", to_dlpack(input0)) + input1_pb = pb_utils.Tensor.from_dlpack("INPUT1", to_dlpack(input1)) gpu_bls_return = self._get_gpu_bls_outputs(input0_pb, input1_pb) if gpu_bls_return: output0_dlpack, output1_dlpack = gpu_bls_return else: return False - expected_output_0 = from_dlpack( - input0_pb.to_dlpack()).to('cpu') + from_dlpack( - input1_pb.to_dlpack()).to('cpu') - expected_output_1 = from_dlpack( - input0_pb.to_dlpack()).to('cpu') - from_dlpack( - input1_pb.to_dlpack()).to('cpu') + expected_output_0 = from_dlpack(input0_pb.to_dlpack()).to("cpu") + from_dlpack( + 
input1_pb.to_dlpack() + ).to("cpu") + expected_output_1 = from_dlpack(input0_pb.to_dlpack()).to("cpu") - from_dlpack( + input1_pb.to_dlpack() + ).to("cpu") output0_matches = torch.all( - expected_output_0 == from_dlpack(output0_dlpack).to('cpu')) + expected_output_0 == from_dlpack(output0_dlpack).to("cpu") + ) output1_matches = torch.all( - expected_output_1 == from_dlpack(output1_dlpack).to('cpu')) + expected_output_1 == from_dlpack(output1_dlpack).to("cpu") + ) if not output0_matches or not output1_matches: return False @@ -230,8 +240,7 @@ def execute_gpu_bls(self): logger.log_error("execute_gpu_bls-Error Msg!") for input0_device in [True, False]: for input1_device in [True, False]: - test_status = self._test_gpu_bls_add_sub( - input0_device, input1_device) + test_status = self._test_gpu_bls_add_sub(input0_device, input1_device) if not test_status: return False @@ -250,39 +259,39 @@ def response_thread(self, response_sender, in_input): status = self.execute_gpu_bls() if not status: - infer_response = pb_utils.InferenceResponse( - error="GPU BLS test failed.") + infer_response = pb_utils.InferenceResponse(error="GPU BLS test failed.") response_sender.send(infer_response) else: in_value = in_input infer_request = pb_utils.InferenceRequest( - model_name='identity_fp32', + model_name="identity_fp32", requested_output_names=["OUTPUT0"], - inputs=[pb_utils.Tensor('INPUT0', in_input)]) + inputs=[pb_utils.Tensor("INPUT0", in_input)], + ) infer_response = infer_request.exec() - output0 = pb_utils.get_output_tensor_by_name( - infer_response, "OUTPUT0") + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") if infer_response.has_error(): response = pb_utils.InferenceResponse( - error=infer_response.error().message()) + error=infer_response.error().message() + ) response_sender.send( - response, - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) elif np.any(in_input != output0.as_numpy()): error_message = ( "BLS Request input and BLS response output do not match." - f" {in_value} != {output0.as_numpy()}") + f" {in_value} != {output0.as_numpy()}" + ) response = pb_utils.InferenceResponse(error=error_message) response_sender.send( - response, - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) else: - output_tensors = [pb_utils.Tensor('OUT', in_value)] - response = pb_utils.InferenceResponse( - output_tensors=output_tensors) + output_tensors = [pb_utils.Tensor("OUT", in_value)] + response = pb_utils.InferenceResponse(output_tensors=output_tensors) response_sender.send( - response, - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) with self.inflight_thread_count_lck: self.inflight_thread_count -= 1 @@ -297,13 +306,13 @@ def finalize(self): the model to perform any necessary clean ups before exit. 
""" logger = pb_utils.Logger - logger.log_info('Finalize invoked') + logger.log_info("Finalize invoked") inflight_threads = True while inflight_threads: with self.inflight_thread_count_lck: - inflight_threads = (self.inflight_thread_count != 0) + inflight_threads = self.inflight_thread_count != 0 if inflight_threads: time.sleep(0.1) - logger.log_info('Finalize complete...') + logger.log_info("Finalize complete...") diff --git a/qa/L0_backend_python/decoupled/models/decoupled_bls_stream/1/model.py b/qa/L0_backend_python/decoupled/models/decoupled_bls_stream/1/model.py old mode 100644 new mode 100755 index 81bb397115..e6334d34dc --- a/qa/L0_backend_python/decoupled/models/decoupled_bls_stream/1/model.py +++ b/qa/L0_backend_python/decoupled/models/decoupled_bls_stream/1/model.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -24,11 +26,12 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import triton_python_backend_utils as pb_utils import json import threading import time + import numpy as np +import triton_python_backend_utils as pb_utils class TritonPythonModel: @@ -38,28 +41,34 @@ class TritonPythonModel: def initialize(self, args): # You must parse model_config. JSON string is not parsed here - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) using_decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) + model_config + ) if not using_decoupled: raise pb_utils.TritonModelException( """the model `{}` can generate any number of responses per request, enable decoupled transaction policy in model configuration to - serve this model""".format(args['model_name'])) + serve this model""".format( + args["model_name"] + ) + ) self.inflight_thread_count = 0 self.inflight_thread_count_lck = threading.Lock() def execute(self, requests): - """ This function is called on inference request. 
- """ + """This function is called on inference request.""" for request in requests: - thread = threading.Thread(target=self.response_thread, - args=(request.get_response_sender(), - pb_utils.get_input_tensor_by_name( - request, 'IN').as_numpy())) + thread = threading.Thread( + target=self.response_thread, + args=( + request.get_response_sender(), + pb_utils.get_input_tensor_by_name(request, "IN").as_numpy(), + ), + ) thread.daemon = True with self.inflight_thread_count_lck: self.inflight_thread_count += 1 @@ -69,50 +78,49 @@ def execute(self, requests): def response_thread(self, response_sender, in_value): infer_request = pb_utils.InferenceRequest( - model_name='square_int32', + model_name="square_int32", requested_output_names=["OUT"], - inputs=[pb_utils.Tensor('IN', in_value)]) + inputs=[pb_utils.Tensor("IN", in_value)], + ) infer_responses = infer_request.exec(decoupled=True) response_count = 0 for infer_response in infer_responses: if len(infer_response.output_tensors()) > 0: - output0 = pb_utils.get_output_tensor_by_name( - infer_response, "OUT") + output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUT") if infer_response.has_error(): response = pb_utils.InferenceResponse( - error=infer_response.error().message()) + error=infer_response.error().message() + ) response_sender.send( - response, - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) elif np.any(in_value != output0.as_numpy()): error_message = ( "BLS Request input and BLS response output do not match." - f" {in_value} != {output0.as_numpy()}") + f" {in_value} != {output0.as_numpy()}" + ) response = pb_utils.InferenceResponse(error=error_message) response_sender.send( - response, - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) else: - output_tensors = [ - pb_utils.Tensor('OUT', output0.as_numpy()) - ] - response = pb_utils.InferenceResponse( - output_tensors=output_tensors) + output_tensors = [pb_utils.Tensor("OUT", output0.as_numpy())] + response = pb_utils.InferenceResponse(output_tensors=output_tensors) response_sender.send(response) response_count += 1 if in_value != response_count - 1: - error_message = ("Expected {} responses, got {}".format( - in_value, - len(infer_responses) - 1)) + error_message = "Expected {} responses, got {}".format( + in_value, len(infer_responses) - 1 + ) response = pb_utils.InferenceResponse(error=error_message) response_sender.send( - response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) else: - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) with self.inflight_thread_count_lck: self.inflight_thread_count -= 1 @@ -121,6 +129,6 @@ def finalize(self): inflight_threads = True while inflight_threads: with self.inflight_thread_count_lck: - inflight_threads = (self.inflight_thread_count != 0) + inflight_threads = self.inflight_thread_count != 0 if inflight_threads: time.sleep(0.1) diff --git a/qa/L0_backend_python/decoupled/models/decoupled_execute_error/1/model.py b/qa/L0_backend_python/decoupled/models/decoupled_execute_error/1/model.py old mode 100644 new mode 100755 index 1a7bd7abed..ecdb7df322 --- a/qa/L0_backend_python/decoupled/models/decoupled_execute_error/1/model.py +++ b/qa/L0_backend_python/decoupled/models/decoupled_execute_error/1/model.py @@ -1,4 +1,6 
@@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,49 +26,55 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import triton_python_backend_utils as pb_utils import json import threading import time +import triton_python_backend_utils as pb_utils + class TritonPythonModel: - """ This model sends an error message with the first request. - """ + """This model sends an error message with the first request.""" def initialize(self, args): # You must parse model_config. JSON string is not parsed here - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) using_decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) + model_config + ) if not using_decoupled: raise pb_utils.TritonModelException( """the model `{}` can generate any number of responses per request, enable decoupled transaction policy in model configuration to - serve this model""".format(args['model_name'])) + serve this model""".format( + args["model_name"] + ) + ) # Get OUT configuration out_config = pb_utils.get_output_config_by_name(model_config, "OUT") # Convert Triton types to numpy types - self.out_dtype = pb_utils.triton_string_to_numpy( - out_config['data_type']) + self.out_dtype = pb_utils.triton_string_to_numpy(out_config["data_type"]) self.inflight_thread_count = 0 self.inflight_thread_count_lck = threading.Lock() def execute(self, requests): - """ This function is called on inference request. - """ + """This function is called on inference request.""" # Only generate the error for the first request for i, request in enumerate(requests): # Start a separate thread to send the responses for the request. - thread = threading.Thread(target=self.response_thread, - args=(request.get_response_sender(), i, - pb_utils.get_input_tensor_by_name( - request, 'IN').as_numpy())) + thread = threading.Thread( + target=self.response_thread, + args=( + request.get_response_sender(), + i, + pb_utils.get_input_tensor_by_name(request, "IN").as_numpy(), + ), + ) thread.daemon = True with self.inflight_thread_count_lck: @@ -86,9 +94,10 @@ def response_thread(self, response_sender, index, in_input): out_output = pb_utils.Tensor("OUT", in_value) if index == 0: - error = pb_utils.TritonError('An error occured during execution') - response = pb_utils.InferenceResponse(output_tensors=[out_output], - error=error) + error = pb_utils.TritonError("An error occurred during execution") + response = pb_utils.InferenceResponse( + output_tensors=[out_output], error=error + ) else: response = pb_utils.InferenceResponse(output_tensors=[out_output]) response_sender.send(response) @@ -96,8 +105,7 @@ def response_thread(self, response_sender, index, in_input): # We must close the response sender to indicate to Triton that we are # done sending responses for the corresponding request. We can't use the # response sender after closing it. 
- response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) with self.inflight_thread_count_lck: self.inflight_thread_count -= 1 @@ -107,13 +115,13 @@ def finalize(self): Implementing `finalize` function is OPTIONAL. This function allows the model to perform any necessary clean ups before exit. """ - print('Finalize invoked') + print("Finalize invoked") inflight_threads = True while inflight_threads: with self.inflight_thread_count_lck: - inflight_threads = (self.inflight_thread_count != 0) + inflight_threads = self.inflight_thread_count != 0 if inflight_threads: time.sleep(0.1) - print('Finalize complete...') + print("Finalize complete...") diff --git a/qa/L0_backend_python/decoupled/models/decoupled_return_response_error/1/model.py b/qa/L0_backend_python/decoupled/models/decoupled_return_response_error/1/model.py old mode 100644 new mode 100755 index 4c882481cf..10b9ef12fe --- a/qa/L0_backend_python/decoupled/models/decoupled_return_response_error/1/model.py +++ b/qa/L0_backend_python/decoupled/models/decoupled_return_response_error/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,37 +27,42 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import json + import triton_python_backend_utils as pb_utils class TritonPythonModel: - """ This model tries to return a response directly from + """This model tries to return a response directly from execute function when configured as decoupled model. """ def initialize(self, args): - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) using_decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) + model_config + ) if not using_decoupled: raise pb_utils.TritonModelException( """the model `{}` can generate any number of responses per request, - enable decoupled transaction policy in model configuration to - serve this model""".format(args['model_name'])) + enable decoupled transaction policy in model configuration to + serve this model""".format( + args["model_name"] + ) + ) - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT0") - output1_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT1") + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) + output0_config["data_type"] + ) self.output1_dtype = pb_utils.triton_string_to_numpy( - output1_config['data_type']) + output1_config["data_type"] + ) def execute(self, requests): - """ Tries to create a response sender object and use that + """Tries to create a response sender object and use that for sending the response. 
""" @@ -66,13 +73,12 @@ def execute(self, requests): for request in requests: in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") - out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(), - in_0.as_numpy() - in_1.as_numpy()) + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) - out_tensor_0 = pb_utils.Tensor("OUTPUT0", - out_0.astype(output0_dtype)) - out_tensor_1 = pb_utils.Tensor("OUTPUT1", - out_1.astype(output1_dtype)) - responses.append( - pb_utils.InferenceResponse([out_tensor_0, out_tensor_1])) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) + responses.append(pb_utils.InferenceResponse([out_tensor_0, out_tensor_1])) return responses diff --git a/qa/L0_backend_python/decoupled/models/decoupled_send_after_close_error/1/model.py b/qa/L0_backend_python/decoupled/models/decoupled_send_after_close_error/1/model.py old mode 100644 new mode 100755 index 9611c2875c..aeab19851c --- a/qa/L0_backend_python/decoupled/models/decoupled_send_after_close_error/1/model.py +++ b/qa/L0_backend_python/decoupled/models/decoupled_send_after_close_error/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,44 +27,50 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import json + import triton_python_backend_utils as pb_utils class TritonPythonModel: - """ This model tries to send response after closing + """This model tries to send response after closing the response_sender. """ def initialize(self, args): - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) using_decoupled = pb_utils.using_decoupled_model_transaction_policy( - model_config) + model_config + ) if not using_decoupled: raise pb_utils.TritonModelException( """the model `{}` can generate any number of responses per request, - enable decoupled transaction policy in model configuration to - serve this model""".format(args['model_name'])) + enable decoupled transaction policy in model configuration to + serve this model""".format( + args["model_name"] + ) + ) - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT0") - output1_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT1") + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) + output0_config["data_type"] + ) self.output1_dtype = pb_utils.triton_string_to_numpy( - output1_config['data_type']) + output1_config["data_type"] + ) def execute(self, requests): - """ Create a response sender object and use that + """Create a response sender object and use that for sending the response. """ # This model does not support batching, so 'request_count' should always be 1. 
if len(requests) != 1: - raise pb_utils.TritonModelException("unsupported batch size " + - len(requests)) + raise pb_utils.TritonModelException( + "unsupported batch size " + len(requests) + ) output0_dtype = self.output0_dtype output1_dtype = self.output1_dtype @@ -70,13 +78,14 @@ def execute(self, requests): response_sender = requests[0].get_response_sender() in_0 = pb_utils.get_input_tensor_by_name(requests[0], "INPUT0") in_1 = pb_utils.get_input_tensor_by_name(requests[0], "INPUT1") - out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(), - in_0.as_numpy() - in_1.as_numpy()) + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype)) out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(output1_dtype)) response = pb_utils.InferenceResponse([out_tensor_0, out_tensor_1]) - response_sender.send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) response_sender.send(response) diff --git a/qa/L0_backend_python/decoupled/test.sh b/qa/L0_backend_python/decoupled/test.sh old mode 100644 new mode 100755 index c71055a511..0e316c8452 --- a/qa/L0_backend_python/decoupled/test.sh +++ b/qa/L0_backend_python/decoupled/test.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/qa/L0_backend_python/ensemble/ensemble_test.py b/qa/L0_backend_python/ensemble/ensemble_test.py old mode 100644 new mode 100755 index f0cceed4e7..64ddc3816f --- a/qa/L0_backend_python/ensemble/ensemble_test.py +++ b/qa/L0_backend_python/ensemble/ensemble_test.py @@ -1,4 +1,6 @@ -# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,16 +30,16 @@ sys.path.append("../../common") -import test_util as tu +import unittest + +import numpy as np import shm_util +import test_util as tu import tritonclient.http as httpclient from tritonclient.utils import * -import numpy as np -import unittest class EnsembleTest(tu.TestResultCollector): - def setUp(self): self._shm_leak_detector = shm_util.ShmLeakDetector() @@ -50,17 +52,21 @@ def test_ensemble(self): input_data_1 = np.random.random(shape).astype(np.float32) inputs = [ httpclient.InferInput( - "INPUT0", input_data_0.shape, - np_to_triton_dtype(input_data_0.dtype)), + "INPUT0", + input_data_0.shape, + np_to_triton_dtype(input_data_0.dtype), + ), httpclient.InferInput( - "INPUT1", input_data_1.shape, - np_to_triton_dtype(input_data_1.dtype)) + "INPUT1", + input_data_1.shape, + np_to_triton_dtype(input_data_1.dtype), + ), ] inputs[0].set_data_from_numpy(input_data_0) inputs[1].set_data_from_numpy(input_data_1) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') - output1 = result.as_numpy('OUTPUT1') + output0 = result.as_numpy("OUTPUT0") + output1 = result.as_numpy("OUTPUT1") self.assertIsNotNone(output0) self.assertIsNotNone(output1) @@ -74,17 +80,21 @@ def test_ensemble(self): input_data_1 = np.random.random(shape).astype(np.float32) inputs = [ httpclient.InferInput( - "INPUT0", input_data_0.shape, - np_to_triton_dtype(input_data_0.dtype)), + "INPUT0", + input_data_0.shape, + np_to_triton_dtype(input_data_0.dtype), + ), httpclient.InferInput( - "INPUT1", input_data_1.shape, - np_to_triton_dtype(input_data_1.dtype)) + "INPUT1", + input_data_1.shape, + np_to_triton_dtype(input_data_1.dtype), + ), ] inputs[0].set_data_from_numpy(input_data_0) inputs[1].set_data_from_numpy(input_data_1) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') - output1 = result.as_numpy('OUTPUT1') + output0 = result.as_numpy("OUTPUT0") + output1 = result.as_numpy("OUTPUT1") self.assertIsNotNone(output0) self.assertIsNotNone(output1) @@ -92,5 +102,5 @@ def test_ensemble(self): self.assertTrue(np.allclose(output1, 2 * input_data_1)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/ensemble/test.sh b/qa/L0_backend_python/ensemble/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_backend_python/env/test.sh b/qa/L0_backend_python/env/test.sh old mode 100644 new mode 100755 index a32c4036a4..4161be5b49 --- a/qa/L0_backend_python/env/test.sh +++ b/qa/L0_backend_python/env/test.sh @@ -210,7 +210,7 @@ wait $SERVER_PID set +e -PY310_ENV_EXTRACTION="Extracting Python execution env" +PY310_ENV_EXTRACTION="Extracting Python execution env" if [ `grep -c "${PY310_ENV_EXTRACTION}" ${SERVER_LOG}` != "2" ]; then cat $SERVER_LOG echo -e "\n***\n*** Python execution environment should be extracted exactly twice. \n***" diff --git a/qa/L0_backend_python/examples/test.sh b/qa/L0_backend_python/examples/test.sh old mode 100644 new mode 100755 index 2c94904135..bbad8b5bfd --- a/qa/L0_backend_python/examples/test.sh +++ b/qa/L0_backend_python/examples/test.sh @@ -37,7 +37,7 @@ SERVER_LOG="./inference_server.log" RET=0 rm -fr *.log python_backend/ -# Install torch +# Install torch # Skip torch and torchvision install on Jetson since it is already installed. 
if [ "$TEST_JETSON" == "0" ]; then pip3 uninstall -y torch diff --git a/qa/L0_backend_python/io/io_test.py b/qa/L0_backend_python/io/io_test.py old mode 100644 new mode 100755 index d054ee54a8..9adb4414ab --- a/qa/L0_backend_python/io/io_test.py +++ b/qa/L0_backend_python/io/io_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,21 +30,21 @@ sys.path.append("../../common") +import os +import queue +import unittest from functools import partial -import test_util as tu + +import numpy as np import shm_util +import test_util as tu import tritonclient.grpc as grpcclient from tritonclient.utils import * -import numpy as np -import unittest -import queue -import os -TRIAL = os.getenv('TRIAL') +TRIAL = os.getenv("TRIAL") class UserData: - def __init__(self): self._completed_requests = queue.Queue() @@ -55,7 +57,6 @@ def callback(user_data, result, error): class IOTest(tu.TestResultCollector): - def setUp(self): self._shm_leak_detector = shm_util.ShmLeakDetector() self._client = grpcclient.InferenceServerClient("localhost:8001") @@ -69,60 +70,66 @@ def _run_ensemble_test(self): for model_2_in_gpu in [True, False]: for model_3_in_gpu in [True, False]: gpu_output = np.asarray( - [model_1_in_gpu, model_2_in_gpu, model_3_in_gpu], - dtype=bool) + [model_1_in_gpu, model_2_in_gpu, model_3_in_gpu], dtype=bool + ) inputs = [ - grpcclient.InferInput("INPUT0", input0.shape, - np_to_triton_dtype(input0.dtype)), grpcclient.InferInput( - "GPU_OUTPUT", gpu_output.shape, - np_to_triton_dtype(gpu_output.dtype)) + "INPUT0", input0.shape, np_to_triton_dtype(input0.dtype) + ), + grpcclient.InferInput( + "GPU_OUTPUT", + gpu_output.shape, + np_to_triton_dtype(gpu_output.dtype), + ), ] inputs[0].set_data_from_numpy(input0) inputs[1].set_data_from_numpy(gpu_output) - self._client.async_stream_infer(model_name=model_name, - inputs=inputs) - if TRIAL == 'default': + self._client.async_stream_infer( + model_name=model_name, inputs=inputs + ) + if TRIAL == "default": result = user_data._completed_requests.get() - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertIsNotNone(output0) self.assertTrue(np.all(output0 == input0)) else: response_repeat = 2 for _ in range(response_repeat): result = user_data._completed_requests.get() - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertIsNotNone(output0) self.assertTrue(np.all(output0 == input0)) def test_ensemble_io(self): # Only run the shared memory leak detection with the default trial - if TRIAL == 'default': + if TRIAL == "default": with self._shm_leak_detector.Probe(): self._run_ensemble_test() else: self._run_ensemble_test() def test_empty_gpu_output(self): - model_name = 'dlpack_empty_output' + model_name = "dlpack_empty_output" input_data = np.array([[1.0]], dtype=np.float32) inputs = [ - grpcclient.InferInput("INPUT", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + grpcclient.InferInput( + "INPUT", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) result = self._client.infer(model_name, inputs) - output = result.as_numpy('OUTPUT') + output = result.as_numpy("OUTPUT") self.assertIsNotNone(output) self.assertEqual(output.size, 0) def test_variable_gpu_output(self): # Input is not important in this test - model_name = 'variable_gpu_output' + model_name = "variable_gpu_output" input_data = 
np.array([[1.0]], dtype=np.float32) inputs = [ - grpcclient.InferInput("INPUT", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + grpcclient.InferInput( + "INPUT", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) user_data = UserData() @@ -131,20 +138,21 @@ def test_variable_gpu_output(self): # responses with different GPU output shapes num_requests = 5 for _ in range(num_requests): - result = self._client.async_infer(model_name=model_name, - inputs=inputs, - callback=partial( - callback, user_data)) + result = self._client.async_infer( + model_name=model_name, + inputs=inputs, + callback=partial(callback, user_data), + ) for i in range(num_requests): result = user_data._completed_requests.get() if result is InferenceServerException: self.assertTrue(False, result) - output = result.as_numpy('OUTPUT') + output = result.as_numpy("OUTPUT") self.assertIsNotNone(output) self.assertEqual(output.size, i + 1) np.testing.assert_almost_equal(output, np.ones(i + 1) * (i + 1)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/io/test.sh b/qa/L0_backend_python/io/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py old mode 100644 new mode 100755 index 425eb4322d..23c0f9686d --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,19 +30,19 @@ sys.path.append("../../common") -import test_util as tu -import shm_util +import queue +import unittest from functools import partial -import tritonclient.http as httpclient + +import numpy as np +import shm_util +import test_util as tu import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient from tritonclient.utils import * -import numpy as np -import unittest -import queue class UserData: - def __init__(self): self._completed_requests = queue.Queue() @@ -53,13 +55,12 @@ def callback(user_data, result, error): class LifecycleTest(tu.TestResultCollector): - def setUp(self): self._shm_leak_detector = shm_util.ShmLeakDetector() def test_batch_error(self): # The execute_error model returns an error for the first and third - # request and sucessfully processes the second request. This is making + # request and successfully processes the second request. This is making # sure that an error in a single request does not completely fail the # batch. 
model_name = "execute_error" @@ -75,12 +76,12 @@ def test_batch_error(self): input_data = np.random.randn(*shape).astype(np.float32) input_datas.append(input_data) inputs = [ - grpcclient.InferInput("IN", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) - triton_client.async_stream_infer(model_name=model_name, - inputs=inputs) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) for i in range(number_of_requests): result = user_data._completed_requests.get() @@ -94,7 +95,9 @@ def test_batch_error(self): self.assertTrue( np.array_equal(output_data, input_datas[i]), "error: expected output {} to match input {}".format( - output_data, input_datas[i])) + output_data, input_datas[i] + ), + ) def test_infer_pymodel_error(self): model_name = "wrong_model" @@ -104,8 +107,9 @@ def test_infer_pymodel_error(self): with httpclient.InferenceServerClient("localhost:8000") as client: input_data = (16384 * np.random.randn(*shape)).astype(np.uint32) inputs = [ - httpclient.InferInput("IN", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) try: @@ -115,21 +119,24 @@ def test_infer_pymodel_error(self): self.assertTrue( e.message().startswith( "Failed to process the request(s) for model instance" - ), "Exception message is not correct") + ), + "Exception message is not correct", + ) else: self.assertTrue( - False, - "Wrong exception raised or did not raise an exception") + False, "Wrong exception raised or did not raise an exception" + ) def test_incorrect_execute_return(self): - model_name = 'execute_return_error' + model_name = "execute_return_error" shape = [1, 1] with self._shm_leak_detector.Probe() as shm_probe: with httpclient.InferenceServerClient("localhost:8000") as client: input_data = (5 * np.random.randn(*shape)).astype(np.float32) inputs = [ - httpclient.InferInput("INPUT", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "INPUT", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) @@ -141,7 +148,8 @@ def test_incorrect_execute_return(self): "Failed to process the request(s) for model instance " "'execute_return_error_0', message: Expected a list in the " "execute return" in str(e.exception), - "Exception message is not correct.") + "Exception message is not correct.", + ) # The second inference request will return a list of None object # instead of Python InferenceResponse objects. @@ -153,8 +161,9 @@ def test_incorrect_execute_return(self): "'execute_return_error_0', message: Expected an " "'InferenceResponse' object in the execute function return" " list" in str(e.exception), - "Exception message is not correct.") + "Exception message is not correct.", + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/lifecycle/test.sh b/qa/L0_backend_python/lifecycle/test.sh old mode 100644 new mode 100755 index b393b0f06b..c1ab6baf92 --- a/qa/L0_backend_python/lifecycle/test.sh +++ b/qa/L0_backend_python/lifecycle/test.sh @@ -72,7 +72,7 @@ set +e # Run this multiple times to catch any intermittent segfault. for i in {0..4}; do - python3 lifecycle_test.py > $CLIENT_LOG 2>&1 + python3 lifecycle_test.py > $CLIENT_LOG 2>&1 if [ $? 
-ne 0 ]; then cat $CLIENT_LOG echo -e "\n***\n*** lifecycle_test.py FAILED. \n***" diff --git a/qa/L0_backend_python/logging/logging_test.py b/qa/L0_backend_python/logging/logging_test.py old mode 100644 new mode 100755 index 1070d240a7..b21919df65 --- a/qa/L0_backend_python/logging/logging_test.py +++ b/qa/L0_backend_python/logging/logging_test.py @@ -1,4 +1,6 @@ -# Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,29 +30,29 @@ sys.path.append("../../common") import unittest + import numpy as np import test_util as tu - -from tritonclient.utils import * import tritonclient.http as httpclient +from tritonclient.utils import * class LogTest(tu.TestResultCollector): - def test_log_output(self): - model_name = 'identity_fp32_logging' + model_name = "identity_fp32_logging" with httpclient.InferenceServerClient("localhost:8000") as client: input_data = np.array([[1.0]], dtype=np.float32) inputs = [ - httpclient.InferInput("INPUT0", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertIsNotNone(output0) self.assertTrue(np.all(output0 == input_data)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/logging/test.sh b/qa/L0_backend_python/logging/test.sh index 4b6b017d6d..369d28d0b9 100755 --- a/qa/L0_backend_python/logging/test.sh +++ b/qa/L0_backend_python/logging/test.sh @@ -68,7 +68,7 @@ source ../../common/util.sh function verify_log_counts () { non_verbose_expected=$1 verbose_expected=$2 - + if [ `grep -c "Specific Msg!" $SERVER_LOG` != $non_verbose_expected ]; then echo -e "\n***\n*** Test Failed: Specific Msg Count Incorrect\n***" RET=1 @@ -145,7 +145,7 @@ if [ "$SERVER_PID" == "0" ]; then fi set +e -# Enable verbose logging +# Enable verbose logging code=`curl -s -w %{http_code} -o ./curl.out -d'{"log_verbose_level":1}' localhost:8000/v2/logging` if [ "$code" != "200" ]; then diff --git a/qa/L0_backend_python/model_control/model_control_test.py b/qa/L0_backend_python/model_control/model_control_test.py old mode 100644 new mode 100755 index feceda01e4..17686f97d5 --- a/qa/L0_backend_python/model_control/model_control_test.py +++ b/qa/L0_backend_python/model_control/model_control_test.py @@ -1,4 +1,6 @@ -# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,22 +30,22 @@ sys.path.append("../../common") +import unittest + +import numpy as np +import shm_util import test_util as tu import tritonclient.http as httpclient from tritonclient.utils import * -import numpy as np -import unittest -import shm_util class ExplicitModelTest(tu.TestResultCollector): - def setUp(self): self._shm_leak_detector = shm_util.ShmLeakDetector() def send_identity_request(self, client, model_name): inputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "FP32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "FP32")) input0_data = np.arange(start=0, stop=16, dtype=np.float32) input0_data = np.expand_dims(input0_data, axis=0) inputs[0].set_data_from_numpy(input0_data) @@ -52,13 +54,14 @@ def send_identity_request(self, client, model_name): result = client.infer( model_name=model_name, inputs=inputs, - outputs=[httpclient.InferRequestedOutput('OUTPUT0')]) - output_numpy = result.as_numpy('OUTPUT0') + outputs=[httpclient.InferRequestedOutput("OUTPUT0")], + ) + output_numpy = result.as_numpy("OUTPUT0") self.assertTrue(np.all(input0_data == output_numpy)) def test_model_reload(self): model_name = "identity_fp32" - ensemble_model_name = 'simple_' + "identity_fp32" + ensemble_model_name = "simple_" + "identity_fp32" with httpclient.InferenceServerClient("localhost:8000") as client: for _ in range(5): self.assertFalse(client.is_model_ready(model_name)) @@ -76,5 +79,5 @@ def test_model_reload(self): self.assertFalse(client.is_model_ready(ensemble_model_name)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/model_control/test.sh b/qa/L0_backend_python/model_control/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_backend_python/python_test.py b/qa/L0_backend_python/python_test.py old mode 100644 new mode 100755 index ba4dc25ecb..eb4d02aa53 --- a/qa/L0_backend_python/python_test.py +++ b/qa/L0_backend_python/python_test.py @@ -30,21 +30,20 @@ sys.path.append("../common") +import os import unittest + import numpy as np -import test_util as tu -import shm_util import requests as httpreq -import os - -from tritonclient.utils import * +import shm_util +import test_util as tu import tritonclient.http as httpclient +from tritonclient.utils import * -TEST_JETSON = bool(int(os.environ.get('TEST_JETSON', 0))) +TEST_JETSON = bool(int(os.environ.get("TEST_JETSON", 0))) class PythonTest(tu.TestResultCollector): - def setUp(self): self._shm_leak_detector = shm_util.ShmLeakDetector() @@ -52,41 +51,39 @@ def _infer_help(self, model_name, shape, data_type): with httpclient.InferenceServerClient("localhost:8000") as client: input_data_0 = np.array(np.random.randn(*shape), dtype=data_type) inputs = [ - httpclient.InferInput("INPUT0", shape, - np_to_triton_dtype(input_data_0.dtype)) + httpclient.InferInput( + "INPUT0", shape, np_to_triton_dtype(input_data_0.dtype) + ) ] inputs[0].set_data_from_numpy(input_data_0) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertTrue(np.all(input_data_0 == output0)) def _create_cuda_region(self, client, size, name): import tritonclient.utils.cuda_shared_memory as cuda_shared_memory + shm0_handle = cuda_shared_memory.create_shared_memory_region( - name, byte_size=size, device_id=0) + name, byte_size=size, device_id=0 + ) 
client.register_cuda_shared_memory( - name, cuda_shared_memory.get_raw_handle(shm0_handle), 0, size) + name, cuda_shared_memory.get_raw_handle(shm0_handle), 0, size + ) return shm0_handle def _optional_input_infer(self, model_name, has_input0, has_input1): with httpclient.InferenceServerClient("localhost:8000") as client: shape = (1,) if has_input0: - input0_numpy = np.random.randint(0, - 100, - size=shape, - dtype=np.int32) + input0_numpy = np.random.randint(0, 100, size=shape, dtype=np.int32) else: # Set the input0 to a default value if it is optional. This is # the input used by the model if it is not provided. input0_numpy = np.array([5], dtype=np.int32) if has_input1: - input1_numpy = np.random.randint(0, - 100, - size=shape, - dtype=np.int32) + input1_numpy = np.random.randint(0, 100, size=shape, dtype=np.int32) else: # Set the input1 to a default value if it is optional. This is # the input used by the model if it is not provided. @@ -96,56 +93,62 @@ def _optional_input_infer(self, model_name, has_input0, has_input1): if has_input0: inputs.append( httpclient.InferInput( - "INPUT0", shape, - np_to_triton_dtype(input0_numpy.dtype))) + "INPUT0", shape, np_to_triton_dtype(input0_numpy.dtype) + ) + ) inputs[-1].set_data_from_numpy(input0_numpy) if has_input1: inputs.append( httpclient.InferInput( - "INPUT1", shape, - np_to_triton_dtype(input1_numpy.dtype))) + "INPUT1", shape, np_to_triton_dtype(input1_numpy.dtype) + ) + ) inputs[-1].set_data_from_numpy(input1_numpy) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertIsNotNone(output0, "OUTPUT0 was not found.") - output1 = result.as_numpy('OUTPUT1') + output1 = result.as_numpy("OUTPUT1") self.assertIsNotNone(output1, "OUTPUT1 was not found.") expected_output0 = input0_numpy + input1_numpy expected_output1 = input0_numpy - input1_numpy - np.testing.assert_equal(output0, expected_output0, - "OUTPUT0 doesn't match expected OUTPUT0") - np.testing.assert_equal(output1, expected_output1, - "OUTPUT1 doesn't match expected OUTPUT1") + np.testing.assert_equal( + output0, expected_output0, "OUTPUT0 doesn't match expected OUTPUT0" + ) + np.testing.assert_equal( + output1, expected_output1, "OUTPUT1 doesn't match expected OUTPUT1" + ) def test_growth_error(self): # 2 MiBs total_byte_size = 2 * 1024 * 1024 shape = [total_byte_size] - model_name = 'identity_uint8_nobatch' + model_name = "identity_uint8_nobatch" dtype = np.uint8 with self._shm_leak_detector.Probe() as shm_probe: self._infer_help(model_name, shape, dtype) - # 1 GiB payload leads to error in the main Python backned process. + # 1 GiB payload leads to error in the main Python backend process. # Total shared memory available is 1GiB. total_byte_size = 1024 * 1024 * 1024 shape = [total_byte_size] with self.assertRaises(InferenceServerException) as ex: self._infer_help(model_name, shape, dtype) - self.assertIn("Failed to increase the shared memory pool size", - str(ex.exception)) + self.assertIn( + "Failed to increase the shared memory pool size", str(ex.exception) + ) # 512 MiBs payload leads to error in the Python stub process. 
total_byte_size = 512 * 1024 * 1024 shape = [total_byte_size] with self.assertRaises(InferenceServerException) as ex: self._infer_help(model_name, shape, dtype) - self.assertIn("Failed to increase the shared memory pool size", - str(ex.exception)) + self.assertIn( + "Failed to increase the shared memory pool size", str(ex.exception) + ) # 2 MiBs # Send a small paylaod to make sure it is still working properly @@ -160,60 +163,64 @@ def test_growth_error(self): def test_gpu_tensor_error(self): import tritonclient.utils.cuda_shared_memory as cuda_shared_memory - model_name = 'identity_bool' + + model_name = "identity_bool" with httpclient.InferenceServerClient("localhost:8000") as client: input_data = np.array([[True] * 1000], dtype=bool) inputs = [ - httpclient.InferInput("INPUT0", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) - requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')] + requested_outputs = [httpclient.InferRequestedOutput("OUTPUT0")] # intentionally create a shared memory region with not enough size. client.unregister_cuda_shared_memory() - shm0_handle = self._create_cuda_region(client, 1, - 'output0_data') + shm0_handle = self._create_cuda_region(client, 1, "output0_data") - requested_outputs[0].set_shared_memory('output0_data', 1) + requested_outputs[0].set_shared_memory("output0_data", 1) with self.assertRaises(InferenceServerException) as ex: client.infer(model_name, inputs, outputs=requested_outputs) self.assertIn( "should be at least 1000 bytes to hold the results", - str(ex.exception)) + str(ex.exception), + ) client.unregister_cuda_shared_memory() cuda_shared_memory.destroy_shared_memory_region(shm0_handle) def test_dlpack_tensor_error(self): import tritonclient.utils.cuda_shared_memory as cuda_shared_memory - model_name = 'dlpack_identity' + + model_name = "dlpack_identity" with httpclient.InferenceServerClient("localhost:8000") as client: input_data = np.array([[1] * 1000], dtype=np.float32) inputs = [ - httpclient.InferInput("INPUT0", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] - requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')] + requested_outputs = [httpclient.InferRequestedOutput("OUTPUT0")] input_data_size = input_data.itemsize * input_data.size client.unregister_cuda_shared_memory() - input_region = self._create_cuda_region(client, input_data_size, - 'input0_data') - inputs[0].set_shared_memory('input0_data', input_data_size) - cuda_shared_memory.set_shared_memory_region( - input_region, [input_data]) + input_region = self._create_cuda_region( + client, input_data_size, "input0_data" + ) + inputs[0].set_shared_memory("input0_data", input_data_size) + cuda_shared_memory.set_shared_memory_region(input_region, [input_data]) # Intentionally create a small region to trigger an error - shm0_handle = self._create_cuda_region(client, 1, - 'output0_data') - requested_outputs[0].set_shared_memory('output0_data', 1) + shm0_handle = self._create_cuda_region(client, 1, "output0_data") + requested_outputs[0].set_shared_memory("output0_data", 1) with self.assertRaises(InferenceServerException) as ex: client.infer(model_name, inputs, outputs=requested_outputs) self.assertIn( "should be at least 4000 bytes to hold the results", - str(ex.exception)) + str(ex.exception), + ) 
client.unregister_cuda_shared_memory() cuda_shared_memory.destroy_shared_memory_region(shm0_handle) @@ -224,18 +231,19 @@ def test_async_infer(self): with self._shm_leak_detector.Probe() as shm_probe: with httpclient.InferenceServerClient( - "localhost:8000", - concurrency=request_parallelism) as client: + "localhost:8000", concurrency=request_parallelism + ) as client: input_datas = [] requests = [] for i in range(request_parallelism): - input_data = (16384 * np.random.randn(*shape)).astype( - np.uint8) + input_data = (16384 * np.random.randn(*shape)).astype(np.uint8) input_datas.append(input_data) inputs = [ httpclient.InferInput( - "INPUT0", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + "INPUT0", + input_data.shape, + np_to_triton_dtype(input_data.dtype), + ) ] inputs[0].set_data_from_numpy(input_data) requests.append(client.async_infer(model_name, inputs)) @@ -246,76 +254,92 @@ def test_async_infer(self): results = requests[i].get_result() output_data = results.as_numpy("OUTPUT0") - self.assertIsNotNone(output_data, - "error: expected 'OUTPUT0'") + self.assertIsNotNone(output_data, "error: expected 'OUTPUT0'") self.assertTrue( np.array_equal(output_data, input_datas[i]), "error: expected output {} to match input {}".format( - output_data, input_datas[i])) + output_data, input_datas[i] + ), + ) # Make sure the requests ran in parallel. stats = client.get_inference_statistics(model_name) - test_cond = (len(stats['model_stats']) != 1) or ( - stats['model_stats'][0]['name'] != model_name) + test_cond = (len(stats["model_stats"]) != 1) or ( + stats["model_stats"][0]["name"] != model_name + ) + self.assertFalse( + test_cond, "error: expected statistics for {}".format(model_name) + ) + + stat = stats["model_stats"][0] self.assertFalse( - test_cond, - "error: expected statistics for {}".format(model_name)) - - stat = stats['model_stats'][0] - self.assertFalse((stat['inference_count'] != 8) or ( - stat['execution_count'] != 1 - ), "error: expected execution_count == 1 and inference_count == 8, got {} and {}" - .format(stat['execution_count'], - stat['inference_count'])) - batch_stat = stat['batch_stats'][0] + (stat["inference_count"] != 8) or (stat["execution_count"] != 1), + "error: expected execution_count == 1 and inference_count == 8, got {} and {}".format( + stat["execution_count"], stat["inference_count"] + ), + ) + batch_stat = stat["batch_stats"][0] self.assertFalse( - batch_stat['batch_size'] != 8, - f"error: expected batch_size == 8, got {batch_stat['batch_size']}" + batch_stat["batch_size"] != 8, + f"error: expected batch_size == 8, got {batch_stat['batch_size']}", ) # Check metrics to make sure they are reported correctly - metrics = httpreq.get('http://localhost:8002/metrics') + metrics = httpreq.get("http://localhost:8002/metrics") print(metrics.text) - success_str = 'nv_inference_request_success{model="identity_uint8",version="1"}' - infer_count_str = 'nv_inference_count{model="identity_uint8",version="1"}' - infer_exec_str = 'nv_inference_exec_count{model="identity_uint8",version="1"}' + success_str = ( + 'nv_inference_request_success{model="identity_uint8",version="1"}' + ) + infer_count_str = ( + 'nv_inference_count{model="identity_uint8",version="1"}' + ) + infer_exec_str = ( + 'nv_inference_exec_count{model="identity_uint8",version="1"}' + ) success_val = None infer_count_val = None infer_exec_val = None for line in metrics.text.splitlines(): if line.startswith(success_str): - success_val = float(line[len(success_str):]) + success_val = 
float(line[len(success_str) :]) if line.startswith(infer_count_str): - infer_count_val = float(line[len(infer_count_str):]) + infer_count_val = float(line[len(infer_count_str) :]) if line.startswith(infer_exec_str): - infer_exec_val = float(line[len(infer_exec_str):]) + infer_exec_val = float(line[len(infer_exec_str) :]) self.assertFalse( success_val != 4, "error: expected metric {} == 4, got {}".format( - success_str, success_val)) + success_str, success_val + ), + ) self.assertFalse( infer_count_val != 8, "error: expected metric {} == 8, got {}".format( - infer_count_str, infer_count_val)) + infer_count_str, infer_count_val + ), + ) self.assertFalse( infer_exec_val != 1, "error: expected metric {} == 1, got {}".format( - infer_exec_str, infer_exec_val)) + infer_exec_str, infer_exec_val + ), + ) def test_bool(self): - model_name = 'identity_bool' + model_name = "identity_bool" with self._shm_leak_detector.Probe() as shm_probe: with httpclient.InferenceServerClient("localhost:8000") as client: input_data = np.array([[True, False, True]], dtype=bool) inputs = [ - httpclient.InferInput("INPUT0", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertIsNotNone(output0) self.assertTrue(np.all(output0 == input_data)) @@ -326,21 +350,32 @@ def test_infer_pytorch(self): with httpclient.InferenceServerClient("localhost:8000") as client: input_data = np.zeros(shape, dtype=np.float32) inputs = [ - httpclient.InferInput("IN", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) result = client.infer(model_name, inputs) - output_data = result.as_numpy('OUT') + output_data = result.as_numpy("OUT") self.assertIsNotNone(output_data, "error: expected 'OUT'") - # expected inference resposne from a zero tensor + # expected inference response from a zero tensor expected_result = [ - -2.2377274, -2.3976364, -2.2464046, -2.2790744, -2.3828976, - -2.2940576, -2.2928185, -2.340665, -2.275219, -2.292135 + -2.2377274, + -2.3976364, + -2.2464046, + -2.2790744, + -2.3828976, + -2.2940576, + -2.2928185, + -2.340665, + -2.275219, + -2.292135, ] - self.assertTrue(np.allclose(output_data[0], expected_result), - 'Inference result is not correct') + self.assertTrue( + np.allclose(output_data[0], expected_result), + "Inference result is not correct", + ) def test_init_args(self): model_name = "init_args" @@ -349,15 +384,17 @@ def test_init_args(self): with httpclient.InferenceServerClient("localhost:8000") as client: input_data = np.zeros(shape, dtype=np.float32) inputs = [ - httpclient.InferInput("IN", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + httpclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) result = client.infer(model_name, inputs) - # output respone in this model is the number of keys in the args + # output response in this model is the number of keys in the args self.assertTrue( result.as_numpy("OUT") == 7, - "Number of keys in the init args is not correct") + "Number of keys in the init args is not correct", + ) def test_unicode(self): model_name = "string" @@ -367,19 +404,19 @@ def test_unicode(self): # 
np.object_ for i in range(2): with self._shm_leak_detector.Probe() as shm_probe: - with httpclient.InferenceServerClient( - "localhost:8000") as client: - utf8 = '😀' - input_data = np.array([bytes(utf8, encoding='utf-8')], - dtype=np.bytes_) + with httpclient.InferenceServerClient("localhost:8000") as client: + utf8 = "😀" + input_data = np.array( + [bytes(utf8, encoding="utf-8")], dtype=np.bytes_ + ) inputs = [ httpclient.InferInput( - "INPUT0", shape, - np_to_triton_dtype(input_data.dtype)) + "INPUT0", shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertIsNotNone(output0) self.assertEqual(output0[0], input_data) @@ -389,8 +426,7 @@ def test_optional_input(self): with self._shm_leak_detector.Probe() as shm_probe: for has_input0 in [True, False]: for has_input1 in [True, False]: - self._optional_input_infer(model_name, has_input0, - has_input1) + self._optional_input_infer(model_name, has_input0, has_input1) def test_string(self): model_name = "string_fixed" @@ -401,27 +437,25 @@ def test_string(self): # (empty output and fixed output) for i in range(4): with self._shm_leak_detector.Probe() as shm_probe: - with httpclient.InferenceServerClient( - "localhost:8000") as client: - input_data = np.array(['123456'], dtype=np.object_) + with httpclient.InferenceServerClient("localhost:8000") as client: + input_data = np.array(["123456"], dtype=np.object_) inputs = [ httpclient.InferInput( - "INPUT0", shape, - np_to_triton_dtype(input_data.dtype)) + "INPUT0", shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertIsNotNone(output0) if i % 2 == 0: - self.assertEqual(output0[0], - input_data.astype(np.bytes_)) + self.assertEqual(output0[0], input_data.astype(np.bytes_)) else: self.assertEqual(output0.size, 0) def test_non_contiguous(self): - model_name = 'non_contiguous' + model_name = "non_contiguous" shape = [2, 10, 11, 6, 5] new_shape = [10, 2, 6, 5, 11] shape_reorder = [1, 0, 4, 2, 3] @@ -429,8 +463,9 @@ def test_non_contiguous(self): input_numpy = np.random.rand(*shape) input_numpy = input_numpy.astype(np.float32) inputs = [ - httpclient.InferInput("INPUT0", shape, - np_to_triton_dtype(input_numpy.dtype)) + httpclient.InferInput( + "INPUT0", shape, np_to_triton_dtype(input_numpy.dtype) + ) ] inputs[0].set_data_from_numpy(input_numpy) result = client.infer(model_name, inputs) @@ -440,10 +475,10 @@ def test_non_contiguous(self): output1 = input_numpy.T output2 = np.transpose(input_numpy, shape_reorder) - self.assertTrue(np.all(output0 == result.as_numpy('OUTPUT0'))) - self.assertTrue(np.all(output1 == result.as_numpy('OUTPUT1'))) - self.assertTrue(np.all(output2 == result.as_numpy('OUTPUT2'))) + self.assertTrue(np.all(output0 == result.as_numpy("OUTPUT0"))) + self.assertTrue(np.all(output1 == result.as_numpy("OUTPUT1"))) + self.assertTrue(np.all(output2 == result.as_numpy("OUTPUT2"))) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/python_unittest.py b/qa/L0_backend_python/python_unittest.py old mode 100644 new mode 100755 index 9ff1b30e02..bff4dd57da --- a/qa/L0_backend_python/python_unittest.py +++ b/qa/L0_backend_python/python_unittest.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2021-2023, 
NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,16 +30,16 @@ sys.path.append("../../common") -import test_util as tu -import shm_util +import os import unittest + +import shm_util +import test_util as tu import tritonclient.grpc as grpcclient from tritonclient.utils import * -import os class PythonUnittest(tu.TestResultCollector): - def setUp(self): self._shm_leak_detector = shm_util.ShmLeakDetector() @@ -45,15 +47,15 @@ def _run_unittest(self, model_name): with grpcclient.InferenceServerClient("localhost:8001") as client: # No input is required result = client.infer(model_name, [], client_timeout=240) - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") - # The model returns 1 if the tests were sucessfully passed. + # The model returns 1 if the tests were successfully passed. # Otherwise, it will return 0. self.assertEqual(output0, [1]) def test_python_unittest(self): - model_name = os.environ['MODEL_NAME'] - bls_kind = os.environ.get('BLS_KIND', 'non_decoupled') + model_name = os.environ["MODEL_NAME"] + bls_kind = os.environ.get("BLS_KIND", "non_decoupled") if bls_kind == "decoupled": # Skip the shared memory probe for decoupled models for now as @@ -62,7 +64,11 @@ def test_python_unittest(self): # is bounded. self._run_unittest(model_name) else: - if model_name == 'bls' or model_name == 'bls_memory' or model_name == 'bls_memory_async': + if ( + model_name == "bls" + or model_name == "bls_memory" + or model_name == "bls_memory_async" + ): # For these tests, the memory region size will be grown. Because of # this we need to use the shared memory probe only on the later # call so that the probe can detect the leak correctly. @@ -77,5 +83,5 @@ def test_python_unittest(self): self._run_unittest(model_name) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/restart/models/restart/1/model.py b/qa/L0_backend_python/restart/models/restart/1/model.py old mode 100644 new mode 100755 index 72bce2933a..d7cb765ec9 --- a/qa/L0_backend_python/restart/models/restart/1/model.py +++ b/qa/L0_backend_python/restart/models/restart/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,29 +26,30 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import triton_python_backend_utils as pb_utils -import c_python_backend_utils as c_utils from os import path +import c_python_backend_utils as c_utils +import triton_python_backend_utils as pb_utils -class TritonPythonModel: +class TritonPythonModel: def execute(self, requests): # This function will be called once to record the free memory. Then, # the stub process will be killed to trigger Python backend restart. # After that this value will be read again to make sure that it matches # before restart. 
- file_name = 'free_memory.txt' + file_name = "free_memory.txt" current_free_memory = str(c_utils.shared_memory.free_memory()) if path.exists(file_name): - with open(file_name, 'r') as f: + with open(file_name, "r") as f: expected_free_memory = f.read() - assert expected_free_memory == current_free_memory, \ - (f'Free shared memory before and after restart are not equal. ' - '{expected_free_memory} (before) != {current_free_memory} (after).') + assert expected_free_memory == current_free_memory, ( + f"Free shared memory before and after restart are not equal. " + "{expected_free_memory} (before) != {current_free_memory} (after)." + ) else: - with open(file_name, 'w') as f: + with open(file_name, "w") as f: f.write(current_free_memory) responses = [] diff --git a/qa/L0_backend_python/restart/restart_test.py b/qa/L0_backend_python/restart/restart_test.py old mode 100644 new mode 100755 index cf5afcbdb1..4f4bf63082 --- a/qa/L0_backend_python/restart/restart_test.py +++ b/qa/L0_backend_python/restart/restart_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,30 +30,31 @@ sys.path.append("../../common") +import unittest + +import numpy as np import test_util as tu import tritonclient.http as httpclient from tritonclient.utils import * -import numpy as np -import unittest class RestartTest(tu.TestResultCollector): - def _infer_helper(self, model_name, shape, data_type): with httpclient.InferenceServerClient("localhost:8000") as client: input_data_0 = np.array(np.random.randn(*shape), dtype=data_type) inputs = [ - httpclient.InferInput("INPUT0", shape, - np_to_triton_dtype(input_data_0.dtype)) + httpclient.InferInput( + "INPUT0", shape, np_to_triton_dtype(input_data_0.dtype) + ) ] inputs[0].set_data_from_numpy(input_data_0) result = client.infer(model_name, inputs) - output0 = result.as_numpy('OUTPUT0') + output0 = result.as_numpy("OUTPUT0") self.assertTrue(np.all(input_data_0 == output0)) def test_restart(self): shape = [1, 16] - model_name = 'restart' + model_name = "restart" dtype = np.float32 # Since the stub process has been killed, the first request @@ -65,10 +68,10 @@ def test_restart(self): def test_infer(self): shape = [1, 16] - model_name = 'restart' + model_name = "restart" dtype = np.float32 self._infer_helper(model_name, shape, dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_backend_python/restart/test.sh b/qa/L0_backend_python/restart/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_backend_python/variants/test.sh b/qa/L0_backend_python/variants/test.sh old mode 100644 new mode 100755 index 24ceb1cf4c..65116cb2dc --- a/qa/L0_backend_python/variants/test.sh +++ b/qa/L0_backend_python/variants/test.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# Buidling a CPU build of Python backend +# Building a CPU build of Python backend source ../common.sh install_build_deps diff --git a/qa/L0_batch_custom/batch_custom_test.py b/qa/L0_batch_custom/batch_custom_test.py old mode 100644 new mode 100755 index 3fb74cf25d..6cd6346ad3 --- a/qa/L0_batch_custom/batch_custom_test.py +++ b/qa/L0_batch_custom/batch_custom_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,31 +30,32 @@ sys.path.append("../common") -from builtins import range import os -import time import threading +import time import unittest -import numpy as np +from builtins import range +from collections.abc import Iterable + import infer_util as iu +import numpy as np import test_util as tu -from collections.abc import Iterable import tritonclient.grpc as grpcclient # By default, find tritonserver on "localhost", but can be overridden # with TRITONSERVER_IPADDR envvar -_tritonserver_ipaddr = os.environ.get('TRITONSERVER_IPADDR', 'localhost') +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") _deferred_exceptions_lock = threading.Lock() _deferred_exceptions = [] class BatcherTest(tu.TestResultCollector): - def setUp(self): # The helper client for setup will be GRPC for simplicity. self.triton_client_ = grpcclient.InferenceServerClient( - f"{_tritonserver_ipaddr}:8001") + f"{_tritonserver_ipaddr}:8001" + ) self.precreated_shm_regions_ = [] global _deferred_exceptions _deferred_exceptions = [] @@ -71,35 +74,45 @@ def check_deferred_exception(self): if len(_deferred_exceptions) > 0: raise _deferred_exceptions[0] - def check_response(self, - trial, - bs, - thresholds, - requested_outputs=("OUTPUT0", "OUTPUT1"), - input_size=16, - shm_region_names=None, - precreated_shm_regions=None): + def check_response( + self, + trial, + bs, + thresholds, + requested_outputs=("OUTPUT0", "OUTPUT1"), + input_size=16, + shm_region_names=None, + precreated_shm_regions=None, + ): try: start_ms = int(round(time.time() * 1000)) - if trial == "savedmodel" or trial == "graphdef" or trial == "libtorch" \ - or trial == "onnx" or trial == "plan" or trial == "python": + if ( + trial == "savedmodel" + or trial == "graphdef" + or trial == "libtorch" + or trial == "onnx" + or trial == "plan" + or trial == "python" + ): tensor_shape = (bs, input_size) - iu.infer_exact(self, - trial, - tensor_shape, - bs, - np.float32, - np.float32, - np.float32, - swap=False, - model_version=1, - outputs=requested_outputs, - use_http=False, - use_grpc=False, - use_http_json_tensors=False, - skip_request_id_check=True, - use_streaming=False) + iu.infer_exact( + self, + trial, + tensor_shape, + bs, + np.float32, + np.float32, + np.float32, + swap=False, + model_version=1, + outputs=requested_outputs, + use_http=False, + use_grpc=False, + use_http_json_tensors=False, + skip_request_id_check=True, + use_streaming=False, + ) else: self.assertFalse(True, "unknown trial type: " + trial) @@ -110,79 +123,110 @@ def check_response(self, if lt_ms is not None: self.assertTrue( (end_ms - start_ms) < lt_ms, - "expected less than " + str(lt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected less than " + + str(lt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) if gt_ms is not None: self.assertTrue( (end_ms - start_ms) > gt_ms, - "expected greater than " + str(gt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " 
ms") + "expected greater than " + + str(gt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) except Exception as ex: self.add_deferred_exception(ex) - def check_status(self, model_name, batch_exec, request_cnt, infer_cnt, - exec_count): + def check_status(self, model_name, batch_exec, request_cnt, infer_cnt, exec_count): # There is a time window between when responses are returned and statistics are updated. # To prevent intermittent test failure during that window, wait up to 10 seconds for the # inference statistics to be ready. num_tries = 10 for i in range(num_tries): - stats = self.triton_client_.get_inference_statistics( - model_name, "1") + stats = self.triton_client_.get_inference_statistics(model_name, "1") self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") actual_exec_cnt = stats.model_stats[0].execution_count if actual_exec_cnt == exec_count: break - print("WARNING: expect {} executions, got {} (attempt {})".format( - exec_count, actual_exec_cnt, i)) + print( + "WARNING: expect {} executions, got {} (attempt {})".format( + exec_count, actual_exec_cnt, i + ) + ) time.sleep(1) - self.assertEqual(stats.model_stats[0].name, model_name, - "expect model stats for model {}".format(model_name)) self.assertEqual( - stats.model_stats[0].version, "1", - "expect model stats for model {} version 1".format(model_name)) + stats.model_stats[0].name, + model_name, + "expect model stats for model {}".format(model_name), + ) + self.assertEqual( + stats.model_stats[0].version, + "1", + "expect model stats for model {} version 1".format(model_name), + ) if batch_exec: batch_stats = stats.model_stats[0].batch_stats self.assertEqual( - len(batch_stats), len(batch_exec), + len(batch_stats), + len(batch_exec), "expected {} different batch-sizes, got {}".format( - len(batch_exec), len(batch_stats))) + len(batch_exec), len(batch_stats) + ), + ) for batch_stat in batch_stats: bs = batch_stat.batch_size bc = batch_stat.compute_infer.count - self.assertTrue(bs in batch_exec, - "unexpected batch-size {}".format(bs)) + self.assertTrue(bs in batch_exec, "unexpected batch-size {}".format(bs)) # Get count from one of the stats self.assertEqual( - bc, batch_exec[bs], - "expected model-execution-count {} for batch size {}, got {}" - .format(batch_exec[bs], bs, bc)) + bc, + batch_exec[bs], + "expected model-execution-count {} for batch size {}, got {}".format( + batch_exec[bs], bs, bc + ), + ) actual_request_cnt = stats.model_stats[0].inference_stats.success.count self.assertEqual( - actual_request_cnt, request_cnt, + actual_request_cnt, + request_cnt, "expected model-request-count {}, got {}".format( - request_cnt, actual_request_cnt)) + request_cnt, actual_request_cnt + ), + ) actual_exec_cnt = stats.model_stats[0].execution_count if isinstance(exec_count, Iterable): self.assertIn( - actual_exec_cnt, exec_count, + actual_exec_cnt, + exec_count, "expected model-exec-count {}, got {}".format( - exec_count, actual_exec_cnt)) + exec_count, actual_exec_cnt + ), + ) else: self.assertEqual( - actual_exec_cnt, exec_count, + actual_exec_cnt, + exec_count, "expected model-exec-count {}, got {}".format( - exec_count, actual_exec_cnt)) + exec_count, actual_exec_cnt + ), + ) actual_infer_cnt = stats.model_stats[0].inference_count self.assertEqual( - actual_infer_cnt, infer_cnt, + actual_infer_cnt, + infer_cnt, "expected model-inference-count {}, got {}".format( - infer_cnt, actual_infer_cnt)) + infer_cnt, actual_infer_cnt + ), + ) def test_volume_batching(self): # Send 12 requests with 
batch size 1. The max_queue_delay is set @@ -190,26 +234,30 @@ def test_volume_batching(self): # there can be either 4-6 model executions. model_base = "onnx" dtype = np.float16 - shapes = ([ - 1, - 4, - 4, - ],) + shapes = ( + [ + 1, + 4, + 4, + ], + ) try: # use threads to send 12 requests without waiting for response threads = [] for i in range(12): threads.append( - threading.Thread(target=iu.infer_zero, - args=(self, model_base, 1, dtype, shapes, - shapes), - kwargs={ - 'use_http': True, - 'use_grpc': False, - 'use_http_json_tensors': False, - 'use_streaming': False, - })) + threading.Thread( + target=iu.infer_zero, + args=(self, model_base, 1, dtype, shapes, shapes), + kwargs={ + "use_http": True, + "use_grpc": False, + "use_http_json_tensors": False, + "use_streaming": False, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -221,5 +269,5 @@ def test_volume_batching(self): self.assertTrue(False, "unexpected error {}".format(ex)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_batch_custom/test.sh b/qa/L0_batch_custom/test.sh index c957ec4515..11735e1470 100755 --- a/qa/L0_batch_custom/test.sh +++ b/qa/L0_batch_custom/test.sh @@ -125,7 +125,7 @@ for i in "${!test_setups[@]}"; do if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" cat $SERVER_LOG - exit 1 + exit 1 fi if [ `grep -c "Loading custom batching strategy" $SERVER_LOG` != "1" ]; then cat $SERVER_LOG @@ -157,7 +157,7 @@ done FILE_PATH="backend/examples/batching_strategies/volume_batching/src/volume_batching.cc" OLD_STRING="\/\/ Batcher will point to an unsigned integer representing the maximum" NEW_STRING="return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_NOT_FOUND,\"Failure test case\");" - + sed -i "s/${OLD_STRING}/${NEW_STRING}/g" ${FILE_PATH} (cd backend/examples/batching_strategies/volume_batching && diff --git a/qa/L0_batch_input/batch_input_test.py b/qa/L0_batch_input/batch_input_test.py old mode 100644 new mode 100755 index d5dfe2763d..02de27d921 --- a/qa/L0_batch_input/batch_input_test.py +++ b/qa/L0_batch_input/batch_input_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -28,19 +30,19 @@ sys.path.append("../common") +import queue import unittest -import numpy as np from functools import partial -import queue + +import numpy as np import test_util as tu import tritonclient.grpc as grpcclient from tritonclient.utils import InferenceServerException class BatchInputTest(tu.TestResultCollector): - def setUp(self): - self.client = grpcclient.InferenceServerClient(url='localhost:8001') + self.client = grpcclient.InferenceServerClient(url="localhost:8001") def callback(user_data, result, error): if error: @@ -55,28 +57,27 @@ def set_inputs(self, shapes, input_name): self.inputs = [] for shape in shapes: self.inputs.append( - [grpcclient.InferInput(input_name, [1, shape[0]], "FP32")]) + [grpcclient.InferInput(input_name, [1, shape[0]], "FP32")] + ) self.inputs[-1][0].set_data_from_numpy( - np.full([1, shape[0]], shape[0], np.float32)) + np.full([1, shape[0]], shape[0], np.float32) + ) def set_inputs_for_batch_item(self, shapes, input_name): self.dtype_ = np.float32 self.inputs = [] for shape in shapes: - self.inputs.append( - [grpcclient.InferInput(input_name, shape, "FP32")]) - self.inputs[-1][0].set_data_from_numpy( - np.full(shape, shape[0], np.float32)) + self.inputs.append([grpcclient.InferInput(input_name, shape, "FP32")]) + self.inputs[-1][0].set_data_from_numpy(np.full(shape, shape[0], np.float32)) def test_ragged_output(self): model_name = "ragged_io" # The model is an identity model self.set_inputs([[2], [4], [1], [3]], "INPUT0") user_data = queue.Queue() - self.client.start_stream( - callback=partial(self.client_callback, user_data)) + self.client.start_stream(callback=partial(self.client_callback, user_data)) - output_name = 'OUTPUT0' + output_name = "OUTPUT0" outputs = [grpcclient.InferRequestedOutput(output_name)] async_requests = [] @@ -84,9 +85,10 @@ def test_ragged_output(self): for input in self.inputs: # Asynchronous inference call. async_requests.append( - self.client.async_stream_infer(model_name=model_name, - inputs=input, - outputs=outputs)) + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) expected_value_list = [[v] * v for v in [2, 4, 1, 3]] expected_value_list = [ @@ -103,7 +105,9 @@ def test_ragged_output(self): self.assertTrue( np.array_equal(output_data, expected_value_list[idx]), "Expect response {} to have value {}, got {}".format( - idx, expected_value_list[idx], output_data)) + idx, expected_value_list[idx], output_data + ), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) self.client.stop_stream() @@ -112,19 +116,19 @@ def test_ragged_input(self): model_name = "ragged_acc_shape" self.set_inputs([[2], [4], [1], [3]], "RAGGED_INPUT") user_data = queue.Queue() - self.client.start_stream( - callback=partial(self.client_callback, user_data)) + self.client.start_stream(callback=partial(self.client_callback, user_data)) - output_name = 'RAGGED_OUTPUT' + output_name = "RAGGED_OUTPUT" outputs = [grpcclient.InferRequestedOutput(output_name)] async_requests = [] try: for input in self.inputs: # Asynchronous inference call. 
async_requests.append( - self.client.async_stream_infer(model_name=model_name, - inputs=input, - outputs=outputs)) + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) value_lists = [[v] * v for v in [2, 4, 1, 3]] expected_value = [] @@ -140,7 +144,9 @@ def test_ragged_input(self): self.assertTrue( np.array_equal(output_data, expected_value), "Expect response {} to have value {}, got {}".format( - idx, expected_value, output_data)) + idx, expected_value, output_data + ), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) self.client.stop_stream() @@ -149,10 +155,9 @@ def test_element_count(self): model_name = "ragged_element_count_acc_zero" self.set_inputs([[2], [4], [1], [3]], "RAGGED_INPUT") user_data = queue.Queue() - self.client.start_stream( - callback=partial(self.client_callback, user_data)) + self.client.start_stream(callback=partial(self.client_callback, user_data)) - output_name = 'BATCH_AND_SIZE_OUTPUT' + output_name = "BATCH_AND_SIZE_OUTPUT" outputs = [grpcclient.InferRequestedOutput(output_name)] async_requests = [] @@ -160,9 +165,10 @@ def test_element_count(self): for input in self.inputs: # Asynchronous inference call. async_requests.append( - self.client.async_stream_infer(model_name=model_name, - inputs=input, - outputs=outputs)) + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) expected_value = np.asarray([[2, 4, 1, 3]], np.float32) for idx in range(len(async_requests)): @@ -175,7 +181,9 @@ def test_element_count(self): self.assertTrue( np.array_equal(output_data, expected_value), "Expect response {} to have value {}, got {}".format( - idx, expected_value, output_data)) + idx, expected_value, output_data + ), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) self.client.stop_stream() @@ -184,10 +192,9 @@ def test_accumulated_element_count(self): model_name = "ragged_acc_shape" self.set_inputs([[2], [4], [1], [3]], "RAGGED_INPUT") user_data = queue.Queue() - self.client.start_stream( - callback=partial(self.client_callback, user_data)) + self.client.start_stream(callback=partial(self.client_callback, user_data)) - output_name = 'BATCH_AND_SIZE_OUTPUT' + output_name = "BATCH_AND_SIZE_OUTPUT" outputs = [grpcclient.InferRequestedOutput(output_name)] async_requests = [] @@ -195,9 +202,10 @@ def test_accumulated_element_count(self): for input in self.inputs: # Asynchronous inference call. 
async_requests.append( - self.client.async_stream_infer(model_name=model_name, - inputs=input, - outputs=outputs)) + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) expected_value = np.asarray([[2, 6, 7, 10]], np.float32) for idx in range(len(async_requests)): @@ -210,7 +218,9 @@ def test_accumulated_element_count(self): self.assertTrue( np.array_equal(output_data, expected_value), "Expect response {} to have value {}, got {}".format( - idx, expected_value, output_data)) + idx, expected_value, output_data + ), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) self.client.stop_stream() @@ -219,10 +229,9 @@ def test_accumulated_element_count_with_zero(self): model_name = "ragged_element_count_acc_zero" self.set_inputs([[2], [4], [1], [3]], "RAGGED_INPUT") user_data = queue.Queue() - self.client.start_stream( - callback=partial(self.client_callback, user_data)) + self.client.start_stream(callback=partial(self.client_callback, user_data)) - output_name = 'BATCH_OUTPUT' + output_name = "BATCH_OUTPUT" outputs = [grpcclient.InferRequestedOutput(output_name)] async_requests = [] @@ -230,9 +239,10 @@ def test_accumulated_element_count_with_zero(self): for input in self.inputs: # Asynchronous inference call. async_requests.append( - self.client.async_stream_infer(model_name=model_name, - inputs=input, - outputs=outputs)) + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) expected_value = np.asarray([[0, 2, 6, 7, 10]], np.float32) for idx in range(len(async_requests)): @@ -245,7 +255,9 @@ def test_accumulated_element_count_with_zero(self): self.assertTrue( np.array_equal(output_data, expected_value), "Expect response {} to have value {}, got {}".format( - idx, expected_value, output_data)) + idx, expected_value, output_data + ), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) self.client.stop_stream() @@ -254,10 +266,9 @@ def test_max_element_count_as_shape(self): model_name = "ragged_acc_shape" self.set_inputs([[2], [4], [1], [3]], "RAGGED_INPUT") user_data = queue.Queue() - self.client.start_stream( - callback=partial(self.client_callback, user_data)) + self.client.start_stream(callback=partial(self.client_callback, user_data)) - output_name = 'BATCH_OUTPUT' + output_name = "BATCH_OUTPUT" outputs = [grpcclient.InferRequestedOutput(output_name)] async_requests = [] @@ -265,9 +276,10 @@ def test_max_element_count_as_shape(self): for input in self.inputs: # Asynchronous inference call. async_requests.append( - self.client.async_stream_infer(model_name=model_name, - inputs=input, - outputs=outputs)) + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) for idx in range(len(async_requests)): # Get the result from the initiated asynchronous inference request. @@ -277,9 +289,12 @@ def test_max_element_count_as_shape(self): # Validate the results by comparing with precomputed values. 
output_data = result.as_numpy(output_name) self.assertEqual( - output_data.shape, (1, 4), - "Expect response {} to have shape to represent max element count {} among the batch , got {}" - .format(idx, 4, output_data.shape)) + output_data.shape, + (1, 4), + "Expect response {} to have shape to represent max element count {} among the batch , got {}".format( + idx, 4, output_data.shape + ), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) self.client.stop_stream() @@ -290,14 +305,14 @@ def test_batch_item_shape_flatten(self): # Note that the test only checks the formation of "BATCH_INPUT" where # the value of "RAGGED_INPUT" is irrelevant, only the shape matters self.set_inputs_for_batch_item( - [[1, 4, 1], [1, 1, 2], [1, 1, 2], [1, 2, 2]], "RAGGED_INPUT") + [[1, 4, 1], [1, 1, 2], [1, 1, 2], [1, 2, 2]], "RAGGED_INPUT" + ) model_name = "batch_item_flatten" user_data = queue.Queue() - self.client.start_stream( - callback=partial(self.client_callback, user_data)) + self.client.start_stream(callback=partial(self.client_callback, user_data)) - output_name = 'BATCH_OUTPUT' + output_name = "BATCH_OUTPUT" outputs = [grpcclient.InferRequestedOutput(output_name)] async_requests = [] @@ -305,9 +320,10 @@ def test_batch_item_shape_flatten(self): for input in self.inputs: # Asynchronous inference call. async_requests.append( - self.client.async_stream_infer(model_name=model_name, - inputs=input, - outputs=outputs)) + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) expected_value = np.asarray([[4, 1, 1, 2, 1, 2, 2, 2]], np.float32) for idx in range(len(async_requests)): @@ -320,7 +336,9 @@ def test_batch_item_shape_flatten(self): self.assertTrue( np.array_equal(output_data, expected_value), "Expect response {} to have value {}, got {}".format( - idx, expected_value, output_data)) + idx, expected_value, output_data + ), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) self.client.stop_stream() @@ -329,8 +347,9 @@ def test_batch_item_shape(self): # Use 3 set of inputs with shape [2, 1, 2], [1, 1, 2], [1, 2, 2] # Note that the test only checks the formation of "BATCH_INPUT" where # the value of "RAGGED_INPUT" is irrelevant, only the shape matters - self.set_inputs_for_batch_item([[2, 1, 2], [1, 1, 2], [1, 2, 2]], - "RAGGED_INPUT") + self.set_inputs_for_batch_item( + [[2, 1, 2], [1, 1, 2], [1, 2, 2]], "RAGGED_INPUT" + ) expected_outputs = [ np.array([[1.0, 2.0], [1.0, 2.0]]), @@ -340,10 +359,9 @@ def test_batch_item_shape(self): model_name = "batch_item" user_data = queue.Queue() - self.client.start_stream( - callback=partial(self.client_callback, user_data)) + self.client.start_stream(callback=partial(self.client_callback, user_data)) - output_name = 'BATCH_OUTPUT' + output_name = "BATCH_OUTPUT" outputs = [grpcclient.InferRequestedOutput(output_name)] async_requests = [] @@ -351,9 +369,10 @@ def test_batch_item_shape(self): for input in self.inputs: # Asynchronous inference call. async_requests.append( - self.client.async_stream_infer(model_name=model_name, - inputs=input, - outputs=outputs)) + self.client.async_stream_infer( + model_name=model_name, inputs=input, outputs=outputs + ) + ) for idx in range(len(async_requests)): # Get the result from the initiated asynchronous inference request. 
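The expected arrays hard-coded in the batch-input tests above all follow from the per-request element counts ([2, 4, 1, 3]) and, for the batch-item tests, from the per-item shapes with the batch dimension stripped. A small standalone sketch of that arithmetic (illustrative only, not part of the patch; variable names are ad hoc):

    import numpy as np
    from itertools import accumulate

    counts = [2, 4, 1, 3]  # elements per request, as built by set_inputs above

    # test_element_count: BATCH_AND_SIZE_OUTPUT -> [[2, 4, 1, 3]]
    element_count = np.asarray([counts], np.float32)

    # test_accumulated_element_count: BATCH_AND_SIZE_OUTPUT -> [[2, 6, 7, 10]]
    accumulated = np.asarray([list(accumulate(counts))], np.float32)

    # test_accumulated_element_count_with_zero: BATCH_OUTPUT -> [[0, 2, 6, 7, 10]]
    accumulated_with_zero = np.asarray([[0] + list(accumulate(counts))], np.float32)

    # test_max_element_count_as_shape only checks the output shape: (1, max(counts)) == (1, 4)
    max_element_count_shape = (1, max(counts))

    # test_batch_item_shape_flatten: per-item shapes without the batch dimension,
    # flattened in request order -> [[4, 1, 1, 2, 1, 2, 2, 2]]
    item_shapes = [[1, 4, 1], [1, 1, 2], [1, 1, 2], [1, 2, 2]]
    flattened = np.asarray([[d for s in item_shapes for d in s[1:]]], np.float32)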
@@ -364,13 +383,16 @@ def test_batch_item_shape(self): output_data = result.as_numpy(output_name) self.assertTrue( np.allclose(output_data, expected_outputs[idx]), - "Expect response to have value:\n{}, got:\n{}\nEqual matrix:\n{}" - .format(expected_outputs[idx], output_data, - np.isclose(expected_outputs[idx], output_data))) + "Expect response to have value:\n{}, got:\n{}\nEqual matrix:\n{}".format( + expected_outputs[idx], + output_data, + np.isclose(expected_outputs[idx], output_data), + ), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) self.client.stop_stream() -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_batch_input/test.sh b/qa/L0_batch_input/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_batcher/batcher_test.py b/qa/L0_batcher/batcher_test.py old mode 100644 new mode 100755 index 4600474442..38e208c21e --- a/qa/L0_batcher/batcher_test.py +++ b/qa/L0_batcher/batcher_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,25 +30,23 @@ sys.path.append("../common") -from builtins import range import os -import time import threading +import time import unittest -import numpy as np +from builtins import range + import infer_util as iu +import numpy as np import test_util as tu - import tritonclient.grpc as grpcclient # By default, find tritonserver on "localhost", but can be overridden # with TRITONSERVER_IPADDR envvar -_tritonserver_ipaddr = os.environ.get('TRITONSERVER_IPADDR', 'localhost') +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") -TEST_SYSTEM_SHARED_MEMORY = bool( - int(os.environ.get('TEST_SYSTEM_SHARED_MEMORY', 0))) -TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get('TEST_CUDA_SHARED_MEMORY', - 0))) +TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0))) +TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0))) if TEST_SYSTEM_SHARED_MEMORY: import tritonclient.utils.shared_memory as shm @@ -55,14 +55,13 @@ # Test with either GRPC of HTTP, but not both since when we check # results we expect only one to run -USE_GRPC = (os.environ.get('USE_GRPC', 1) != "0") -USE_HTTP = (os.environ.get('USE_HTTP', 1) != "0") +USE_GRPC = os.environ.get("USE_GRPC", 1) != "0" +USE_HTTP = os.environ.get("USE_HTTP", 1) != "0" if USE_GRPC and USE_HTTP: USE_GRPC = False assert USE_GRPC or USE_HTTP, "USE_GRPC or USE_HTTP must be non-zero" -BACKENDS = os.environ.get('BACKENDS', - "graphdef savedmodel onnx libtorch plan python") +BACKENDS = os.environ.get("BACKENDS", "graphdef savedmodel onnx libtorch plan python") _trials = BACKENDS.split(" ") @@ -81,11 +80,11 @@ class BatcherTest(tu.TestResultCollector): - def setUp(self): # The helper client for setup will be GRPC for simplicity. 
self.triton_client_ = grpcclient.InferenceServerClient( - f"{_tritonserver_ipaddr}:8001") + f"{_tritonserver_ipaddr}:8001" + ) self.precreated_shm_regions_ = [] global _deferred_exceptions _deferred_exceptions = [] @@ -107,19 +106,22 @@ def create_advance(self, shm_regions=None): if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: precreated_shm_regions = [] if shm_regions is None: - shm_regions = ['output0', 'output1'] + shm_regions = ["output0", "output1"] for shm_region in shm_regions: if TEST_SYSTEM_SHARED_MEMORY: shm_handle = shm.create_shared_memory_region( - shm_region + '_data', '/' + shm_region, 512) + shm_region + "_data", "/" + shm_region, 512 + ) self.triton_client_.register_system_shared_memory( - shm_region + '_data', '/' + shm_region, 512) + shm_region + "_data", "/" + shm_region, 512 + ) else: shm_handle = cudashm.create_shared_memory_region( - shm_region + '_data', 512, 0) + shm_region + "_data", 512, 0 + ) self.triton_client_.register_cuda_shared_memory( - shm_region + '_data', - cudashm.get_raw_handle(shm_handle), 0, 512) + shm_region + "_data", cudashm.get_raw_handle(shm_handle), 0, 512 + ) # Collect precreated handles for cleanup self.precreated_shm_regions_.append(shm_handle) precreated_shm_regions.append(shm_handle) @@ -137,19 +139,27 @@ def check_deferred_exception(self): if len(_deferred_exceptions) > 0: raise _deferred_exceptions[0] - def check_response(self, - trial, - bs, - thresholds, - requested_outputs=("OUTPUT0", "OUTPUT1"), - input_size=16, - shm_region_names=None, - precreated_shm_regions=None): + def check_response( + self, + trial, + bs, + thresholds, + requested_outputs=("OUTPUT0", "OUTPUT1"), + input_size=16, + shm_region_names=None, + precreated_shm_regions=None, + ): try: start_ms = int(round(time.time() * 1000)) - if trial == "savedmodel" or trial == "graphdef" or trial == "libtorch" \ - or trial == "onnx" or trial == "plan" or trial == "python": + if ( + trial == "savedmodel" + or trial == "graphdef" + or trial == "libtorch" + or trial == "onnx" + or trial == "plan" + or trial == "python" + ): tensor_shape = (bs, input_size) iu.infer_exact( self, @@ -170,7 +180,8 @@ def check_response(self, shm_region_names=shm_region_names, precreated_shm_regions=precreated_shm_regions, use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, - use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY) + use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY, + ) else: self.assertFalse(True, "unknown trial type: " + trial) @@ -181,86 +192,109 @@ def check_response(self, if lt_ms is not None: self.assertTrue( (end_ms - start_ms) < lt_ms, - "expected less than " + str(lt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected less than " + + str(lt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) if gt_ms is not None: self.assertTrue( (end_ms - start_ms) > gt_ms, - "expected greater than " + str(gt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected greater than " + + str(gt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) except Exception as ex: self.add_deferred_exception(ex) - def check_setup(self, model_name, preferred_batch_sizes, - max_queue_delay_us): + def check_setup(self, model_name, preferred_batch_sizes, max_queue_delay_us): # Make sure test.sh set up the correct batcher settings config = self.triton_client_.get_model_config(model_name).config bconfig = config.dynamic_batching - self.assertEqual(len(bconfig.preferred_batch_size), - len(preferred_batch_sizes)) + 
self.assertEqual(len(bconfig.preferred_batch_size), len(preferred_batch_sizes)) for i in preferred_batch_sizes: self.assertTrue(i in bconfig.preferred_batch_size) - self.assertEqual(bconfig.max_queue_delay_microseconds, - max_queue_delay_us) + self.assertEqual(bconfig.max_queue_delay_microseconds, max_queue_delay_us) - def check_status(self, model_name, batch_exec, request_cnt, infer_cnt, - exec_count): + def check_status(self, model_name, batch_exec, request_cnt, infer_cnt, exec_count): # There is a time window between when responses are returned and statistics are updated. # To prevent intermittent test failure during that window, wait up to 10 seconds for the # inference statistics to be ready. num_tries = 10 for i in range(num_tries): - stats = self.triton_client_.get_inference_statistics( - model_name, "1") + stats = self.triton_client_.get_inference_statistics(model_name, "1") self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") actual_exec_cnt = stats.model_stats[0].execution_count if actual_exec_cnt in exec_count: break - print("WARNING: expect {} executions, got {} (attempt {})".format( - exec_count, actual_exec_cnt, i)) + print( + "WARNING: expect {} executions, got {} (attempt {})".format( + exec_count, actual_exec_cnt, i + ) + ) time.sleep(1) - self.assertEqual(stats.model_stats[0].name, model_name, - "expect model stats for model {}".format(model_name)) self.assertEqual( - stats.model_stats[0].version, "1", - "expect model stats for model {} version 1".format(model_name)) + stats.model_stats[0].name, + model_name, + "expect model stats for model {}".format(model_name), + ) + self.assertEqual( + stats.model_stats[0].version, + "1", + "expect model stats for model {} version 1".format(model_name), + ) if batch_exec: batch_stats = stats.model_stats[0].batch_stats self.assertEqual( - len(batch_stats), len(batch_exec), + len(batch_stats), + len(batch_exec), "expected {} different batch-sizes, got {}".format( - len(batch_exec), len(batch_stats))) + len(batch_exec), len(batch_stats) + ), + ) for batch_stat in batch_stats: bs = batch_stat.batch_size bc = batch_stat.compute_infer.count - self.assertTrue(bs in batch_exec, - "unexpected batch-size {}".format(bs)) + self.assertTrue(bs in batch_exec, "unexpected batch-size {}".format(bs)) # Get count from one of the stats self.assertEqual( - bc, batch_exec[bs], - "expected model-execution-count {} for batch size {}, got {}" - .format(batch_exec[bs], bs, bc)) + bc, + batch_exec[bs], + "expected model-execution-count {} for batch size {}, got {}".format( + batch_exec[bs], bs, bc + ), + ) actual_request_cnt = stats.model_stats[0].inference_stats.success.count self.assertEqual( - actual_request_cnt, request_cnt, + actual_request_cnt, + request_cnt, "expected model-request-count {}, got {}".format( - request_cnt, actual_request_cnt)) + request_cnt, actual_request_cnt + ), + ) actual_exec_cnt = stats.model_stats[0].execution_count self.assertIn( - actual_exec_cnt, exec_count, - "expected model-exec-count {}, got {}".format( - exec_count, actual_exec_cnt)) + actual_exec_cnt, + exec_count, + "expected model-exec-count {}, got {}".format(exec_count, actual_exec_cnt), + ) actual_infer_cnt = stats.model_stats[0].inference_count self.assertEqual( - actual_infer_cnt, infer_cnt, + actual_infer_cnt, + infer_cnt, "expected model-inference-count {}, got {}".format( - infer_cnt, actual_infer_cnt)) + infer_cnt, actual_infer_cnt + ), + ) def test_static_batch_preferred(self): # Send two requests with static batch sizes == preferred @@ -269,20 
+303,25 @@ def test_static_batch_preferred(self): precreated_shm_regions = self.create_advance() for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) self.check_response( trial, - 2, (3000, None), - precreated_shm_regions=precreated_shm_regions) + 2, + (3000, None), + precreated_shm_regions=precreated_shm_regions, + ) self.check_response( trial, - 6, (3000, None), - precreated_shm_regions=precreated_shm_regions) + 6, + (3000, None), + precreated_shm_regions=precreated_shm_regions, + ) self.check_deferred_exception() self.check_status(model_name, {2: 1, 6: 1}, 2, 8, (2,)) except Exception as ex: @@ -295,16 +334,19 @@ def test_static_batch_lt_any_preferred(self): precreated_shm_regions = self.create_advance() for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) self.check_response( trial, - 1, (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), - precreated_shm_regions=precreated_shm_regions) + 1, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + precreated_shm_regions=precreated_shm_regions, + ) self.check_deferred_exception() self.check_status(model_name, {1: 1}, 1, 1, (1,)) except Exception as ex: @@ -317,16 +359,19 @@ def test_static_batch_not_preferred(self): precreated_shm_regions = self.create_advance() for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) self.check_response( trial, - 3, (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), - precreated_shm_regions=precreated_shm_regions) + 3, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + precreated_shm_regions=precreated_shm_regions, + ) self.check_deferred_exception() self.check_status(model_name, {3: 1}, 1, 3, (1,)) except Exception as ex: @@ -339,16 +384,19 @@ def test_static_batch_gt_max_preferred(self): precreated_shm_regions = self.create_advance() for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) self.check_response( trial, - 7, (3000, None), - precreated_shm_regions=precreated_shm_regions) + 7, + (3000, None), + precreated_shm_regions=precreated_shm_regions, + ) self.check_deferred_exception() self.check_status(model_name, {7: 1}, 1, 7, (1,)) except Exception as ex: @@ -369,25 +417,29 @@ def test_multi_batch_different_shape_allow_ragged(self): threads = [] threads.append( - threading.Thread(target=iu.infer_zero, - args=(self, trial, 1, dtype, ([1, 16],), - ([1, 16],)), - kwargs={ - 'use_grpc': USE_GRPC, - 'use_http': USE_HTTP, - 'use_http_json_tensors': False, - 'use_streaming': False - })) - threads.append( - threading.Thread(target=iu.infer_zero, - args=(self, trial, 1, dtype, ([1, 8],), 
- ([1, 8],)), - kwargs={ - 'use_grpc': USE_GRPC, - 'use_http': USE_HTTP, - 'use_http_json_tensors': False, - 'use_streaming': False - })) + threading.Thread( + target=iu.infer_zero, + args=(self, trial, 1, dtype, ([1, 16],), ([1, 16],)), + kwargs={ + "use_grpc": USE_GRPC, + "use_http": USE_HTTP, + "use_http_json_tensors": False, + "use_streaming": False, + }, + ) + ) + threads.append( + threading.Thread( + target=iu.infer_zero, + args=(self, trial, 1, dtype, ([1, 8],), ([1, 8],)), + kwargs={ + "use_grpc": USE_GRPC, + "use_http": USE_HTTP, + "use_http_json_tensors": False, + "use_streaming": False, + }, + ) + ) threads[0].start() threads[1].start() for t in threads: @@ -405,17 +457,18 @@ def test_multi_batch_different_shape(self): # immediately and the second delayed by the max batch queue # delay if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] else: shm0_region_names = None shm1_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) @@ -426,20 +479,27 @@ def test_multi_batch_different_shape(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'input_size': 16, - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "input_size": 16, + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, - args=(trial, 1, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), + args=( + trial, + 1, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), kwargs={ - 'input_size': 8, - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "input_size": 8, + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() time.sleep(1) threads[1].start() @@ -457,17 +517,18 @@ def test_multi_batch_not_preferred(self): # delay (minus the difference in time that they arrived in the # queue) if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] else: shm0_region_names = None shm1_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) 
self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) @@ -476,21 +537,31 @@ def test_multi_batch_not_preferred(self): threads.append( threading.Thread( target=self.check_response, - args=(trial, 1, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), + args=( + trial, + 1, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, - args=(trial, 3, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms - 2000)), + args=( + trial, + 3, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms - 2000), + ), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() time.sleep(1) threads[1].start() @@ -508,20 +579,21 @@ def test_multi_batch_not_preferred_different_shape(self): # two requests to be immediately responded to and the third # response to be delayed by the max batch queue delay. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] else: shm0_region_names = None shm1_region_names = None shm2_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) @@ -532,27 +604,36 @@ def test_multi_batch_not_preferred_different_shape(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 3, (6000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, - args=(trial, 1, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), + args=( + trial, + 1, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), kwargs={ - 'input_size': 8, - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "input_size": 8, + "shm_region_names": shm2_region_names, + 
"precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads[0].start() threads[1].start() time.sleep(1) @@ -573,23 +654,24 @@ def test_multi_batch_preferred_different_shape(self): # preferred size so that third and forth response are sent # immediately. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] - shm3_region_names = ['ip30', 'ip31', 'op30', 'op31'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] else: shm0_region_names = None shm1_region_names = None shm2_region_names = None shm3_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) - precreated_shm3_regions = self.create_advance(['op30', 'op31']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) @@ -600,35 +682,43 @@ def test_multi_batch_preferred_different_shape(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 3, (6000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'input_size': 8, - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "input_size": 8, + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 5, (6000, None)), kwargs={ - 'input_size': 8, - 'shm_region_names': shm3_region_names, - 'precreated_shm_regions': precreated_shm3_regions - })) + "input_size": 8, + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) threads[0].start() threads[1].start() time.sleep(1) @@ -648,17 +738,18 @@ def test_multi_batch_gt_max_preferred(self): # be processed by the dynamic batcher. This should cause both # responses to be returned immediately. 
if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] else: shm0_region_names = None shm1_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) @@ -669,17 +760,21 @@ def test_multi_batch_gt_max_preferred(self): target=self.check_response, args=(trial, 3, (3000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 7, (3000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() time.sleep(1) threads[1].start() @@ -700,17 +795,18 @@ def test_multi_batch_sum_gt_max_preferred(self): # since it alone is not greater than max preferred size, will # be delayed. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] else: shm0_region_names = None shm1_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) @@ -721,18 +817,25 @@ def test_multi_batch_sum_gt_max_preferred(self): target=self.check_response, args=(trial, 3, (3000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, - args=(trial, 4, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), + args=( + trial, + 4, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() time.sleep(1) threads[1].start() @@ -748,17 +851,18 @@ def test_multi_same_output0(self): 
# batched and get the correct response even though they don't # request both outputs. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00'] - shm1_region_names = ['ip10', 'ip11', 'op10'] + shm0_region_names = ["ip00", "ip01", "op00"] + shm1_region_names = ["ip10", "ip11", "op10"] else: shm0_region_names = None shm1_region_names = None - precreated_shm0_regions = self.create_advance(['op00']) - precreated_shm1_regions = self.create_advance(['op10']) + precreated_shm0_regions = self.create_advance(["op00"]) + precreated_shm1_regions = self.create_advance(["op10"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) @@ -770,19 +874,23 @@ def test_multi_same_output0(self): target=self.check_response, args=(trial, 1, (3000, None)), kwargs={ - 'requested_outputs': ("OUTPUT0",), - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "requested_outputs": ("OUTPUT0",), + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (3000, None)), kwargs={ - 'requested_outputs': ("OUTPUT0",), - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "requested_outputs": ("OUTPUT0",), + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() threads[1].start() for t in threads: @@ -797,17 +905,18 @@ def test_multi_same_output1(self): # batched and get the correct response even though they don't # request both outputs. 
if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op11'] + shm0_region_names = ["ip00", "ip01", "op01"] + shm1_region_names = ["ip10", "ip11", "op11"] else: shm0_region_names = None shm1_region_names = None - precreated_shm0_regions = self.create_advance(['op01']) - precreated_shm1_regions = self.create_advance(['op11']) + precreated_shm0_regions = self.create_advance(["op01"]) + precreated_shm1_regions = self.create_advance(["op11"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) @@ -819,19 +928,23 @@ def test_multi_same_output1(self): target=self.check_response, args=(trial, 1, (3000, None)), kwargs={ - 'requested_outputs': ("OUTPUT1",), - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "requested_outputs": ("OUTPUT1",), + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (3000, None)), kwargs={ - 'requested_outputs': ("OUTPUT1",), - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "requested_outputs": ("OUTPUT1",), + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() threads[1].start() for t in threads: @@ -847,17 +960,18 @@ def test_multi_different_outputs(self): # batched and get the correct response even though they don't # request both outputs. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00'] - shm1_region_names = ['ip10', 'ip11', 'op11'] + shm0_region_names = ["ip00", "ip01", "op00"] + shm1_region_names = ["ip10", "ip11", "op11"] else: shm0_region_names = None shm1_region_names = None - precreated_shm0_regions = self.create_advance(['op00']) - precreated_shm1_regions = self.create_advance(['op11']) + precreated_shm0_regions = self.create_advance(["op00"]) + precreated_shm1_regions = self.create_advance(["op11"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) @@ -869,19 +983,23 @@ def test_multi_different_outputs(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'requested_outputs': ("OUTPUT0",), - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "requested_outputs": ("OUTPUT0",), + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'requested_outputs': ("OUTPUT1",), - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "requested_outputs": ("OUTPUT1",), + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() threads[1].start() for t in threads: @@ -896,15 +1014,16 @@ def test_multi_different_output_order(self): # different order. 
They should be batched and get the correct # response even though they use different order. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op11', 'op10'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op11", "op10"] else: shm0_region_names = None shm1_region_names = None for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) @@ -912,21 +1031,25 @@ def test_multi_different_output_order(self): threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(trial, 1, (6000, None)), - kwargs={ - 'requested_outputs': - ("OUTPUT0", "OUTPUT1"), - 'shm_region_names': shm0_region_names - })) - threads.append( - threading.Thread(target=self.check_response, - args=(trial, 1, (6000, None)), - kwargs={ - 'requested_outputs': - ("OUTPUT1", "OUTPUT0"), - 'shm_region_names': shm1_region_names - })) + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "requested_outputs": ("OUTPUT0", "OUTPUT1"), + "shm_region_names": shm0_region_names, + }, + ) + ) + threads.append( + threading.Thread( + target=self.check_response, + args=(trial, 1, (6000, None)), + kwargs={ + "requested_outputs": ("OUTPUT1", "OUTPUT0"), + "shm_region_names": shm1_region_names, + }, + ) + ) threads[0].start() threads[1].start() for t in threads: @@ -946,24 +1069,24 @@ def test_multi_batch_delayed_sum_gt_max_preferred(self): # immediately but the second response, since it alone is not # greater than max preferred size, will be delayed. 
if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] else: shm0_region_names = None shm1_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) # Need scheduler to wait for queue to contain 2 requests self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 2) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 2) threads = [] threads.append( @@ -971,18 +1094,25 @@ def test_multi_batch_delayed_sum_gt_max_preferred(self): target=self.check_response, args=(trial, 3, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, - args=(trial, 4, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), + args=( + trial, + 4, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads[0].start() time.sleep(1) threads[1].start() @@ -996,7 +1126,7 @@ def test_multi_batch_delayed_sum_gt_max_preferred(self): def test_multi_batch_delayed_use_max_batch(self): # Send three requests with first not having preferred size, # second being smaller than max preferred size but the sum of - # the requests being larger than max preferred size and thrid + # the requests being larger than max preferred size and third # is sent after the first two requests exceeds the queue delay # and the sum of the requests to be in full batch. Use # TRITONSERVER_DELAY_SCHEDULER in the environment so that @@ -1005,55 +1135,67 @@ def test_multi_batch_delayed_use_max_batch(self): # while it appears that the first two responses to be returned # after being delayed and the third response to be returned immediately. 
if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] else: shm0_region_names = None shm1_region_names = None shm2_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) # Need scheduler to wait for queue to contain 3 requests self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 3) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 3) threads = [] threads.append( threading.Thread( target=self.check_response, - args=(trial, 3, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), + args=( + trial, + 3, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, - args=(trial, 4, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), + args=( + trial, + 4, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads[0].start() threads[1].start() time.sleep(11) @@ -1076,30 +1218,30 @@ def test_multi_batch_delayed_preferred_different_shape(self): # shape as the third that causes a preferred size so that # third and forth response are sent immediately. 
if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] - shm3_region_names = ['ip30', 'ip31', 'op30', 'op31'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] else: shm0_region_names = None shm1_region_names = None shm2_region_names = None shm3_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) - precreated_shm3_regions = self.create_advance(['op30', 'op31']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) # Need scheduler to wait for queue to contain 4 requests self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 4) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 4) threads = [] threads.append( @@ -1107,35 +1249,43 @@ def test_multi_batch_delayed_preferred_different_shape(self): target=self.check_response, args=(trial, 1, (3000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 3, (3000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (3000, None)), kwargs={ - 'input_size': 8, - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "input_size": 8, + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 5, (3000, None)), kwargs={ - 'input_size': 8, - 'shm_region_names': shm3_region_names, - 'precreated_shm_regions': precreated_shm3_regions - })) + "input_size": 8, + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) threads[0].start() threads[1].start() time.sleep(1) @@ -1155,12 +1305,12 @@ def test_multi_batch_use_biggest_preferred(self): # that requests can be queued up before scheduler starts # servicing. 
if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] - shm3_region_names = ['ip30', 'ip31', 'op30', 'op31'] - shm4_region_names = ['ip40', 'ip41', 'op40', 'op41'] - shm5_region_names = ['ip50', 'ip51', 'op50', 'op51'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] + shm4_region_names = ["ip40", "ip41", "op40", "op41"] + shm5_region_names = ["ip50", "ip51", "op50", "op51"] else: shm0_region_names = None shm1_region_names = None @@ -1168,23 +1318,23 @@ def test_multi_batch_use_biggest_preferred(self): shm3_region_names = None shm4_region_names = None shm5_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) - precreated_shm3_regions = self.create_advance(['op30', 'op31']) - precreated_shm4_regions = self.create_advance(['op40', 'op41']) - precreated_shm5_regions = self.create_advance(['op50', 'op51']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) + precreated_shm4_regions = self.create_advance(["op40", "op41"]) + precreated_shm5_regions = self.create_advance(["op50", "op51"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) # Need scheduler to wait for queue to contain 6 request self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 6) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 6) threads = [] threads.append( @@ -1192,49 +1342,61 @@ def test_multi_batch_use_biggest_preferred(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm3_region_names, - 'precreated_shm_regions': precreated_shm3_regions - })) + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) threads.append( 
threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm4_region_names, - 'precreated_shm_regions': precreated_shm4_regions - })) + "shm_region_names": shm4_region_names, + "precreated_shm_regions": precreated_shm4_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm5_region_names, - 'precreated_shm_regions': precreated_shm5_regions - })) + "shm_region_names": shm5_region_names, + "precreated_shm_regions": precreated_shm5_regions, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -1253,27 +1415,27 @@ def test_multi_batch_use_best_preferred(self): # that requests can be queued up before scheduler starts # servicing. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] else: shm0_region_names = None shm1_region_names = None shm2_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [2, 6], _max_queue_delay_ms * 1000) # Need scheduler to wait for queue to contain 3 requests self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 3) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 3) threads = [] threads.append( @@ -1281,26 +1443,35 @@ def test_multi_batch_use_best_preferred(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, - args=(trial, 1, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), + args=( + trial, + 1, + (_max_queue_delay_ms * 1.5, _max_queue_delay_ms), + ), kwargs={ - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads[0].start() threads[1].start() time.sleep(1) @@ -1315,41 +1486,36 @@ def test_multi_batch_use_best_preferred(self): def test_multi_batch_preserve_ordering(self): model_base = "custom" dtype = np.float32 - shapes = ([ - 1, - 1, - ],) + shapes = ( + [ + 1, + 1, + ], + ) 
try: # use threads to send 12 requests without waiting for response threads = [] for i in range(12): if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm_region_name_prefix = [ - "input" + str(i), "output" + str(i) - ] + shm_region_name_prefix = ["input" + str(i), "output" + str(i)] else: shm_region_name_prefix = None threads.append( - threading.Thread(target=iu.infer_zero, - args=(self, model_base, 1, dtype, shapes, - shapes), - kwargs={ - 'use_grpc': - USE_GRPC, - 'use_http': - USE_HTTP, - 'use_http_json_tensors': - False, - 'use_streaming': - False, - 'shm_region_name_prefix': - shm_region_name_prefix, - 'use_system_shared_memory': - TEST_SYSTEM_SHARED_MEMORY, - 'use_cuda_shared_memory': - TEST_CUDA_SHARED_MEMORY - })) + threading.Thread( + target=iu.infer_zero, + args=(self, model_base, 1, dtype, shapes, shapes), + kwargs={ + "use_grpc": USE_GRPC, + "use_http": USE_HTTP, + "use_http_json_tensors": False, + "use_streaming": False, + "shm_region_name_prefix": shm_region_name_prefix, + "use_system_shared_memory": TEST_SYSTEM_SHARED_MEMORY, + "use_cuda_shared_memory": TEST_CUDA_SHARED_MEMORY, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -1367,30 +1533,30 @@ def test_preferred_batch_only_aligned(self): # servicing. The batcher should form a batch of preferred # size 4. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] - shm3_region_names = ['ip30', 'ip31', 'op30', 'op31'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] else: shm0_region_names = None shm1_region_names = None shm2_region_names = None shm3_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) - precreated_shm3_regions = self.create_advance(['op30', 'op31']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [4, 6], 0) # Need scheduler to wait for queue to contain 4 requests self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 4) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 4) threads = [] threads.append( @@ -1398,33 +1564,41 @@ def test_preferred_batch_only_aligned(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + 
"precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm3_region_names, - 'precreated_shm_regions': precreated_shm3_regions - })) + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -1441,33 +1615,33 @@ def test_preferred_batch_only_unaligned(self): # servicing. The batcher should form a batch of preferred # size 4 followed by a batch of size 1. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] - shm3_region_names = ['ip30', 'ip31', 'op30', 'op31'] - shm4_region_names = ['ip40', 'ip41', 'op40', 'op41'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] + shm4_region_names = ["ip40", "ip41", "op40", "op41"] else: shm0_region_names = None shm1_region_names = None shm2_region_names = None shm3_region_names = None shm4_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) - precreated_shm3_regions = self.create_advance(['op30', 'op31']) - precreated_shm4_regions = self.create_advance(['op40', 'op41']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) + precreated_shm4_regions = self.create_advance(["op40", "op41"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [4, 6], 0) # Need scheduler to wait for queue to contain 3 requests self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 5) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 5) threads = [] threads.append( @@ -1475,41 +1649,51 @@ def test_preferred_batch_only_unaligned(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, 
None)), kwargs={ - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm3_region_names, - 'precreated_shm_regions': precreated_shm3_regions - })) + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm4_region_names, - 'precreated_shm_regions': precreated_shm4_regions - })) + "shm_region_names": shm4_region_names, + "precreated_shm_regions": precreated_shm4_regions, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -1526,13 +1710,13 @@ def test_preferred_batch_only_use_biggest_preferred(self): # servicing. The batcher should form a batch of largest preferred # size 6 followed by a batch of size 1. if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] - shm3_region_names = ['ip30', 'ip31', 'op30', 'op31'] - shm4_region_names = ['ip40', 'ip41', 'op40', 'op41'] - shm5_region_names = ['ip50', 'ip51', 'op50', 'op51'] - shm6_region_names = ['ip60', 'ip61', 'op60', 'op61'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] + shm3_region_names = ["ip30", "ip31", "op30", "op31"] + shm4_region_names = ["ip40", "ip41", "op40", "op41"] + shm5_region_names = ["ip50", "ip51", "op50", "op51"] + shm6_region_names = ["ip60", "ip61", "op60", "op61"] else: shm0_region_names = None shm1_region_names = None @@ -1541,24 +1725,24 @@ def test_preferred_batch_only_use_biggest_preferred(self): shm4_region_names = None shm5_region_names = None shm6_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) - precreated_shm3_regions = self.create_advance(['op30', 'op31']) - precreated_shm4_regions = self.create_advance(['op40', 'op41']) - precreated_shm5_regions = self.create_advance(['op50', 'op51']) - precreated_shm6_regions = self.create_advance(['op60', 'op61']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) + precreated_shm3_regions = self.create_advance(["op30", "op31"]) + precreated_shm4_regions = self.create_advance(["op40", "op41"]) + precreated_shm5_regions = self.create_advance(["op50", "op51"]) + precreated_shm6_regions = self.create_advance(["op60", "op61"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [4, 6], 0) # Need scheduler to wait for queue to contain 6 request self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 7) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 7) threads = [] 
threads.append( @@ -1566,57 +1750,71 @@ def test_preferred_batch_only_use_biggest_preferred(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm3_region_names, - 'precreated_shm_regions': precreated_shm3_regions - })) + "shm_region_names": shm3_region_names, + "precreated_shm_regions": precreated_shm3_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm4_region_names, - 'precreated_shm_regions': precreated_shm4_regions - })) + "shm_region_names": shm4_region_names, + "precreated_shm_regions": precreated_shm4_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm5_region_names, - 'precreated_shm_regions': precreated_shm5_regions - })) + "shm_region_names": shm5_region_names, + "precreated_shm_regions": precreated_shm5_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm6_region_names, - 'precreated_shm_regions': precreated_shm6_regions - })) + "shm_region_names": shm6_region_names, + "precreated_shm_regions": precreated_shm6_regions, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -1632,27 +1830,27 @@ def test_preferred_batch_only_use_no_preferred_size(self): # requests can be queued up before scheduler starts # servicing. The batcher should form a batch of of 3. 
if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm0_region_names = ['ip00', 'ip01', 'op00', 'op01'] - shm1_region_names = ['ip10', 'ip11', 'op10', 'op11'] - shm2_region_names = ['ip20', 'ip21', 'op20', 'op21'] + shm0_region_names = ["ip00", "ip01", "op00", "op01"] + shm1_region_names = ["ip10", "ip11", "op10", "op11"] + shm2_region_names = ["ip20", "ip21", "op20", "op21"] else: shm0_region_names = None shm1_region_names = None shm2_region_names = None - precreated_shm0_regions = self.create_advance(['op00', 'op01']) - precreated_shm1_regions = self.create_advance(['op10', 'op11']) - precreated_shm2_regions = self.create_advance(['op20', 'op21']) + precreated_shm0_regions = self.create_advance(["op00", "op01"]) + precreated_shm1_regions = self.create_advance(["op10", "op11"]) + precreated_shm2_regions = self.create_advance(["op20", "op21"]) for trial in _trials: try: - model_name = tu.get_model_name(trial, np.float32, np.float32, - np.float32) + model_name = tu.get_model_name( + trial, np.float32, np.float32, np.float32 + ) self.check_setup(model_name, [4, 6], 0) # Need scheduler to wait for queue to contain 3 request self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 3) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 3) threads = [] threads.append( @@ -1660,25 +1858,31 @@ def test_preferred_batch_only_use_no_preferred_size(self): target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm0_region_names, - 'precreated_shm_regions': precreated_shm0_regions - })) + "shm_region_names": shm0_region_names, + "precreated_shm_regions": precreated_shm0_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm1_region_names, - 'precreated_shm_regions': precreated_shm1_regions - })) + "shm_region_names": shm1_region_names, + "precreated_shm_regions": precreated_shm1_regions, + }, + ) + ) threads.append( threading.Thread( target=self.check_response, args=(trial, 1, (6000, None)), kwargs={ - 'shm_region_names': shm2_region_names, - 'precreated_shm_regions': precreated_shm2_regions - })) + "shm_region_names": shm2_region_names, + "precreated_shm_regions": precreated_shm2_regions, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -1694,41 +1898,36 @@ def test_max_queue_delay_only_non_default(self): # there can be either 1 or 2 model executions. 
model_base = "custom" dtype = np.float32 - shapes = ([ - 1, - 1, - ],) + shapes = ( + [ + 1, + 1, + ], + ) try: # use threads to send 12 requests without waiting for response threads = [] for i in range(12): if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm_region_name_prefix = [ - "input" + str(i), "output" + str(i) - ] + shm_region_name_prefix = ["input" + str(i), "output" + str(i)] else: shm_region_name_prefix = None threads.append( - threading.Thread(target=iu.infer_zero, - args=(self, model_base, 1, dtype, shapes, - shapes), - kwargs={ - 'use_grpc': - USE_GRPC, - 'use_http': - USE_HTTP, - 'use_http_json_tensors': - False, - 'use_streaming': - False, - 'shm_region_name_prefix': - shm_region_name_prefix, - 'use_system_shared_memory': - TEST_SYSTEM_SHARED_MEMORY, - 'use_cuda_shared_memory': - TEST_CUDA_SHARED_MEMORY - })) + threading.Thread( + target=iu.infer_zero, + args=(self, model_base, 1, dtype, shapes, shapes), + kwargs={ + "use_grpc": USE_GRPC, + "use_http": USE_HTTP, + "use_http_json_tensors": False, + "use_streaming": False, + "shm_region_name_prefix": shm_region_name_prefix, + "use_system_shared_memory": TEST_SYSTEM_SHARED_MEMORY, + "use_cuda_shared_memory": TEST_CUDA_SHARED_MEMORY, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -1746,41 +1945,36 @@ def test_max_queue_delay_only_default(self): # and the remaining requests will form the second batch. model_base = "custom" dtype = np.float32 - shapes = ([ - 1, - 1, - ],) + shapes = ( + [ + 1, + 1, + ], + ) try: # use threads to send 12 requests without waiting for response threads = [] for i in range(12): if TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY: - shm_region_name_prefix = [ - "input" + str(i), "output" + str(i) - ] + shm_region_name_prefix = ["input" + str(i), "output" + str(i)] else: shm_region_name_prefix = None threads.append( - threading.Thread(target=iu.infer_zero, - args=(self, model_base, 1, dtype, shapes, - shapes), - kwargs={ - 'use_grpc': - USE_GRPC, - 'use_http': - USE_HTTP, - 'use_http_json_tensors': - False, - 'use_streaming': - False, - 'shm_region_name_prefix': - shm_region_name_prefix, - 'use_system_shared_memory': - TEST_SYSTEM_SHARED_MEMORY, - 'use_cuda_shared_memory': - TEST_CUDA_SHARED_MEMORY - })) + threading.Thread( + target=iu.infer_zero, + args=(self, model_base, 1, dtype, shapes, shapes), + kwargs={ + "use_grpc": USE_GRPC, + "use_http": USE_HTTP, + "use_http_json_tensors": False, + "use_streaming": False, + "shm_region_name_prefix": shm_region_name_prefix, + "use_system_shared_memory": TEST_SYSTEM_SHARED_MEMORY, + "use_cuda_shared_memory": TEST_CUDA_SHARED_MEMORY, + }, + ) + ) for t in threads: t.start() for t in threads: @@ -1792,5 +1986,5 @@ def test_max_queue_delay_only_default(self): self.assertTrue(False, "unexpected error {}".format(ex)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_batcher/test.sh b/qa/L0_batcher/test.sh old mode 100644 new mode 100755 index c13c249a3c..c5f8819276 --- a/qa/L0_batcher/test.sh +++ b/qa/L0_batcher/test.sh @@ -159,7 +159,7 @@ for BACKEND in $BACKENDS; do cp $onnx_model/output0_labels.txt models/$python_model cp ../python_models/add_sub/model.py models/$python_model/1/ else - cp -r $TMP_MODEL_DIR models/. + cp -r $TMP_MODEL_DIR models/. 
fi (cd models/$(basename $TMP_MODEL_DIR) && \ sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ diff --git a/qa/L0_batcher/verify_timestamps.py b/qa/L0_batcher/verify_timestamps.py old mode 100644 new mode 100755 index c39f560c73..3271135fcd --- a/qa/L0_batcher/verify_timestamps.py +++ b/qa/L0_batcher/verify_timestamps.py @@ -33,7 +33,7 @@ def verify_timestamps(traces, preserve): # Order traces by id - traces = sorted(traces, key=lambda t: t.get('id', -1)) + traces = sorted(traces, key=lambda t: t.get("id", -1)) # Filter the trace that is not meaningful and group them by 'id' filtered_traces = dict() @@ -41,7 +41,7 @@ def verify_timestamps(traces, preserve): for trace in traces: if "id" not in trace: continue - # Skip GRPC traces as actual traces are not genarated via GRPC, + # Skip GRPC traces as actual traces are not generated via GRPC, # thus GRPC traces are ill-formed if "timestamps" in trace: is_grpc = False @@ -53,16 +53,16 @@ def verify_timestamps(traces, preserve): grpc_id_offset += 1 continue - if (trace['id'] in filtered_traces.keys()): - rep_trace = filtered_traces[trace['id']] - # Apend the timestamp to the trace representing this 'id' + if trace["id"] in filtered_traces.keys(): + rep_trace = filtered_traces[trace["id"]] + # Append the timestamp to the trace representing this 'id' if "timestamps" in trace: rep_trace["timestamps"] += trace["timestamps"] else: # Use this trace to represent this 'id' if "timestamps" not in trace: trace["timestamps"] = [] - filtered_traces[trace['id']] = trace + filtered_traces[trace["id"]] = trace # First find the latest response complete timestamp for the batch with large delay large_delay_response_complete = 0 @@ -75,11 +75,11 @@ def verify_timestamps(traces, preserve): compute_span = timestamps["COMPUTE_END"] - timestamps["COMPUTE_START"] # If the 3rd batch is also processed by large delay instance, we don't # want to use its responses as baseline - if trace["id"] <= ( - 8 + grpc_id_offset) and compute_span >= 400 * 1000 * 1000: + if trace["id"] <= (8 + grpc_id_offset) and compute_span >= 400 * 1000 * 1000: response_complete = timestamps["INFER_RESPONSE_COMPLETE"] - large_delay_response_complete = max(large_delay_response_complete, - response_complete) + large_delay_response_complete = max( + large_delay_response_complete, response_complete + ) else: small_delay_traces.append(trace) @@ -93,8 +93,11 @@ def verify_timestamps(traces, preserve): response_request_after_large_delay_count += 1 # Hardcoded expected count here - print("responses after large delay count: {}".format( - response_request_after_large_delay_count)) + print( + "responses after large delay count: {}".format( + response_request_after_large_delay_count + ) + ) if preserve: # If preserve ordering, there must be large delay batch followed by # small delay batch and thus at least 4 responses are sent after @@ -105,15 +108,17 @@ def verify_timestamps(traces, preserve): return 0 if response_request_after_large_delay_count == 0 else 1 -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-p', - '--preserve', - action="store_true", - required=False, - default=False, - help='Timestamps is collected with preserve ordering') - parser.add_argument('file', type=argparse.FileType('r'), nargs='+') + parser.add_argument( + "-p", + "--preserve", + action="store_true", + required=False, + default=False, + help="Timestamps is collected with preserve ordering", + ) + parser.add_argument("file", type=argparse.FileType("r"), 
nargs="+") FLAGS = parser.parse_args() for f in FLAGS.file: diff --git a/qa/L0_buffer_attributes/buffer_attributes_test.py b/qa/L0_buffer_attributes/buffer_attributes_test.py old mode 100644 new mode 100755 index 907a469bab..7d61e082c5 --- a/qa/L0_buffer_attributes/buffer_attributes_test.py +++ b/qa/L0_buffer_attributes/buffer_attributes_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,28 +31,26 @@ sys.path.append("../common") import unittest + import numpy as np import test_util as tu - +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient import tritonclient.utils.cuda_shared_memory as cudashm from tritonclient.utils import triton_to_np_dtype -import tritonclient.http as httpclient -import tritonclient.grpc as grpcclient class BufferAttributesTest(tu.TestResultCollector): - def test_buffer_attributes(self): - model_name = 'bls' + model_name = "bls" # Infer clients = [ - httpclient.InferenceServerClient(url='localhost:8000'), - grpcclient.InferenceServerClient(url='localhost:8001') + httpclient.InferenceServerClient(url="localhost:8000"), + grpcclient.InferenceServerClient(url="localhost:8001"), ] triton_clients = [httpclient, grpcclient] for i, client in enumerate(clients): - # To make sure no shared memory regions are registered with the # server. client.unregister_system_shared_memory() @@ -59,8 +59,7 @@ def test_buffer_attributes(self): triton_client = triton_clients[i] inputs = [] outputs = [] - inputs.append(triton_client.InferInput('INPUT0', [1, 1000], - "INT32")) + inputs.append(triton_client.InferInput("INPUT0", [1, 1000], "INT32")) input0_data = np.arange(start=0, stop=1000, dtype=np.int32) input0_data = np.expand_dims(input0_data, axis=0) @@ -69,45 +68,55 @@ def test_buffer_attributes(self): output_byte_size = input_byte_size shm_ip0_handle = cudashm.create_shared_memory_region( - "input0_data", input_byte_size, 0) + "input0_data", input_byte_size, 0 + ) shm_op0_handle = cudashm.create_shared_memory_region( - "output0_data", output_byte_size, 0) + "output0_data", output_byte_size, 0 + ) client.register_cuda_shared_memory( - "input0_data", cudashm.get_raw_handle(shm_ip0_handle), 0, - input_byte_size) + "input0_data", + cudashm.get_raw_handle(shm_ip0_handle), + 0, + input_byte_size, + ) client.register_cuda_shared_memory( - "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, - input_byte_size) + "output0_data", + cudashm.get_raw_handle(shm_op0_handle), + 0, + input_byte_size, + ) cudashm.set_shared_memory_region(shm_ip0_handle, [input0_data]) inputs[0].set_shared_memory("input0_data", input_byte_size) if triton_client is grpcclient: - outputs.append(triton_client.InferRequestedOutput('OUTPUT0')) + outputs.append(triton_client.InferRequestedOutput("OUTPUT0")) outputs[0].set_shared_memory("output0_data", output_byte_size) else: outputs.append( - triton_client.InferRequestedOutput('OUTPUT0', - binary_data=True)) + triton_client.InferRequestedOutput("OUTPUT0", binary_data=True) + ) outputs[0].set_shared_memory("output0_data", output_byte_size) - results = client.infer(model_name=model_name, - inputs=inputs, - outputs=outputs) + results = client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) output0 = results.get_output("OUTPUT0") 
self.assertIsNotNone(output0) if triton_client is grpcclient: output0_data = cudashm.get_contents_as_numpy( - shm_op0_handle, triton_to_np_dtype(output0.datatype), - output0.shape) + shm_op0_handle, triton_to_np_dtype(output0.datatype), output0.shape + ) else: output0_data = cudashm.get_contents_as_numpy( - shm_op0_handle, triton_to_np_dtype(output0['datatype']), - output0['shape']) + shm_op0_handle, + triton_to_np_dtype(output0["datatype"]), + output0["shape"], + ) self.assertTrue(np.all(output0_data == input0_data)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_buffer_attributes/models/bls/1/model.py b/qa/L0_buffer_attributes/models/bls/1/model.py old mode 100644 new mode 100755 index 201d5a4a5e..6c035bb6a4 --- a/qa/L0_buffer_attributes/models/bls/1/model.py +++ b/qa/L0_buffer_attributes/models/bls/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,25 +31,26 @@ # Simple Python model that executes a BLS request on an identity model. class TritonPythonModel: - def execute(self, requests): responses = [] for request in requests: # Get INPUT0 - input0 = pb_utils.get_input_tensor_by_name(request, 'INPUT0') + input0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") infer_request = pb_utils.InferenceRequest( - model_name='identity', + model_name="identity", requested_output_names=["OUTPUT0"], - inputs=[input0]) + inputs=[input0], + ) infer_response = infer_request.exec() if infer_response.has_error(): - raise pb_utils.TritonModelException( - infer_response.error().message()) + raise pb_utils.TritonModelException(infer_response.error().message()) - inference_response = pb_utils.InferenceResponse(output_tensors=[ - pb_utils.get_output_tensor_by_name(infer_response, 'OUTPUT0') - ]) + inference_response = pb_utils.InferenceResponse( + output_tensors=[ + pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") + ] + ) responses.append(inference_response) return responses diff --git a/qa/L0_buffer_attributes/models/identity/1/model.py b/qa/L0_buffer_attributes/models/identity/1/model.py old mode 100644 new mode 100755 index 74b114deb7..933ed6d9c5 --- a/qa/L0_buffer_attributes/models/identity/1/model.py +++ b/qa/L0_buffer_attributes/models/identity/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,7 +30,6 @@ class TritonPythonModel: - def execute(self, requests): """ Identity model using DLPack in Python backend. 
@@ -36,7 +37,8 @@ def execute(self, requests): responses = [] for request in requests: input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0") - out_tensor = pb_utils.Tensor.from_dlpack("OUTPUT0", - input_tensor.to_dlpack()) + out_tensor = pb_utils.Tensor.from_dlpack( + "OUTPUT0", input_tensor.to_dlpack() + ) responses.append(pb_utils.InferenceResponse([out_tensor])) return responses diff --git a/qa/L0_buffer_attributes/test.sh b/qa/L0_buffer_attributes/test.sh old mode 100644 new mode 100755 index 52babf37e2..7e2f35d837 --- a/qa/L0_buffer_attributes/test.sh +++ b/qa/L0_buffer_attributes/test.sh @@ -1,4 +1,5 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/bin/bash +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions diff --git a/qa/L0_client_build_variants/test.sh b/qa/L0_client_build_variants/test.sh index be8ae2c15e..63eb34fa5a 100755 --- a/qa/L0_client_build_variants/test.sh +++ b/qa/L0_client_build_variants/test.sh @@ -40,7 +40,7 @@ apt update && apt install -y gpg wget && \ echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | \ tee /etc/apt/sources.list.d/kitware.list >/dev/null && \ apt-get update && \ - apt-get install -y --no-install-recommends cmake cmake-data + apt-get install -y --no-install-recommends cmake cmake-data cmake --version diff --git a/qa/L0_client_java/test.sh b/qa/L0_client_java/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_client_memory_growth/client_memory_mail.py b/qa/L0_client_memory_growth/client_memory_mail.py old mode 100644 new mode 100755 index 4662f4ba41..ef1703f2c3 --- a/qa/L0_client_memory_growth/client_memory_mail.py +++ b/qa/L0_client_memory_growth/client_memory_mail.py @@ -29,18 +29,22 @@ sys.path.append("../common") -import nightly_email_helper - import glob from datetime import date -if __name__ == '__main__': +import nightly_email_helper + +if __name__ == "__main__": today = date.today().strftime("%Y-%m-%d") subject = "Triton Client Memory Growth " + sys.argv[1] + " Summary: " + today memory_graphs = glob.glob("client_memory_growth*.log") write_up = "

<p>This test is run for both HTTP and GRPC protocols using C++ and Python test scripts. The max-allowed difference between mean and maximum memory usage is set to 10MB and 1MB for C++ and Python tests individually.</p>"
     write_up += "<p>• What to look for<br>A linear memory growth in the beginning of the graph is acceptable only when it is followed by a flat memory usage. If a linear memory growth is observed during the entire test then there is possibly a memory leak.</p>"
-    html_content = "<html><head></head><body><pre>" + write_up + "</pre></body></html>"
+    html_content = (
+        '<html><head></head><body><pre>'
+        + write_up
+        + '</pre></body></html>'
+    )
     for mem_graph in sorted(memory_graphs):
         html_content += "\n" + mem_graph + "\n"
         with open(mem_graph, "r") as f:
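
For readers skimming the hunk above: client_memory_mail.py only assembles an HTML summary of the memory-growth graphs and hands it off to the nightly_email_helper module. A minimal sketch of that assembly step, assuming nothing beyond the standard library (the build_report name and the omission of the actual send call are illustrative, not part of the file):

import glob
from datetime import date


def build_report(pattern="client_memory_growth*.log"):
    # Collect the per-protocol memory graphs written by the test run.
    memory_graphs = sorted(glob.glob(pattern))
    subject = "Triton Client Memory Growth Summary: " + date.today().strftime("%Y-%m-%d")

    # <pre> keeps the ASCII graphs aligned when the mail is rendered as HTML.
    body = "<html><head></head><body><pre>"
    for mem_graph in memory_graphs:
        body += "\n" + mem_graph + "\n"
        with open(mem_graph, "r") as f:
            body += f.read()
    body += "</pre></body></html>"
    return subject, body
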
diff --git a/qa/L0_client_memory_growth/models/custom_identity_int32/config.pbtxt b/qa/L0_client_memory_growth/models/custom_identity_int32/config.pbtxt
index 8d3a78baf4..6a2a76bde5 100644
--- a/qa/L0_client_memory_growth/models/custom_identity_int32/config.pbtxt
+++ b/qa/L0_client_memory_growth/models/custom_identity_int32/config.pbtxt
@@ -35,7 +35,7 @@ input [
     name: "INPUT0"
     data_type: TYPE_INT32
     dims: [ -1 ]
-    
+
   }
 ]
 output [
diff --git a/qa/L0_client_memory_growth/test.sh b/qa/L0_client_memory_growth/test.sh
index 8d90a649cf..73188812b2 100755
--- a/qa/L0_client_memory_growth/test.sh
+++ b/qa/L0_client_memory_growth/test.sh
@@ -117,7 +117,7 @@ for PROTOCOL in http grpc; do
             MEMORY_GROWTH_TEST=$MEMORY_GROWTH_TEST_CPP
             MAX_ALLOWED_ALLOC="10"
             # NOTE: This test has risk of exhausting all available sockets in
-            # the ephemeral port range. Re-using the same client connection 
+            # the ephemeral port range. Re-using the same client connection
             # ("-R") can easily solve this problem. However, to cleanly separate
             # the resources used by different client objects, we create new
             # connections for each request and retry/sleep on failure to give
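
The NOTE in this hunk is worth a concrete illustration: creating a brand-new client object for every request leaves a socket behind each time and can exhaust the ephemeral port range, whereas reusing a single client (what the C++ test's "-R" flag does) avoids it. A rough Python sketch of the two patterns, assuming a local server and borrowing the custom_identity_int32 model from this test purely for illustration:

import numpy as np
import tritonclient.http as httpclient


def run_inferences(count, reuse_connection=True):
    inputs = [httpclient.InferInput("INPUT0", [1, 16], "INT32")]
    inputs[0].set_data_from_numpy(np.zeros((1, 16), dtype=np.int32))

    if reuse_connection:
        # One client object serves all requests, so only a handful of
        # sockets are ever opened ("-R" behavior).
        client = httpclient.InferenceServerClient(url="localhost:8000")
        for _ in range(count):
            client.infer("custom_identity_int32", inputs)
    else:
        # A fresh client per request mirrors what this test does on purpose;
        # under load it can run the OS out of ephemeral ports, hence the
        # retry/sleep logic mentioned in the comment above.
        for _ in range(count):
            client = httpclient.InferenceServerClient(url="localhost:8000")
            client.infer("custom_identity_int32", inputs)
            client.close()
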
diff --git a/qa/L0_client_nobatch/client_test.py b/qa/L0_client_nobatch/client_test.py
old mode 100644
new mode 100755
index d3a0e5f596..ed6a3149df
--- a/qa/L0_client_nobatch/client_test.py
+++ b/qa/L0_client_nobatch/client_test.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -29,15 +31,15 @@
 sys.path.append("../common")
 
 import unittest
+
 import numpy as np
-import tritonhttpclient
+import test_util as tu
 import tritongrpcclient
+import tritonhttpclient
 from tritonclientutils import InferenceServerException
-import test_util as tu
 
 
 class ClientNoBatchTest(tu.TestResultCollector):
-
     def test_nobatch_request_for_batching_model(self):
         input_size = 16
 
@@ -46,53 +48,46 @@ def test_nobatch_request_for_batching_model(self):
         # input shapes.
         tensor_shape = (input_size,)
         for protocol in ["http", "grpc"]:
-            model_name = tu.get_model_name("graphdef", np.int32, np.int8,
-                                           np.int8)
-            in0 = np.random.randint(low=0,
-                                    high=100,
-                                    size=tensor_shape,
-                                    dtype=np.int32)
-            in1 = np.random.randint(low=0,
-                                    high=100,
-                                    size=tensor_shape,
-                                    dtype=np.int32)
+            model_name = tu.get_model_name("graphdef", np.int32, np.int8, np.int8)
+            in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
+            in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
 
             inputs = []
             outputs = []
             if protocol == "http":
                 triton_client = tritonhttpclient.InferenceServerClient(
-                    url='localhost:8000', verbose=True)
+                    url="localhost:8000", verbose=True
+                )
                 inputs.append(
-                    tritonhttpclient.InferInput('INPUT0', tensor_shape,
-                                                "INT32"))
+                    tritonhttpclient.InferInput("INPUT0", tensor_shape, "INT32")
+                )
                 inputs.append(
-                    tritonhttpclient.InferInput('INPUT1', tensor_shape,
-                                                "INT32"))
-                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
-                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
+                    tritonhttpclient.InferInput("INPUT1", tensor_shape, "INT32")
+                )
+                outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT0"))
+                outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT1"))
             else:
                 triton_client = tritongrpcclient.InferenceServerClient(
-                    url='localhost:8001', verbose=True)
+                    url="localhost:8001", verbose=True
+                )
                 inputs.append(
-                    tritongrpcclient.InferInput('INPUT0', tensor_shape,
-                                                "INT32"))
+                    tritongrpcclient.InferInput("INPUT0", tensor_shape, "INT32")
+                )
                 inputs.append(
-                    tritongrpcclient.InferInput('INPUT1', tensor_shape,
-                                                "INT32"))
-                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
-                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))
+                    tritongrpcclient.InferInput("INPUT1", tensor_shape, "INT32")
+                )
+                outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0"))
+                outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1"))
 
             # Initialize the data
             inputs[0].set_data_from_numpy(in0)
             inputs[1].set_data_from_numpy(in1)
 
             try:
-                results = triton_client.infer(model_name,
-                                              inputs,
-                                              outputs=outputs)
+                results = triton_client.infer(model_name, inputs, outputs=outputs)
                 self.assertTrue(
-                    False,
-                    "expected failure with no batch request for batching model")
+                    False, "expected failure with no batch request for batching model"
+                )
             except InferenceServerException as ex:
                 pass
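
The hunk above exercises the failure path: a model configured with a non-zero max_batch_size expects the batch dimension as the leading axis, so a bare (16,) tensor is rejected with an InferenceServerException. A short sketch of the shape that would succeed, using the same tritonhttpclient API as the test (the helper name is illustrative, not part of the test):

import numpy as np
import tritonhttpclient


def infer_batched(model_name, data):
    # Prepend the batch dimension: (16,) -> (1, 16). Without it, a
    # batching model rejects the request, which is what the test expects.
    batched = np.expand_dims(data, axis=0)
    client = tritonhttpclient.InferenceServerClient(url="localhost:8000")
    inp = tritonhttpclient.InferInput("INPUT0", list(batched.shape), "INT32")
    inp.set_data_from_numpy(batched)
    return client.infer(model_name, [inp])
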
 
@@ -104,53 +99,48 @@ def test_batch_request_for_nobatching_model(self):
         # is included in the shape
         tensor_shape = (1, input_size)
         for protocol in ["http", "grpc"]:
-            model_name = tu.get_model_name("graphdef_nobatch", np.int32,
-                                           np.int8, np.int8)
-            in0 = np.random.randint(low=0,
-                                    high=100,
-                                    size=tensor_shape,
-                                    dtype=np.int32)
-            in1 = np.random.randint(low=0,
-                                    high=100,
-                                    size=tensor_shape,
-                                    dtype=np.int32)
+            model_name = tu.get_model_name(
+                "graphdef_nobatch", np.int32, np.int8, np.int8
+            )
+            in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
+            in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
 
             inputs = []
             outputs = []
             if protocol == "http":
                 triton_client = tritonhttpclient.InferenceServerClient(
-                    url='localhost:8000', verbose=True)
+                    url="localhost:8000", verbose=True
+                )
                 inputs.append(
-                    tritonhttpclient.InferInput('INPUT0', tensor_shape,
-                                                "INT32"))
+                    tritonhttpclient.InferInput("INPUT0", tensor_shape, "INT32")
+                )
                 inputs.append(
-                    tritonhttpclient.InferInput('INPUT1', tensor_shape,
-                                                "INT32"))
-                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
-                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
+                    tritonhttpclient.InferInput("INPUT1", tensor_shape, "INT32")
+                )
+                outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT0"))
+                outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT1"))
             else:
                 triton_client = tritongrpcclient.InferenceServerClient(
-                    url='localhost:8001', verbose=True)
+                    url="localhost:8001", verbose=True
+                )
                 inputs.append(
-                    tritongrpcclient.InferInput('INPUT0', tensor_shape,
-                                                "INT32"))
+                    tritongrpcclient.InferInput("INPUT0", tensor_shape, "INT32")
+                )
                 inputs.append(
-                    tritongrpcclient.InferInput('INPUT1', tensor_shape,
-                                                "INT32"))
-                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
-                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))
+                    tritongrpcclient.InferInput("INPUT1", tensor_shape, "INT32")
+                )
+                outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0"))
+                outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1"))
 
             # Initialize the data
             inputs[0].set_data_from_numpy(in0)
             inputs[1].set_data_from_numpy(in1)
 
             try:
-                results = triton_client.infer(model_name,
-                                              inputs,
-                                              outputs=outputs)
+                results = triton_client.infer(model_name, inputs, outputs=outputs)
                 self.assertTrue(
                     False,
-                    "expected failure with batched request for non-batching model"
+                    "expected failure with batched request for non-batching model",
                 )
             except InferenceServerException as ex:
                 pass
@@ -163,41 +153,38 @@ def test_nobatch_request_for_nonbatching_model(self):
         # input shapes.
         tensor_shape = (input_size,)
         for protocol in ["http", "grpc"]:
-            model_name = tu.get_model_name("graphdef_nobatch", np.int32,
-                                           np.int8, np.int8)
-            in0 = np.random.randint(low=0,
-                                    high=100,
-                                    size=tensor_shape,
-                                    dtype=np.int32)
-            in1 = np.random.randint(low=0,
-                                    high=100,
-                                    size=tensor_shape,
-                                    dtype=np.int32)
+            model_name = tu.get_model_name(
+                "graphdef_nobatch", np.int32, np.int8, np.int8
+            )
+            in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
+            in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
 
             inputs = []
             outputs = []
             if protocol == "http":
                 triton_client = tritonhttpclient.InferenceServerClient(
-                    url='localhost:8000', verbose=True)
+                    url="localhost:8000", verbose=True
+                )
                 inputs.append(
-                    tritonhttpclient.InferInput('INPUT0', tensor_shape,
-                                                "INT32"))
+                    tritonhttpclient.InferInput("INPUT0", tensor_shape, "INT32")
+                )
                 inputs.append(
-                    tritonhttpclient.InferInput('INPUT1', tensor_shape,
-                                                "INT32"))
-                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
-                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
+                    tritonhttpclient.InferInput("INPUT1", tensor_shape, "INT32")
+                )
+                outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT0"))
+                outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT1"))
             else:
                 triton_client = tritongrpcclient.InferenceServerClient(
-                    url='localhost:8001', verbose=True)
+                    url="localhost:8001", verbose=True
+                )
                 inputs.append(
-                    tritongrpcclient.InferInput('INPUT0', tensor_shape,
-                                                "INT32"))
+                    tritongrpcclient.InferInput("INPUT0", tensor_shape, "INT32")
+                )
                 inputs.append(
-                    tritongrpcclient.InferInput('INPUT1', tensor_shape,
-                                                "INT32"))
-                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
-                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))
+                    tritongrpcclient.InferInput("INPUT1", tensor_shape, "INT32")
+                )
+                outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0"))
+                outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1"))
 
             # Initialize the data
             inputs[0].set_data_from_numpy(in0)
@@ -213,41 +200,36 @@ def test_batch_request_for_batching_model(self):
         # is included in the shape
         tensor_shape = (1, input_size)
         for protocol in ["http", "grpc"]:
-            model_name = tu.get_model_name("graphdef", np.int32, np.int8,
-                                           np.int8)
-            in0 = np.random.randint(low=0,
-                                    high=100,
-                                    size=tensor_shape,
-                                    dtype=np.int32)
-            in1 = np.random.randint(low=0,
-                                    high=100,
-                                    size=tensor_shape,
-                                    dtype=np.int32)
+            model_name = tu.get_model_name("graphdef", np.int32, np.int8, np.int8)
+            in0 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
+            in1 = np.random.randint(low=0, high=100, size=tensor_shape, dtype=np.int32)
 
             inputs = []
             outputs = []
             if protocol == "http":
                 triton_client = tritonhttpclient.InferenceServerClient(
-                    url='localhost:8000', verbose=True)
+                    url="localhost:8000", verbose=True
+                )
                 inputs.append(
-                    tritonhttpclient.InferInput('INPUT0', tensor_shape,
-                                                "INT32"))
+                    tritonhttpclient.InferInput("INPUT0", tensor_shape, "INT32")
+                )
                 inputs.append(
-                    tritonhttpclient.InferInput('INPUT1', tensor_shape,
-                                                "INT32"))
-                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT0'))
-                outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT1'))
+                    tritonhttpclient.InferInput("INPUT1", tensor_shape, "INT32")
+                )
+                outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT0"))
+                outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT1"))
             else:
                 triton_client = tritongrpcclient.InferenceServerClient(
-                    url='localhost:8001', verbose=True)
+                    url="localhost:8001", verbose=True
+                )
                 inputs.append(
-                    tritongrpcclient.InferInput('INPUT0', tensor_shape,
-                                                "INT32"))
+                    tritongrpcclient.InferInput("INPUT0", tensor_shape, "INT32")
+                )
                 inputs.append(
-                    tritongrpcclient.InferInput('INPUT1', tensor_shape,
-                                                "INT32"))
-                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0'))
-                outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1'))
+                    tritongrpcclient.InferInput("INPUT1", tensor_shape, "INT32")
+                )
+                outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0"))
+                outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1"))
 
             # Initialize the data
             inputs[0].set_data_from_numpy(in0)
@@ -256,5 +238,5 @@ def test_batch_request_for_batching_model(self):
             results = triton_client.infer(model_name, inputs, outputs=outputs)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_client_timeout/client_timeout_test.py b/qa/L0_client_timeout/client_timeout_test.py
old mode 100644
new mode 100755
index f85eec5084..af7ea768eb
--- a/qa/L0_client_timeout/client_timeout_test.py
+++ b/qa/L0_client_timeout/client_timeout_test.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -28,20 +30,19 @@
 
 sys.path.append("../common")
 
-from functools import partial
-import numpy as np
 import queue
-import unittest
 import socket
-import test_util as tu
+import unittest
+from functools import partial
 
+import numpy as np
+import test_util as tu
 import tritongrpcclient as grpcclient
 import tritonhttpclient as httpclient
 from tritonclientutils import InferenceServerException
 
 
 class UserData:
-
     def __init__(self):
         self._completed_requests = queue.Queue()
 
@@ -54,54 +55,57 @@ def callback(user_data, result, error):
 
 
 class ClientTimeoutTest(tu.TestResultCollector):
-
     def setUp(self):
         self.model_name_ = "custom_identity_int32"
         self.input0_data_ = np.array([[10]], dtype=np.int32)
 
     def _prepare_request(self, protocol):
-        if (protocol == "grpc"):
+        if protocol == "grpc":
             self.inputs_ = []
-            self.inputs_.append(grpcclient.InferInput('INPUT0', [1, 1],
-                                                      "INT32"))
+            self.inputs_.append(grpcclient.InferInput("INPUT0", [1, 1], "INT32"))
             self.outputs_ = []
-            self.outputs_.append(grpcclient.InferRequestedOutput('OUTPUT0'))
+            self.outputs_.append(grpcclient.InferRequestedOutput("OUTPUT0"))
         else:
             self.inputs_ = []
-            self.inputs_.append(httpclient.InferInput('INPUT0', [1, 1],
-                                                      "INT32"))
+            self.inputs_.append(httpclient.InferInput("INPUT0", [1, 1], "INT32"))
             self.outputs_ = []
-            self.outputs_.append(httpclient.InferRequestedOutput('OUTPUT0'))
+            self.outputs_.append(httpclient.InferRequestedOutput("OUTPUT0"))
 
         self.inputs_[0].set_data_from_numpy(self.input0_data_)
 
     def test_grpc_infer(self):
-        triton_client = grpcclient.InferenceServerClient(url="localhost:8001",
-                                                         verbose=True)
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
         self._prepare_request("grpc")
 
         # The model is configured to take three seconds to send the
         # response. Expect an exception for small timeout values.
         with self.assertRaises(InferenceServerException) as cm:
-            result = triton_client.infer(model_name=self.model_name_,
-                                         inputs=self.inputs_,
-                                         outputs=self.outputs_,
-                                         client_timeout=0.2)
+            result = triton_client.infer(
+                model_name=self.model_name_,
+                inputs=self.inputs_,
+                outputs=self.outputs_,
+                client_timeout=0.2,
+            )
         self.assertIn("Deadline Exceeded", str(cm.exception))
 
         # Expect inference to pass successfully for a large timeout
         # value
-        result = triton_client.infer(model_name=self.model_name_,
-                                     inputs=self.inputs_,
-                                     outputs=self.outputs_,
-                                     client_timeout=10)
-
-        output0_data = result.as_numpy('OUTPUT0')
+        result = triton_client.infer(
+            model_name=self.model_name_,
+            inputs=self.inputs_,
+            outputs=self.outputs_,
+            client_timeout=10,
+        )
+
+        output0_data = result.as_numpy("OUTPUT0")
         self.assertTrue(np.array_equal(self.input0_data_, output0_data))
 
     def test_grpc_async_infer(self):
-        triton_client = grpcclient.InferenceServerClient(url="localhost:8001",
-                                                         verbose=True)
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
         self._prepare_request("grpc")
 
         user_data = UserData()
@@ -109,11 +113,13 @@ def test_grpc_async_infer(self):
         # The model is configured to take three seconds to send the
         # response. Expect an exception for small timeout values.
         with self.assertRaises(InferenceServerException) as cm:
-            triton_client.async_infer(model_name=self.model_name_,
-                                      inputs=self.inputs_,
-                                      callback=partial(callback, user_data),
-                                      outputs=self.outputs_,
-                                      client_timeout=2)
+            triton_client.async_infer(
+                model_name=self.model_name_,
+                inputs=self.inputs_,
+                callback=partial(callback, user_data),
+                outputs=self.outputs_,
+                client_timeout=2,
+            )
             data_item = user_data._completed_requests.get()
             if type(data_item) == InferenceServerException:
                 raise data_item
@@ -121,23 +127,25 @@ def test_grpc_async_infer(self):
 
         # Expect inference to pass successfully for a large timeout
         # value
-        triton_client.async_infer(model_name=self.model_name_,
-                                  inputs=self.inputs_,
-                                  callback=partial(callback, user_data),
-                                  outputs=self.outputs_,
-                                  client_timeout=10)
+        triton_client.async_infer(
+            model_name=self.model_name_,
+            inputs=self.inputs_,
+            callback=partial(callback, user_data),
+            outputs=self.outputs_,
+            client_timeout=10,
+        )
 
         # Wait until the results are available in user_data
         data_item = user_data._completed_requests.get()
         self.assertFalse(type(data_item) == InferenceServerException)
 
-        output0_data = data_item.as_numpy('OUTPUT0')
+        output0_data = data_item.as_numpy("OUTPUT0")
         self.assertTrue(np.array_equal(self.input0_data_, output0_data))
 
     def test_grpc_stream_infer(self):
-
-        triton_client = grpcclient.InferenceServerClient(url="localhost:8001",
-                                                         verbose=True)
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
 
         self._prepare_request("grpc")
         user_data = UserData()
@@ -146,11 +154,12 @@ def test_grpc_stream_infer(self):
         # response. Expect an exception for small timeout values.
         with self.assertRaises(InferenceServerException) as cm:
             triton_client.stop_stream()
-            triton_client.start_stream(callback=partial(callback, user_data),
-                                       stream_timeout=1)
-            triton_client.async_stream_infer(model_name=self.model_name_,
-                                             inputs=self.inputs_,
-                                             outputs=self.outputs_)
+            triton_client.start_stream(
+                callback=partial(callback, user_data), stream_timeout=1
+            )
+            triton_client.async_stream_infer(
+                model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_
+            )
             data_item = user_data._completed_requests.get()
             if type(data_item) == InferenceServerException:
                 raise data_item
@@ -159,73 +168,75 @@ def test_grpc_stream_infer(self):
         # Expect inference to pass successfully for a large timeout
         # value
         triton_client.stop_stream()
-        triton_client.start_stream(callback=partial(callback, user_data),
-                                   stream_timeout=100)
+        triton_client.start_stream(
+            callback=partial(callback, user_data), stream_timeout=100
+        )
 
-        triton_client.async_stream_infer(model_name=self.model_name_,
-                                         inputs=self.inputs_,
-                                         outputs=self.outputs_)
+        triton_client.async_stream_infer(
+            model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_
+        )
         data_item = user_data._completed_requests.get()
         triton_client.stop_stream()
 
         if type(data_item) == InferenceServerException:
             raise data_item
-        output0_data = data_item.as_numpy('OUTPUT0')
+        output0_data = data_item.as_numpy("OUTPUT0")
         self.assertTrue(np.array_equal(self.input0_data_, output0_data))
 
     def test_http_infer(self):
-
         self._prepare_request("http")
 
         # The model is configured to take three seconds to send the
         # response. Expect an exception for small timeout values.
         with self.assertRaises(socket.timeout) as cm:
             triton_client = httpclient.InferenceServerClient(
-                url="localhost:8000", verbose=True, network_timeout=2.0)
-            result = triton_client.infer(model_name=self.model_name_,
-                                         inputs=self.inputs_,
-                                         outputs=self.outputs_)
+                url="localhost:8000", verbose=True, network_timeout=2.0
+            )
+            result = triton_client.infer(
+                model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_
+            )
         self.assertIn("timed out", str(cm.exception))
 
         # Expect to successfully pass with sufficiently large timeout
         triton_client = httpclient.InferenceServerClient(
-            url="localhost:8000", verbose=True, connection_timeout=10.0)
+            url="localhost:8000", verbose=True, connection_timeout=10.0
+        )
 
-        result = triton_client.infer(model_name=self.model_name_,
-                                     inputs=self.inputs_,
-                                     outputs=self.outputs_)
+        result = triton_client.infer(
+            model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_
+        )
 
-        output0_data = result.as_numpy('OUTPUT0')
+        output0_data = result.as_numpy("OUTPUT0")
         self.assertTrue(np.array_equal(self.input0_data_, output0_data))
 
     def test_http_async_infer(self):
-
         self._prepare_request("http")
 
         # The model is configured to take three seconds to send the
         # response. Expect an exception for small timeout values.
         with self.assertRaises(socket.timeout) as cm:
             triton_client = httpclient.InferenceServerClient(
-                url="localhost:8000", verbose=True, network_timeout=2.0)
+                url="localhost:8000", verbose=True, network_timeout=2.0
+            )
             async_request = triton_client.async_infer(
-                model_name=self.model_name_,
-                inputs=self.inputs_,
-                outputs=self.outputs_)
+                model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_
+            )
             result = async_request.get_result()
         self.assertIn("timed out", str(cm.exception))
 
         # Expect to successfully pass with sufficiently large timeout
         triton_client = httpclient.InferenceServerClient(
-            url="localhost:8000", verbose=True, connection_timeout=10.0)
+            url="localhost:8000", verbose=True, connection_timeout=10.0
+        )
 
-        async_request = triton_client.async_infer(model_name=self.model_name_,
-                                                  inputs=self.inputs_,
-                                                  outputs=self.outputs_)
+        async_request = triton_client.async_infer(
+            model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_
+        )
         result = async_request.get_result()
 
-        output0_data = result.as_numpy('OUTPUT0')
+        output0_data = result.as_numpy("OUTPUT0")
         self.assertTrue(np.array_equal(self.input0_data_, output0_data))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_client_timeout/models/custom_identity_int32/config.pbtxt b/qa/L0_client_timeout/models/custom_identity_int32/config.pbtxt
index a42c5dcd45..1732ff32fd 100644
--- a/qa/L0_client_timeout/models/custom_identity_int32/config.pbtxt
+++ b/qa/L0_client_timeout/models/custom_identity_int32/config.pbtxt
@@ -35,7 +35,7 @@ input [
     name: "INPUT0"
     data_type: TYPE_INT32
     dims: [ -1 ]
-    
+
   }
 ]
 output [
diff --git a/qa/L0_client_timeout/test.sh b/qa/L0_client_timeout/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_client_valgrind/models/custom_identity_int32/config.pbtxt b/qa/L0_client_valgrind/models/custom_identity_int32/config.pbtxt
index 8d3a78baf4..6a2a76bde5 100644
--- a/qa/L0_client_valgrind/models/custom_identity_int32/config.pbtxt
+++ b/qa/L0_client_valgrind/models/custom_identity_int32/config.pbtxt
@@ -35,7 +35,7 @@ input [
     name: "INPUT0"
     data_type: TYPE_INT32
     dims: [ -1 ]
-    
+
   }
 ]
 output [
diff --git a/qa/L0_cmdline_trace/test.sh b/qa/L0_cmdline_trace/test.sh
index 3de5328610..66f9a08fc0 100755
--- a/qa/L0_cmdline_trace/test.sh
+++ b/qa/L0_cmdline_trace/test.sh
@@ -570,7 +570,7 @@ else
 fi
 
 
-# check deprecation warnings 
+# check deprecation warnings
 SERVER_ARGS=" --trace-file=/tmp/trace.json --trace-rate=100 --trace-level=TIMESTAMPS \
               --trace-log-frequency=50 --trace-count=100 --model-repository=$MODELSDIR"
 SERVER_LOG="./inference_server_trace_config_flag.log"
diff --git a/qa/L0_cmdline_trace/trace_client.py b/qa/L0_cmdline_trace/trace_client.py
old mode 100644
new mode 100755
index 8e19ba6fb7..4d59579d7c
--- a/qa/L0_cmdline_trace/trace_client.py
+++ b/qa/L0_cmdline_trace/trace_client.py
@@ -26,24 +26,26 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
-import numpy as np
 import sys
 
+import numpy as np
 import tritonclient.grpc as grpcclient
 import tritonclient.http as httpclient
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-u',
-                        '--url',
-                        type=str,
-                        required=False,
-                        default='localhost:8001',
-                        help='Inference server URL. Default is localhost:8001.')
-    parser.add_argument('-i', '--protocol', type=str, required=True)
+    parser.add_argument(
+        "-u",
+        "--url",
+        type=str,
+        required=False,
+        default="localhost:8001",
+        help="Inference server URL. Default is localhost:8001.",
+    )
+    parser.add_argument("-i", "--protocol", type=str, required=True)
     FLAGS = parser.parse_args()
 
-    if FLAGS.protocol == 'grpc':
+    if FLAGS.protocol == "grpc":
         client_type = grpcclient
     else:
         client_type = httpclient
@@ -59,8 +61,8 @@
     # Infer
     inputs = []
     outputs = []
-    inputs.append(client_type.InferInput('INPUT0', [1, 16], "INT32"))
-    inputs.append(client_type.InferInput('INPUT1', [1, 16], "INT32"))
+    inputs.append(client_type.InferInput("INPUT0", [1, 16], "INT32"))
+    inputs.append(client_type.InferInput("INPUT1", [1, 16], "INT32"))
 
     input0_data = np.arange(start=0, stop=16, dtype=np.int32)
     input0_data = np.expand_dims(input0_data, axis=0)
@@ -69,10 +71,9 @@
     inputs[0].set_data_from_numpy(input0_data)
     inputs[1].set_data_from_numpy(input1_data)
 
-    outputs.append(client_type.InferRequestedOutput('OUTPUT0'))
-    outputs.append(client_type.InferRequestedOutput('OUTPUT1'))
+    outputs.append(client_type.InferRequestedOutput("OUTPUT0"))
+    outputs.append(client_type.InferRequestedOutput("OUTPUT1"))
 
-    triton_client.infer(model_name=model_name,
-                        inputs=inputs,
-                        outputs=outputs,
-                        request_id="1")
+    triton_client.infer(
+        model_name=model_name, inputs=inputs, outputs=outputs, request_id="1"
+    )
diff --git a/qa/L0_cuda_graph/test.sh b/qa/L0_cuda_graph/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_cuda_graph/trt_cuda_graph_test.py b/qa/L0_cuda_graph/trt_cuda_graph_test.py
old mode 100644
new mode 100755
index 6cb68255ae..a7f9f3be98
--- a/qa/L0_cuda_graph/trt_cuda_graph_test.py
+++ b/qa/L0_cuda_graph/trt_cuda_graph_test.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -29,8 +31,9 @@
 sys.path.append("../common")
 
 import unittest
-import numpy as np
+
 import infer_util as iu
+import numpy as np
 import test_util as tu
 from tritonclientutils import *
 
@@ -49,22 +52,25 @@ def _check_infer(self, tensor_shape, batch_size=1):
                 full_shape = (batch_size,) + tensor_shape
             else:
                 full_shape = tensor_shape
-            iu.infer_exact(self,
-                           self.model_name_,
-                           full_shape,
-                           batch_size,
-                           self.dtype_,
-                           self.dtype_,
-                           self.dtype_,
-                           model_version=1,
-                           use_http_json_tensors=False,
-                           use_grpc=False,
-                           use_streaming=False)
+            iu.infer_exact(
+                self,
+                self.model_name_,
+                full_shape,
+                batch_size,
+                self.dtype_,
+                self.dtype_,
+                self.dtype_,
+                model_version=1,
+                use_http_json_tensors=False,
+                use_grpc=False,
+                use_streaming=False,
+            )
         except InferenceServerException as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
     def _erroneous_infer(self, tensor_shape, batch_size):
         import tritonhttpclient
+
         item_size = batch_size
         for dim in tensor_shape:
             item_size *= dim
@@ -75,30 +81,38 @@ def _erroneous_infer(self, tensor_shape, batch_size):
 
         inputs = []
         inputs.append(
-            tritonhttpclient.InferInput('INPUT0', full_shape, self.dtype_str_))
+            tritonhttpclient.InferInput("INPUT0", full_shape, self.dtype_str_)
+        )
         inputs[-1].set_data_from_numpy(input_np)
         inputs.append(
-            tritonhttpclient.InferInput('INPUT1', full_shape, self.dtype_str_))
+            tritonhttpclient.InferInput("INPUT1", full_shape, self.dtype_str_)
+        )
         inputs[-1].set_data_from_numpy(input_np)
         outputs = []
         outputs.append(
-            tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
+            tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True)
+        )
         outputs.append(
-            tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True))
+            tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True)
+        )
 
-        model_name = tu.get_model_name(self.model_name_, self.dtype_,
-                                       self.dtype_, self.dtype_)
+        model_name = tu.get_model_name(
+            self.model_name_, self.dtype_, self.dtype_, self.dtype_
+        )
         results = tritonhttpclient.InferenceServerClient(
-            "localhost:8000", verbose=True).infer(model_name=model_name,
-                                                  inputs=inputs,
-                                                  outputs=outputs)
+            "localhost:8000", verbose=True
+        ).infer(model_name=model_name, inputs=inputs, outputs=outputs)
         # Validate the results by comparing with precomputed values.
-        output0_np = results.as_numpy('OUTPUT0')
-        output1_np = results.as_numpy('OUTPUT1')
-        self.assertFalse(np.array_equal(output0_np, expected_output0_np),
-                         "expects OUTPUT0 is not correct")
-        self.assertFalse(np.array_equal(output1_np, expected_output1_np),
-                         "expects OUTPUT1 is not correct")
+        output0_np = results.as_numpy("OUTPUT0")
+        output1_np = results.as_numpy("OUTPUT1")
+        self.assertFalse(
+            np.array_equal(output0_np, expected_output0_np),
+            "expects OUTPUT0 is not correct",
+        )
+        self.assertFalse(
+            np.array_equal(output1_np, expected_output1_np),
+            "expects OUTPUT1 is not correct",
+        )
 
     def test_fixed_shape(self):
         tensor_shape = (16,)
@@ -142,7 +156,7 @@ def test_nobatch_fixed_shape(self):
         self._check_infer((16,), 0)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     if len(sys.argv) > 2:
         TrtCudaGraphTest.MODELNAME = sys.argv.pop()
 
diff --git a/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py b/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py
old mode 100644
new mode 100755
index 2e8939951b..87fb7c1d3c
--- a/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py
+++ b/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -28,11 +30,11 @@
 
 sys.path.append("../common")
 
-import numpy as np
-import unittest
 import os
-import test_util as tu
+import unittest
 
+import numpy as np
+import test_util as tu
 import tritongrpcclient as grpcclient
 import tritonhttpclient as httpclient
 import tritonshmutils.cuda_shared_memory as cshm
@@ -40,16 +42,13 @@
 
 
 class CudaSharedMemoryTest(tu.TestResultCollector):
-
     def test_invalid_create_shm(self):
         # Raises error since tried to create invalid cuda shared memory region
         try:
-            shm_op0_handle = cshm.create_shared_memory_region(
-                "dummy_data", -1, 0)
+            shm_op0_handle = cshm.create_shared_memory_region("dummy_data", -1, 0)
             cshm.destroy_shared_memory_region(shm_op0_handle)
         except Exception as ex:
-            self.assertEqual(str(ex),
-                             "unable to create cuda shared memory handle")
+            self.assertEqual(str(ex), "unable to create cuda shared memory handle")
 
     def test_valid_create_set_register(self):
         # Create a valid cuda shared memory region, fill data in it and register
@@ -58,10 +57,12 @@ def test_valid_create_set_register(self):
         else:
             triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
         shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
-        cshm.set_shared_memory_region(shm_op0_handle,
-                                      [np.array([1, 2], dtype=np.float32)])
+        cshm.set_shared_memory_region(
+            shm_op0_handle, [np.array([1, 2], dtype=np.float32)]
+        )
         triton_client.register_cuda_shared_memory(
-            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
+            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
+        )
         shm_status = triton_client.get_cuda_shared_memory_status()
         if _protocol == "http":
             self.assertEqual(len(shm_status), 1)
@@ -92,7 +93,8 @@ def test_unregister_after_register(self):
             triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
         shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
         triton_client.register_cuda_shared_memory(
-            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
+            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
+        )
         triton_client.unregister_cuda_shared_memory("dummy_data")
         shm_status = triton_client.get_cuda_shared_memory_status()
         if _protocol == "http":
@@ -109,13 +111,16 @@ def test_reregister_after_register(self):
             triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
         shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
         triton_client.register_cuda_shared_memory(
-            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
+            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
+        )
         try:
             triton_client.register_cuda_shared_memory(
-                "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8)
+                "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
+            )
         except Exception as ex:
             self.assertIn(
-                "shared memory region 'dummy_data' already in manager", str(ex))
+                "shared memory region 'dummy_data' already in manager", str(ex)
+            )
         shm_status = triton_client.get_cuda_shared_memory_status()
         if _protocol == "http":
             self.assertEqual(len(shm_status), 1)
@@ -138,27 +143,33 @@ def _configure_sever(self):
         else:
             triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
         triton_client.register_cuda_shared_memory(
-            "input0_data", cshm.get_raw_handle(shm_ip0_handle), 0, 64)
+            "input0_data", cshm.get_raw_handle(shm_ip0_handle), 0, 64
+        )
         triton_client.register_cuda_shared_memory(
-            "input1_data", cshm.get_raw_handle(shm_ip1_handle), 0, 64)
+            "input1_data", cshm.get_raw_handle(shm_ip1_handle), 0, 64
+        )
         triton_client.register_cuda_shared_memory(
-            "output0_data", cshm.get_raw_handle(shm_op0_handle), 0, 64)
+            "output0_data", cshm.get_raw_handle(shm_op0_handle), 0, 64
+        )
         triton_client.register_cuda_shared_memory(
-            "output1_data", cshm.get_raw_handle(shm_op1_handle), 0, 64)
+            "output1_data", cshm.get_raw_handle(shm_op1_handle), 0, 64
+        )
         return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle]
 
     def _cleanup_server(self, shm_handles):
         for shm_handle in shm_handles:
             cshm.destroy_shared_memory_region(shm_handle)
 
-    def _basic_inference(self,
-                         shm_ip0_handle,
-                         shm_ip1_handle,
-                         shm_op0_handle,
-                         shm_op1_handle,
-                         error_msg,
-                         big_shm_name="",
-                         big_shm_size=64):
+    def _basic_inference(
+        self,
+        shm_ip0_handle,
+        shm_ip1_handle,
+        shm_op0_handle,
+        shm_op1_handle,
+        error_msg,
+        big_shm_name="",
+        big_shm_size=64,
+    ):
         input0_data = np.arange(start=0, stop=16, dtype=np.int32)
         input1_data = np.ones(shape=16, dtype=np.int32)
         inputs = []
@@ -167,16 +178,16 @@ def _basic_inference(self,
             triton_client = httpclient.InferenceServerClient(_url, verbose=True)
             inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
             inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
+            outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True))
             outputs.append(
-                httpclient.InferRequestedOutput('OUTPUT0', binary_data=True))
-            outputs.append(
-                httpclient.InferRequestedOutput('OUTPUT1', binary_data=False))
+                httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)
+            )
         else:
             triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
             inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
             inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
-            outputs.append(grpcclient.InferRequestedOutput('OUTPUT0'))
-            outputs.append(grpcclient.InferRequestedOutput('OUTPUT1'))
+            outputs.append(grpcclient.InferRequestedOutput("OUTPUT0"))
+            outputs.append(grpcclient.InferRequestedOutput("OUTPUT1"))
         inputs[0].set_shared_memory("input0_data", 64)
         if type(shm_ip1_handle) == np.array:
             inputs[1].set_data_from_numpy(input0_data, binary_data=True)
@@ -188,22 +199,21 @@ def _basic_inference(self,
         outputs[1].set_shared_memory("output1_data", 64)
 
         try:
-            results = triton_client.infer("simple",
-                                          inputs,
-                                          model_version="",
-                                          outputs=outputs)
-            output = results.get_output('OUTPUT0')
+            results = triton_client.infer(
+                "simple", inputs, model_version="", outputs=outputs
+            )
+            output = results.get_output("OUTPUT0")
             if _protocol == "http":
-                output_datatype = output['datatype']
-                output_shape = output['shape']
+                output_datatype = output["datatype"]
+                output_shape = output["shape"]
             else:
                 output_datatype = output.datatype
                 output_shape = output.shape
             output_dtype = triton_to_np_dtype(output_datatype)
-            output_data = cshm.get_contents_as_numpy(shm_op0_handle,
-                                                     output_dtype, output_shape)
-            self.assertTrue(
-                (output_data[0] == (input0_data + input1_data)).all())
+            output_data = cshm.get_contents_as_numpy(
+                shm_op0_handle, output_dtype, output_shape
+            )
+            self.assertTrue((output_data[0] == (input0_data + input1_data)).all())
         except Exception as ex:
             error_msg.append(str(ex))
 
@@ -211,8 +221,9 @@ def test_unregister_after_inference(self):
         # Unregister after inference
         error_msg = []
         shm_handles = self._configure_sever()
-        self._basic_inference(shm_handles[0], shm_handles[1], shm_handles[2],
-                              shm_handles[3], error_msg)
+        self._basic_inference(
+            shm_handles[0], shm_handles[1], shm_handles[2], shm_handles[3], error_msg
+        )
         if len(error_msg) > 0:
             raise Exception(str(error_msg))
         if _protocol == "http":
@@ -235,13 +246,15 @@ def test_register_after_inference(self):
             triton_client = httpclient.InferenceServerClient(_url, verbose=True)
         else:
             triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
-        self._basic_inference(shm_handles[0], shm_handles[1], shm_handles[2],
-                              shm_handles[3], error_msg)
+        self._basic_inference(
+            shm_handles[0], shm_handles[1], shm_handles[2], shm_handles[3], error_msg
+        )
         if len(error_msg) > 0:
             raise Exception(str(error_msg))
         shm_ip2_handle = cshm.create_shared_memory_region("input2_data", 64, 0)
         triton_client.register_cuda_shared_memory(
-            "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 64)
+            "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 64
+        )
         shm_status = triton_client.get_cuda_shared_memory_status()
         if _protocol == "http":
             self.assertEqual(len(shm_status), 5)
@@ -260,13 +273,22 @@ def test_too_big_shm(self):
         else:
             triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
         triton_client.register_cuda_shared_memory(
-            "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 128)
-        self._basic_inference(shm_handles[0], shm_ip2_handle, shm_handles[2],
-                              shm_handles[3], error_msg, "input2_data", 128)
+            "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 128
+        )
+        self._basic_inference(
+            shm_handles[0],
+            shm_ip2_handle,
+            shm_handles[2],
+            shm_handles[3],
+            error_msg,
+            "input2_data",
+            128,
+        )
         if len(error_msg) > 0:
             self.assertIn(
                 "unexpected total byte size 128 for input 'INPUT1', expecting 64",
-                error_msg[-1])
+                error_msg[-1],
+            )
         shm_handles.append(shm_ip2_handle)
         self._cleanup_server(shm_handles)
 
@@ -275,8 +297,9 @@ def test_mixed_raw_shm(self):
         error_msg = []
         shm_handles = self._configure_sever()
         input1_data = np.ones(shape=16, dtype=np.int32)
-        self._basic_inference(shm_handles[0], [input1_data], shm_handles[2],
-                              shm_handles[3], error_msg)
+        self._basic_inference(
+            shm_handles[0], [input1_data], shm_handles[2], shm_handles[3], error_msg
+        )
         if len(error_msg) > 0:
             raise Exception(error_msg[-1])
         self._cleanup_server(shm_handles)
@@ -302,8 +325,8 @@ def test_unregisterall(self):
         self._cleanup_server(shm_handles)
 
 
-if __name__ == '__main__':
-    _protocol = os.environ.get('CLIENT_TYPE', "http")
+if __name__ == "__main__":
+    _protocol = os.environ.get("CLIENT_TYPE", "http")
     if _protocol == "http":
         _url = "localhost:8000"
     else:
diff --git a/qa/L0_cuda_shared_memory/test.sh b/qa/L0_cuda_shared_memory/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_custom_ops/cuda_op_test.py b/qa/L0_custom_ops/cuda_op_test.py
old mode 100644
new mode 100755
index d4389d67ad..896ed2adf0
--- a/qa/L0_custom_ops/cuda_op_test.py
+++ b/qa/L0_custom_ops/cuda_op_test.py
@@ -27,47 +27,50 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
-import numpy as np
 import sys
 from builtins import range
+
+import numpy as np
 import tritongrpcclient as grpcclient
 import tritonhttpclient as httpclient
 from tritonclientutils import np_to_triton_dtype
 
 FLAGS = None
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-v',
-                        '--verbose',
-                        action="store_true",
-                        required=False,
-                        default=False,
-                        help='Enable verbose output')
-    parser.add_argument('-u',
-                        '--url',
-                        type=str,
-                        required=False,
-                        default='localhost:8000',
-                        help='Inference server URL. Default is localhost:8000.')
     parser.add_argument(
-        '-i',
-        '--protocol',
+        "-v",
+        "--verbose",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Enable verbose output",
+    )
+    parser.add_argument(
+        "-u",
+        "--url",
         type=str,
         required=False,
-        default='http',
-        help='Protocol ("http"/"grpc") used to ' +
-        'communicate with inference service. Default is "http".')
-    parser.add_argument('-m',
-                        '--model',
-                        type=str,
-                        required=True,
-                        help='Name of model.')
+        default="localhost:8000",
+        help="Inference server URL. Default is localhost:8000.",
+    )
+    parser.add_argument(
+        "-i",
+        "--protocol",
+        type=str,
+        required=False,
+        default="http",
+        help='Protocol ("http"/"grpc") used to '
+        + 'communicate with inference service. Default is "http".',
+    )
+    parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.")
 
     FLAGS = parser.parse_args()
     if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"):
-        print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format(
-            FLAGS.protocol))
+        print(
+            'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol)
+        )
         exit(1)
 
     client_util = httpclient if FLAGS.protocol == "http" else grpcclient
@@ -84,21 +87,22 @@
     input_data = np.arange(start=42, stop=42 + elements, dtype=np.int32)
 
     inputs = [
-        client_util.InferInput("in", input_data.shape,
-                               np_to_triton_dtype(input_data.dtype))
+        client_util.InferInput(
+            "in", input_data.shape, np_to_triton_dtype(input_data.dtype)
+        )
     ]
     inputs[0].set_data_from_numpy(input_data)
 
     results = client.infer(model_name, inputs)
-    output_data = results.as_numpy('out')
+    output_data = results.as_numpy("out")
     if output_data is None:
         print("error: expected 'out'")
         sys.exit(1)
 
     for i in range(elements):
         print(
-            str(i) + ": input " + str(input_data[i]) + ", output " +
-            str(output_data[i]))
+            str(i) + ": input " + str(input_data[i]) + ", output " + str(output_data[i])
+        )
         if output_data[i] != (input_data[i] + 1):
             print("error: incorrect value")
             sys.exit(1)
diff --git a/qa/L0_custom_ops/mod_op_test.py b/qa/L0_custom_ops/mod_op_test.py
old mode 100644
new mode 100755
index 62edd1e289..14855f7c40
--- a/qa/L0_custom_ops/mod_op_test.py
+++ b/qa/L0_custom_ops/mod_op_test.py
@@ -27,47 +27,50 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
-import numpy as np
 import sys
 from builtins import range
+
+import numpy as np
 import tritongrpcclient as grpcclient
 import tritonhttpclient as httpclient
 from tritonclientutils import np_to_triton_dtype
 
 FLAGS = None
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-v',
-                        '--verbose',
-                        action="store_true",
-                        required=False,
-                        default=False,
-                        help='Enable verbose output')
-    parser.add_argument('-u',
-                        '--url',
-                        type=str,
-                        required=False,
-                        default='localhost:8000',
-                        help='Inference server URL. Default is localhost:8000.')
     parser.add_argument(
-        '-i',
-        '--protocol',
+        "-v",
+        "--verbose",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Enable verbose output",
+    )
+    parser.add_argument(
+        "-u",
+        "--url",
         type=str,
         required=False,
-        default='http',
-        help='Protocol ("http"/"grpc") used to ' +
-        'communicate with inference service. Default is "http".')
-    parser.add_argument('-m',
-                        '--model',
-                        type=str,
-                        required=True,
-                        help='Name of model.')
+        default="localhost:8000",
+        help="Inference server URL. Default is localhost:8000.",
+    )
+    parser.add_argument(
+        "-i",
+        "--protocol",
+        type=str,
+        required=False,
+        default="http",
+        help='Protocol ("http"/"grpc") used to '
+        + 'communicate with inference service. Default is "http".',
+    )
+    parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.")
 
     FLAGS = parser.parse_args()
     if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"):
-        print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format(
-            FLAGS.protocol))
+        print(
+            'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol)
+        )
         exit(1)
 
     client_util = httpclient if FLAGS.protocol == "http" else grpcclient
@@ -87,22 +90,32 @@
     inputs = []
     for i in range(len(input_data)):
         inputs.append(
-            client_util.InferInput("INPUT__{}".format(i), input_data[0].shape,
-                                   np_to_triton_dtype(input_data[0].dtype)))
+            client_util.InferInput(
+                "INPUT__{}".format(i),
+                input_data[0].shape,
+                np_to_triton_dtype(input_data[0].dtype),
+            )
+        )
         inputs[i].set_data_from_numpy(input_data[i])
 
     results = client.infer(model_name, inputs)
 
     # We expect 1 result of size 10 with alternating 1 and 0.
-    output_data = results.as_numpy('OUTPUT__0')
+    output_data = results.as_numpy("OUTPUT__0")
     if output_data is None:
         print("error: expected 'OUTPUT__0'")
         sys.exit(1)
 
     for i in range(elements):
         print(
-            str(i) + ": " + str(input_data[0][i]) + " % " +
-            str(input_data[1][i]) + " = " + str(output_data[i]))
-        if ((input_data[0][i] % input_data[1][i]) != output_data[i]):
+            str(i)
+            + ": "
+            + str(input_data[0][i])
+            + " % "
+            + str(input_data[1][i])
+            + " = "
+            + str(output_data[i])
+        )
+        if (input_data[0][i] % input_data[1][i]) != output_data[i]:
             print("error: incorrect value")
             sys.exit(1)
diff --git a/qa/L0_custom_ops/onnx_op_test.py b/qa/L0_custom_ops/onnx_op_test.py
old mode 100644
new mode 100755
index 6a3d5ebb53..9b246c8e31
--- a/qa/L0_custom_ops/onnx_op_test.py
+++ b/qa/L0_custom_ops/onnx_op_test.py
@@ -27,47 +27,50 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
-import numpy as np
 import sys
 from builtins import range
+
+import numpy as np
 import tritongrpcclient as grpcclient
 import tritonhttpclient as httpclient
 from tritonclientutils import np_to_triton_dtype
 
 FLAGS = None
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-v',
-                        '--verbose',
-                        action="store_true",
-                        required=False,
-                        default=False,
-                        help='Enable verbose output')
-    parser.add_argument('-u',
-                        '--url',
-                        type=str,
-                        required=False,
-                        default='localhost:8000',
-                        help='Inference server URL. Default is localhost:8000.')
     parser.add_argument(
-        '-i',
-        '--protocol',
+        "-v",
+        "--verbose",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Enable verbose output",
+    )
+    parser.add_argument(
+        "-u",
+        "--url",
+        type=str,
+        required=False,
+        default="localhost:8000",
+        help="Inference server URL. Default is localhost:8000.",
+    )
+    parser.add_argument(
+        "-i",
+        "--protocol",
         type=str,
         required=False,
-        default='http',
-        help='Protocol ("http"/"grpc") used to ' +
-        'communicate with inference service. Default is "http".')
-    parser.add_argument('-m',
-                        '--model',
-                        type=str,
-                        required=True,
-                        help='Name of model.')
+        default="http",
+        help='Protocol ("http"/"grpc") used to '
+        + 'communicate with inference service. Default is "http".',
+    )
+    parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.")
 
     FLAGS = parser.parse_args()
     if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"):
-        print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format(
-            FLAGS.protocol))
+        print(
+            'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol)
+        )
         exit(1)
 
     client_util = httpclient if FLAGS.protocol == "http" else grpcclient
@@ -88,14 +91,16 @@
     inputs = []
     for i in range(len(input_data)):
         inputs.append(
-            client_util.InferInput("input_{}".format(i + 1), shape,
-                                   np_to_triton_dtype(dtype)))
+            client_util.InferInput(
+                "input_{}".format(i + 1), shape, np_to_triton_dtype(dtype)
+            )
+        )
         inputs[i].set_data_from_numpy(input_data[i])
 
     results = client.infer(model_name, inputs)
 
     # We expect 1 result of size 10 with alternating 1 and 0.
-    output_data = results.as_numpy('output')
+    output_data = results.as_numpy("output")
     if output_data is None:
         print("error: expected 'output'")
         sys.exit(1)
@@ -103,9 +108,12 @@
     for i in range(3):
         for j in range(5):
             print(
-                str(input_data[0][i][j]) + " + " + str(input_data[1][i][j]) +
-                " = " + str(output_data[i][j]))
-            if ((input_data[0][i][j] + input_data[1][i][j]) !=
-                    output_data[i][j]):
+                str(input_data[0][i][j])
+                + " + "
+                + str(input_data[1][i][j])
+                + " = "
+                + str(output_data[i][j])
+            )
+            if (input_data[0][i][j] + input_data[1][i][j]) != output_data[i][j]:
                 print("error: incorrect value")
                 sys.exit(1)
diff --git a/qa/L0_custom_ops/vision_op_test.py b/qa/L0_custom_ops/vision_op_test.py
old mode 100644
new mode 100755
index c925dc19c0..88857c3d12
--- a/qa/L0_custom_ops/vision_op_test.py
+++ b/qa/L0_custom_ops/vision_op_test.py
@@ -27,46 +27,49 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
-import numpy as np
 import sys
+
+import numpy as np
 import tritonclient.grpc as grpcclient
 import tritonclient.http as httpclient
 from tritonclient.utils import np_to_triton_dtype
 
 FLAGS = None
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-v',
-                        '--verbose',
-                        action="store_true",
-                        required=False,
-                        default=False,
-                        help='Enable verbose output')
-    parser.add_argument('-u',
-                        '--url',
-                        type=str,
-                        required=False,
-                        default='localhost:8000',
-                        help='Inference server URL. Default is localhost:8000.')
     parser.add_argument(
-        '-i',
-        '--protocol',
+        "-v",
+        "--verbose",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Enable verbose output",
+    )
+    parser.add_argument(
+        "-u",
+        "--url",
+        type=str,
+        required=False,
+        default="localhost:8000",
+        help="Inference server URL. Default is localhost:8000.",
+    )
+    parser.add_argument(
+        "-i",
+        "--protocol",
         type=str,
         required=False,
-        default='http',
-        help='Protocol ("http"/"grpc") used to ' +
-        'communicate with inference service. Default is "http".')
-    parser.add_argument('-m',
-                        '--model',
-                        type=str,
-                        required=True,
-                        help='Name of model.')
+        default="http",
+        help='Protocol ("http"/"grpc") used to '
+        + 'communicate with inference service. Default is "http".',
+    )
+    parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.")
 
     FLAGS = parser.parse_args()
     if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"):
-        print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format(
-            FLAGS.protocol))
+        print(
+            'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol)
+        )
         exit(1)
 
     client_util = httpclient if FLAGS.protocol == "http" else grpcclient
@@ -83,23 +86,26 @@
 
     inputs = []
     inputs.append(
-        client_util.InferInput("INPUT__0", input_data.shape,
-                               np_to_triton_dtype(input_data.dtype)))
+        client_util.InferInput(
+            "INPUT__0", input_data.shape, np_to_triton_dtype(input_data.dtype)
+        )
+    )
     inputs[0].set_data_from_numpy(input_data)
     inputs.append(
-        client_util.InferInput("INPUT__1", box_data.shape,
-                               np_to_triton_dtype(box_data.dtype)))
+        client_util.InferInput(
+            "INPUT__1", box_data.shape, np_to_triton_dtype(box_data.dtype)
+        )
+    )
     inputs[1].set_data_from_numpy(box_data)
 
     results = client.infer(model_name, inputs)
 
     # We expect 1 result of shape [1, 3, 5, 5].
-    output_data = results.as_numpy('OUTPUT__0')
+    output_data = results.as_numpy("OUTPUT__0")
     if output_data is None:
         print("error: expected 'OUTPUT__0'")
         sys.exit(1)
 
-    if (output_data.shape != (1, 3, 5, 5)):
-        print("error: incorrect shape " + str(output_data.shape) +
-              "for 'OUTPUT__0'")
+    if output_data.shape != (1, 3, 5, 5):
+        print("error: incorrect shape " + str(output_data.shape) + "for 'OUTPUT__0'")
         sys.exit(1)
diff --git a/qa/L0_custom_ops/zero_out_test.py b/qa/L0_custom_ops/zero_out_test.py
old mode 100644
new mode 100755
index ad87dc8f37..28d5d2c9e6
--- a/qa/L0_custom_ops/zero_out_test.py
+++ b/qa/L0_custom_ops/zero_out_test.py
@@ -27,47 +27,50 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import argparse
-import numpy as np
 import sys
 from builtins import range
+
+import numpy as np
 import tritongrpcclient as grpcclient
 import tritonhttpclient as httpclient
 from tritonclientutils import np_to_triton_dtype
 
 FLAGS = None
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-v',
-                        '--verbose',
-                        action="store_true",
-                        required=False,
-                        default=False,
-                        help='Enable verbose output')
-    parser.add_argument('-u',
-                        '--url',
-                        type=str,
-                        required=False,
-                        default='localhost:8000',
-                        help='Inference server URL. Default is localhost:8000.')
     parser.add_argument(
-        '-i',
-        '--protocol',
+        "-v",
+        "--verbose",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Enable verbose output",
+    )
+    parser.add_argument(
+        "-u",
+        "--url",
         type=str,
         required=False,
-        default='http',
-        help='Protocol ("http"/"grpc") used to ' +
-        'communicate with inference service. Default is "http".')
-    parser.add_argument('-m',
-                        '--model',
-                        type=str,
-                        required=True,
-                        help='Name of model.')
+        default="localhost:8000",
+        help="Inference server URL. Default is localhost:8000.",
+    )
+    parser.add_argument(
+        "-i",
+        "--protocol",
+        type=str,
+        required=False,
+        default="http",
+        help='Protocol ("http"/"grpc") used to '
+        + 'communicate with inference service. Default is "http".',
+    )
+    parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.")
 
     FLAGS = parser.parse_args()
     if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"):
-        print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format(
-            FLAGS.protocol))
+        print(
+            'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol)
+        )
         exit(1)
 
     client_util = httpclient if FLAGS.protocol == "http" else grpcclient
@@ -83,8 +86,9 @@
     input_data = np.arange(start=42, stop=42 + elements, dtype=np.int32)
 
     inputs = [
-        client_util.InferInput("to_zero", input_data.shape,
-                               np_to_triton_dtype(input_data.dtype))
+        client_util.InferInput(
+            "to_zero", input_data.shape, np_to_triton_dtype(input_data.dtype)
+        )
     ]
     inputs[0].set_data_from_numpy(input_data)
     results = client.infer(model_name, inputs)
@@ -97,8 +101,8 @@
 
     for i in range(elements):
         print(
-            str(i) + ": input " + str(input_data[i]) + ", output " +
-            str(output_data[i]))
+            str(i) + ": input " + str(input_data[i]) + ", output " + str(output_data[i])
+        )
         if (i == 0) and (input_data[i] != output_data[i]):
             print("error: incorrect value")
             sys.exit(1)
diff --git a/qa/L0_data_compression/test.sh b/qa/L0_data_compression/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_data_compression/validation.py b/qa/L0_data_compression/validation.py
old mode 100644
new mode 100755
index 927c863952..a0e5cb1576
--- a/qa/L0_data_compression/validation.py
+++ b/qa/L0_data_compression/validation.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -29,8 +31,9 @@
 
 def generate_compressed_data():
     with open("raw_data", "rb") as f:
-        import zlib
         import gzip
+        import zlib
+
         raw_data = f.read()
         with open("deflate_compressed_data", "wb") as of:
             of.write(zlib.compress(raw_data))
@@ -40,8 +43,9 @@ def generate_compressed_data():
 
 def validate_compressed_data():
     with open("raw_data", "rb") as f:
-        import zlib
         import gzip
+        import zlib
+
         raw_data = f.read()
         with open("generated_deflate_compressed_data", "rb") as cf:
             decompressed_data = zlib.decompress(cf.read())
@@ -53,5 +57,5 @@ def validate_compressed_data():
                 exit(1)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     globals()[sys.argv[1]]()
diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py
old mode 100644
new mode 100755
index 0ce47e5b80..b78170cf63
--- a/qa/L0_decoupled/decoupled_test.py
+++ b/qa/L0_decoupled/decoupled_test.py
@@ -1,4 +1,6 @@
-# Copyright 2020-2023, NVIDIA CORPORATION. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -25,11 +27,12 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import sys
+
 sys.path.append("../common")
 
 import os
-import time
 import queue
+import time
 import unittest
 from functools import partial
 
@@ -41,7 +44,6 @@
 
 
 class UserData:
-
     def __init__(self):
         self._response_queue = queue.Queue()
 
@@ -54,23 +56,25 @@ def callback(user_data, result, error):
 
 
 class DecoupledTest(tu.TestResultCollector):
-
     def setUp(self):
-        self.trials_ = [("repeat_int32", None), ("simple_repeat", None),
-                        ("sequence_repeat", None),
-                        ("fan_repeat", self._fan_validate),
-                        ("repeat_square", self._nested_validate),
-                        ("nested_square", self._nested_validate)]
+        self.trials_ = [
+            ("repeat_int32", None),
+            ("simple_repeat", None),
+            ("sequence_repeat", None),
+            ("fan_repeat", self._fan_validate),
+            ("repeat_square", self._nested_validate),
+            ("nested_square", self._nested_validate),
+        ]
         self.model_name_ = "repeat_int32"
 
         self.inputs_ = []
-        self.inputs_.append(grpcclient.InferInput('IN', [1], "INT32"))
-        self.inputs_.append(grpcclient.InferInput('DELAY', [1], "UINT32"))
-        self.inputs_.append(grpcclient.InferInput('WAIT', [1], "UINT32"))
+        self.inputs_.append(grpcclient.InferInput("IN", [1], "INT32"))
+        self.inputs_.append(grpcclient.InferInput("DELAY", [1], "UINT32"))
+        self.inputs_.append(grpcclient.InferInput("WAIT", [1], "UINT32"))
 
         self.outputs_ = []
-        self.outputs_.append(grpcclient.InferRequestedOutput('OUT'))
-        self.outputs_.append(grpcclient.InferRequestedOutput('IDX'))
+        self.outputs_.append(grpcclient.InferRequestedOutput("OUT"))
+        self.outputs_.append(grpcclient.InferRequestedOutput("IDX"))
         # Some trials only expect a subset of outputs
         self.requested_outputs_ = self.outputs_
 
@@ -95,14 +99,22 @@ def setUp(self):
     # If the decoupled backend/model always sends the final response flag along
     # with a non-null response, no opt-in is needed.
     #
-    # With this behavior, the client can programatically detect when all responses
+    # With this behavior, the client can programmatically detect when all responses
     # for an individual request have been received without knowing the expected
     # number of responses in advance and without closing the stream.
-    def _stream_infer_with_params(self, request_count, request_delay, _,
-                                  delay_data, delay_factor, user_data,
-                                  result_dict):
-        with grpcclient.InferenceServerClient(url="localhost:8001",
-                                              verbose=True) as triton_client:
+    def _stream_infer_with_params(
+        self,
+        request_count,
+        request_delay,
+        _,
+        delay_data,
+        delay_factor,
+        user_data,
+        result_dict,
+    ):
+        with grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        ) as triton_client:
             # Establish stream
             triton_client.start_stream(callback=partial(callback, user_data))
             # Send the specified number of requests in parallel
@@ -116,7 +128,8 @@ def _stream_infer_with_params(self, request_count, request_delay, _,
                     outputs=self.requested_outputs_,
                     # Opt-in to receiving flags-only responses from model/backend
                     # to help detect final responses for decoupled models.
-                    enable_empty_final_response=True)
+                    enable_empty_final_response=True,
+                )
                 # Update delay input in accordance with the scaling factor
                 delay_data = delay_data * delay_factor
                 delay_data = delay_data.astype(np.uint32)
@@ -134,11 +147,11 @@ def _stream_infer_with_params(self, request_count, request_delay, _,
                     # to associate decoupled responses with their requests.
                     if not response.id:
                         raise ValueError(
-                            "No response id found. Was a request_id provided?")
+                            "No response id found. Was a request_id provided?"
+                        )
 
                     # Detect final response. Parameters are oneof and we expect bool_param
-                    if response.parameters.get(
-                            "triton_final_response").bool_param:
+                    if response.parameters.get("triton_final_response").bool_param:
                         completed_requests += 1
 
                     # Only process non-empty response, ignore if empty (no outputs)
@@ -148,10 +161,19 @@ def _stream_infer_with_params(self, request_count, request_delay, _,
                         result_dict[response.id].append((recv_count, data_item))
                         recv_count += 1
 
-    def _stream_infer(self, request_count, request_delay, expected_count,
-                      delay_data, delay_factor, user_data, result_dict):
-        with grpcclient.InferenceServerClient(url="localhost:8001",
-                                              verbose=True) as triton_client:
+    def _stream_infer(
+        self,
+        request_count,
+        request_delay,
+        expected_count,
+        delay_data,
+        delay_factor,
+        user_data,
+        result_dict,
+    ):
+        with grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        ) as triton_client:
             # Establish stream
             triton_client.start_stream(callback=partial(callback, user_data))
             # Send the specified number of requests in parallel
@@ -162,7 +184,8 @@ def _stream_infer(self, request_count, request_delay, expected_count,
                     model_name=self.model_name_,
                     inputs=self.inputs_,
                     request_id=str(i),
-                    outputs=self.requested_outputs_)
+                    outputs=self.requested_outputs_,
+                )
                 # Update delay input in accordance with the scaling factor
                 delay_data = delay_data * delay_factor
                 delay_data = delay_data.astype(np.uint32)
@@ -186,7 +209,7 @@ def _fan_validate(self, result_list, data_offset, repeat_count):
         self.assertEqual(len(result_list), repeat_count)
         expected_data = 2 * data_offset
         for j in range(len(result_list)):
-            this_data = result_list[j][1].as_numpy('OUT')
+            this_data = result_list[j][1].as_numpy("OUT")
             self.assertEqual(len(this_data), 1)
             self.assertEqual(this_data[0], expected_data)
             expected_data += 2
@@ -194,13 +217,12 @@ def _fan_validate(self, result_list, data_offset, repeat_count):
     def _nested_validate(self, result_list, data_offset, repeat_count):
         # if repeat model returns repeat result n, repeat_square-like model
         # will return the same result n times
-        expected_len = sum(
-            x for x in range(data_offset, data_offset + repeat_count))
+        expected_len = sum(x for x in range(data_offset, data_offset + repeat_count))
         self.assertEqual(len(result_list), expected_len)
         expected_data = data_offset
         expected_count = expected_data
         for j in range(len(result_list)):
-            this_data = result_list[j][1].as_numpy('OUT')
+            this_data = result_list[j][1].as_numpy("OUT")
             self.assertEqual(len(this_data), 1)
             self.assertEqual(this_data[0], expected_data)
             expected_count -= 1
@@ -208,20 +230,22 @@ def _nested_validate(self, result_list, data_offset, repeat_count):
                 expected_data += 1
                 expected_count = expected_data
 
-    def _decoupled_infer(self,
-                         request_count,
-                         request_delay=0,
-                         repeat_count=1,
-                         data_offset=100,
-                         delay_time=1000,
-                         delay_factor=1,
-                         wait_time=500,
-                         order_sequence=None,
-                         validate_fn=None):
+    def _decoupled_infer(
+        self,
+        request_count,
+        request_delay=0,
+        repeat_count=1,
+        data_offset=100,
+        delay_time=1000,
+        delay_factor=1,
+        wait_time=500,
+        order_sequence=None,
+        validate_fn=None,
+    ):
         # Initialize data for IN
-        input_data = np.arange(start=data_offset,
-                               stop=data_offset + repeat_count,
-                               dtype=np.int32)
+        input_data = np.arange(
+            start=data_offset, stop=data_offset + repeat_count, dtype=np.int32
+        )
         self.inputs_[0].set_shape([repeat_count])
         self.inputs_[0].set_data_from_numpy(input_data)
 
@@ -234,24 +258,31 @@ def _decoupled_infer(self,
         self.inputs_[2].set_data_from_numpy(wait_data)
 
         # use validate_fn to differentiate requested outputs
-        self.requested_outputs_ = self.outputs_ if validate_fn is None else self.outputs_[
-            0:1]
+        self.requested_outputs_ = (
+            self.outputs_ if validate_fn is None else self.outputs_[0:1]
+        )
 
-        for infer_helper in [
-                self._stream_infer, self._stream_infer_with_params
-        ]:
+        for infer_helper in [self._stream_infer, self._stream_infer_with_params]:
             user_data = UserData()
             result_dict = {}
 
             try:
                 if "square" not in self.model_name_:
-                    expected_count = (repeat_count * request_count)
+                    expected_count = repeat_count * request_count
                 else:
-                    expected_count = sum(
-                        x for x in range(data_offset, data_offset +
-                                         repeat_count)) * request_count
-                infer_helper(request_count, request_delay, expected_count,
-                             delay_data, delay_factor, user_data, result_dict)
+                    expected_count = (
+                        sum(x for x in range(data_offset, data_offset + repeat_count))
+                        * request_count
+                    )
+                infer_helper(
+                    request_count,
+                    request_delay,
+                    expected_count,
+                    delay_data,
+                    delay_factor,
+                    user_data,
+                    result_dict,
+                )
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -260,33 +291,34 @@ def _decoupled_infer(self,
                 this_id = str(i)
                 if repeat_count != 0 and this_id not in result_dict.keys():
                     self.assertTrue(
-                        False, "response for request id {} not received".format(
-                            this_id))
+                        False, "response for request id {} not received".format(this_id)
+                    )
                 elif repeat_count == 0 and this_id in result_dict.keys():
                     self.assertTrue(
                         False,
                         "received unexpected response for request id {}".format(
-                            this_id))
+                            this_id
+                        ),
+                    )
                 if repeat_count != 0:
                     if validate_fn is None:
-                        self.assertEqual(len(result_dict[this_id]),
-                                         repeat_count)
+                        self.assertEqual(len(result_dict[this_id]), repeat_count)
                         expected_data = data_offset
                         result_list = result_dict[this_id]
                         for j in range(len(result_list)):
                             if order_sequence is not None:
-                                self.assertEqual(result_list[j][0],
-                                                 order_sequence[i][j])
-                            this_data = result_list[j][1].as_numpy('OUT')
+                                self.assertEqual(
+                                    result_list[j][0], order_sequence[i][j]
+                                )
+                            this_data = result_list[j][1].as_numpy("OUT")
                             self.assertEqual(len(this_data), 1)
                             self.assertEqual(this_data[0], expected_data)
-                            this_idx = result_list[j][1].as_numpy('IDX')
+                            this_idx = result_list[j][1].as_numpy("IDX")
                             self.assertEqual(len(this_idx), 1)
                             self.assertEqual(this_idx[0], j)
                             expected_data += 1
                     else:
-                        validate_fn(result_dict[this_id], data_offset,
-                                    repeat_count)
+                        validate_fn(result_dict[this_id], data_offset, repeat_count)
 
     def test_one_to_none(self):
         # Test cases where each request generates no response.
@@ -296,13 +328,9 @@ def test_one_to_none(self):
         for trial in self.trials_:
             self.model_name_ = trial[0]
             # Single request case
-            self._decoupled_infer(request_count=1,
-                                  repeat_count=0,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(request_count=1, repeat_count=0, validate_fn=trial[1])
             # Multiple request case
-            self._decoupled_infer(request_count=5,
-                                  repeat_count=0,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(request_count=5, repeat_count=0, validate_fn=trial[1])
 
     def test_one_to_one(self):
         # Test cases where each request generates single response.
@@ -313,23 +341,15 @@ def test_one_to_one(self):
             self.model_name_ = trial[0]
             # Single request case
             # Release request before the response is delivered
-            self._decoupled_infer(request_count=1,
-                                  wait_time=500,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(request_count=1, wait_time=500, validate_fn=trial[1])
             # Release request after the response is delivered
-            self._decoupled_infer(request_count=1,
-                                  wait_time=2000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(request_count=1, wait_time=2000, validate_fn=trial[1])
 
             # Multiple request case
             # Release request before the response is delivered
-            self._decoupled_infer(request_count=5,
-                                  wait_time=500,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(request_count=5, wait_time=500, validate_fn=trial[1])
             # Release request after the response is delivered
-            self._decoupled_infer(request_count=5,
-                                  wait_time=2000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(request_count=5, wait_time=2000, validate_fn=trial[1])
 
     def test_one_to_many(self):
         # Test cases where each request generates multiple responses.
@@ -342,37 +362,31 @@ def test_one_to_many(self):
             self.model_name_ = trial[0]
             # Single request case
             # Release request before the first response is delivered
-            self._decoupled_infer(request_count=1,
-                                  repeat_count=5,
-                                  wait_time=500,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=1, repeat_count=5, wait_time=500, validate_fn=trial[1]
+            )
             # Release request when the responses are getting delivered
-            self._decoupled_infer(request_count=1,
-                                  repeat_count=5,
-                                  wait_time=2000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=1, repeat_count=5, wait_time=2000, validate_fn=trial[1]
+            )
             # Release request after all the responses are delivered
-            self._decoupled_infer(request_count=1,
-                                  repeat_count=5,
-                                  wait_time=10000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=1, repeat_count=5, wait_time=10000, validate_fn=trial[1]
+            )
 
             # Multiple request case
             # Release request before the first response is delivered
-            self._decoupled_infer(request_count=5,
-                                  repeat_count=5,
-                                  wait_time=500,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=5, repeat_count=5, wait_time=500, validate_fn=trial[1]
+            )
             # Release request when the responses are getting delivered
-            self._decoupled_infer(request_count=5,
-                                  repeat_count=5,
-                                  wait_time=2000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=5, repeat_count=5, wait_time=2000, validate_fn=trial[1]
+            )
             # Release request after all the responses are delivered
-            self._decoupled_infer(request_count=5,
-                                  repeat_count=5,
-                                  wait_time=10000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=5, repeat_count=5, wait_time=10000, validate_fn=trial[1]
+            )
 
     def test_one_to_multi_many(self):
         # Test cases where each request generates multiple responses but the
@@ -385,37 +399,31 @@ def test_one_to_multi_many(self):
             self.model_name_ = trial[0]
             # Single request case
             # Release request before the first response is delivered
-            self._decoupled_infer(request_count=1,
-                                  repeat_count=5,
-                                  wait_time=500,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=1, repeat_count=5, wait_time=500, validate_fn=trial[1]
+            )
             # Release request when the responses are getting delivered
-            self._decoupled_infer(request_count=1,
-                                  repeat_count=5,
-                                  wait_time=8000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=1, repeat_count=5, wait_time=8000, validate_fn=trial[1]
+            )
             # Release request after all the responses are delivered
-            self._decoupled_infer(request_count=1,
-                                  repeat_count=5,
-                                  wait_time=20000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=1, repeat_count=5, wait_time=20000, validate_fn=trial[1]
+            )
 
             # Multiple request case
             # Release request before the first response is delivered
-            self._decoupled_infer(request_count=5,
-                                  repeat_count=5,
-                                  wait_time=500,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=5, repeat_count=5, wait_time=500, validate_fn=trial[1]
+            )
             # Release request when the responses are getting delivered
-            self._decoupled_infer(request_count=5,
-                                  repeat_count=5,
-                                  wait_time=3000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=5, repeat_count=5, wait_time=3000, validate_fn=trial[1]
+            )
             # Release request after all the responses are delivered
-            self._decoupled_infer(request_count=5,
-                                  repeat_count=5,
-                                  wait_time=10000,
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=5, repeat_count=5, wait_time=10000, validate_fn=trial[1]
+            )
 
     def test_response_order(self):
         # Test the expected response order for different cases
@@ -426,51 +434,61 @@ def test_response_order(self):
             self.model_name_ = trial[0]
 
             # Case 1: Interleaved responses
-            self._decoupled_infer(request_count=2,
-                                  request_delay=500,
-                                  repeat_count=4,
-                                  order_sequence=[[0, 2, 4, 6], [1, 3, 5, 7]],
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=2,
+                request_delay=500,
+                repeat_count=4,
+                order_sequence=[[0, 2, 4, 6], [1, 3, 5, 7]],
+                validate_fn=trial[1],
+            )
 
             # Case 2: All responses of second request delivered before any
             # response from the first
-            self._decoupled_infer(request_count=2,
-                                  request_delay=500,
-                                  repeat_count=4,
-                                  delay_time=2000,
-                                  delay_factor=0.1,
-                                  order_sequence=[[4, 5, 6, 7], [0, 1, 2, 3]],
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=2,
+                request_delay=500,
+                repeat_count=4,
+                delay_time=2000,
+                delay_factor=0.1,
+                order_sequence=[[4, 5, 6, 7], [0, 1, 2, 3]],
+                validate_fn=trial[1],
+            )
 
             # Case 3: Similar to Case 2, but the second request is generated
             # after the first response from first request is received
-            self._decoupled_infer(request_count=2,
-                                  request_delay=2500,
-                                  repeat_count=4,
-                                  delay_time=2000,
-                                  delay_factor=0.1,
-                                  order_sequence=[[0, 5, 6, 7], [1, 2, 3, 4]],
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=2,
+                request_delay=2500,
+                repeat_count=4,
+                delay_time=2000,
+                delay_factor=0.1,
+                order_sequence=[[0, 5, 6, 7], [1, 2, 3, 4]],
+                validate_fn=trial[1],
+            )
 
             # Case 4: All the responses of second requests are delivered after
             # all the responses from first requests are received
-            self._decoupled_infer(request_count=2,
-                                  request_delay=100,
-                                  repeat_count=4,
-                                  delay_time=500,
-                                  delay_factor=10,
-                                  order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]],
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=2,
+                request_delay=100,
+                repeat_count=4,
+                delay_time=500,
+                delay_factor=10,
+                order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]],
+                validate_fn=trial[1],
+            )
 
             # Case 5: Similar to Case 4, but the second request is generated
             # after the first response from the first request is received
-            self._decoupled_infer(request_count=2,
-                                  request_delay=750,
-                                  repeat_count=4,
-                                  delay_time=500,
-                                  delay_factor=10,
-                                  order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]],
-                                  validate_fn=trial[1])
+            self._decoupled_infer(
+                request_count=2,
+                request_delay=750,
+                repeat_count=4,
+                delay_time=500,
+                delay_factor=10,
+                order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]],
+                validate_fn=trial[1],
+            )
 
     def _no_streaming_helper(self, protocol):
         data_offset = 100
@@ -478,9 +496,9 @@ def _no_streaming_helper(self, protocol):
         delay_time = 1000
         wait_time = 2000
 
-        input_data = np.arange(start=data_offset,
-                               stop=data_offset + repeat_count,
-                               dtype=np.int32)
+        input_data = np.arange(
+            start=data_offset, stop=data_offset + repeat_count, dtype=np.int32
+        )
         delay_data = (np.ones([repeat_count], dtype=np.uint32)) * delay_time
         wait_data = np.array([wait_time], dtype=np.uint32)
 
@@ -490,12 +508,11 @@ def _no_streaming_helper(self, protocol):
             this_outputs = self.outputs_
         else:
             this_inputs = []
-            this_inputs.append(
-                httpclient.InferInput('IN', [repeat_count], "INT32"))
-            this_inputs.append(httpclient.InferInput('DELAY', [1], "UINT32"))
-            this_inputs.append(httpclient.InferInput('WAIT', [1], "UINT32"))
+            this_inputs.append(httpclient.InferInput("IN", [repeat_count], "INT32"))
+            this_inputs.append(httpclient.InferInput("DELAY", [1], "UINT32"))
+            this_inputs.append(httpclient.InferInput("WAIT", [1], "UINT32"))
             this_outputs = []
-            this_outputs.append(httpclient.InferRequestedOutput('OUT'))
+            this_outputs.append(httpclient.InferRequestedOutput("OUT"))
 
         # Initialize data for IN
         this_inputs[0].set_shape([repeat_count])
@@ -510,19 +527,22 @@ def _no_streaming_helper(self, protocol):
 
         if protocol == "grpc":
             triton_client = grpcclient.InferenceServerClient(
-                url="localhost:8001", verbose=True)
+                url="localhost:8001", verbose=True
+            )
         else:
             triton_client = httpclient.InferenceServerClient(
-                url="localhost:8000", verbose=True)
+                url="localhost:8000", verbose=True
+            )
 
         with self.assertRaises(InferenceServerException) as cm:
-            triton_client.infer(model_name=self.model_name_,
-                                inputs=this_inputs,
-                                outputs=this_outputs)
+            triton_client.infer(
+                model_name=self.model_name_, inputs=this_inputs, outputs=this_outputs
+            )
 
         self.assertIn(
             "doesn't support models with decoupled transaction policy",
-            str(cm.exception))
+            str(cm.exception),
+        )
 
     def test_no_streaming(self):
         # Test cases with no streaming inference. Server should give
@@ -541,9 +561,9 @@ def test_wrong_shape(self):
         delay_time = 1000
         wait_time = 2000
 
-        input_data = np.arange(start=data_offset,
-                               stop=data_offset + repeat_count,
-                               dtype=np.int32)
+        input_data = np.arange(
+            start=data_offset, stop=data_offset + repeat_count, dtype=np.int32
+        )
         delay_data = (np.ones([repeat_count + 1], dtype=np.uint32)) * delay_time
         wait_data = np.array([wait_time], dtype=np.uint32)
 
@@ -562,12 +582,14 @@ def test_wrong_shape(self):
         result_dict = {}
 
         with self.assertRaises(InferenceServerException) as cm:
-            self._stream_infer(1, 0, repeat_count, delay_data, 1, user_data,
-                               result_dict)
+            self._stream_infer(
+                1, 0, repeat_count, delay_data, 1, user_data, result_dict
+            )
 
-        self.assertIn("expected IN and DELAY shape to match, got [1] and [2]",
-                      str(cm.exception))
+        self.assertIn(
+            "expected IN and DELAY shape to match, got [1] and [2]", str(cm.exception)
+        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_decoupled/test.sh b/qa/L0_decoupled/test.sh
old mode 100644
new mode 100755
index 8fb5841997..90bb913b6c
--- a/qa/L0_decoupled/test.sh
+++ b/qa/L0_decoupled/test.sh
@@ -74,7 +74,7 @@ for trial in $TRIALS; do
       cat $SERVER_LOG
       exit 1
   fi
-  
+
   for i in \
               test_one_to_none \
               test_one_to_one \
@@ -82,7 +82,7 @@ for trial in $TRIALS; do
               test_no_streaming \
               test_response_order \
 	      test_wrong_shape; do
-  
+
       echo "Test: $i" >>$CLIENT_LOG
       set +e
       python $DECOUPLED_TEST DecoupledTest.$i >>$CLIENT_LOG 2>&1
@@ -100,11 +100,11 @@ for trial in $TRIALS; do
       fi
       set -e
   done
-  
+
  # Will delay the writing of each response by the specified number of milliseconds.
   # This will ensure that there are multiple responses available to be written.
   export TRITONSERVER_DELAY_GRPC_RESPONSE=2000
-  
+
   echo "Test: test_one_to_multi_many" >>$CLIENT_LOG
   set +e
   python $DECOUPLED_TEST DecoupledTest.test_one_to_multi_many >>$CLIENT_LOG 2>&1
@@ -120,18 +120,18 @@ for trial in $TRIALS; do
           RET=1
       fi
   fi
-  
+
   set -e
-  
+
   unset TRITONSERVER_DELAY_GRPC_RESPONSE
-  
+
   kill $SERVER_PID
   wait $SERVER_PID
 done
 
 if [ $RET -eq 0 ]; then
   echo -e "\n***\n*** Test Passed\n***"
-else 
+else
   echo -e "\n***\n*** Test Failed\n***"
 fi
 
diff --git a/qa/L0_device_memory_tracker/test.py b/qa/L0_device_memory_tracker/test.py
old mode 100644
new mode 100755
index 0265f043d5..1d443d1032
--- a/qa/L0_device_memory_tracker/test.py
+++ b/qa/L0_device_memory_tracker/test.py
@@ -25,18 +25,16 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import unittest
 import time
+import unittest
 from functools import partial
 
-import tritonclient.http as httpclient
-import tritonclient.grpc as grpcclient
-
 import nvidia_smi
+import tritonclient.grpc as grpcclient
+import tritonclient.http as httpclient
 
 
 class UnifiedClientProxy:
-
     def __init__(self, client):
         self.client_ = client
 
@@ -45,21 +43,19 @@ def __getattr__(self, attr):
         if type(self.client_) == grpcclient.InferenceServerClient:
             if attr == "get_model_config":
                 return lambda *args, **kwargs: forward_attr(
-                    *args, **kwargs, as_json=True)["config"]
+                    *args, **kwargs, as_json=True
+                )["config"]
             elif attr == "get_inference_statistics":
                 return partial(forward_attr, as_json=True)
         return forward_attr
 
 
 class MemoryUsageTest(unittest.TestCase):
-
     def setUp(self):
         nvidia_smi.nvmlInit()
         self.gpu_handle_ = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
-        self.http_client_ = httpclient.InferenceServerClient(
-            url="localhost:8000")
-        self.grpc_client_ = grpcclient.InferenceServerClient(
-            url="localhost:8001")
+        self.http_client_ = httpclient.InferenceServerClient(url="localhost:8000")
+        self.grpc_client_ = grpcclient.InferenceServerClient(url="localhost:8001")
 
     def tearDown(self):
         nvidia_smi.nvmlShutdown()
@@ -69,8 +65,7 @@ def report_used_gpu_memory(self):
         return info.used
 
     def is_testing_backend(self, model_name, backend_name):
-        return self.client_.get_model_config(
-            model_name)["backend"] == backend_name
+        return self.client_.get_model_config(model_name)["backend"] == backend_name
 
     def verify_recorded_usage(self, model_stat):
         recorded_gpu_usage = 0
@@ -87,10 +82,13 @@ def verify_recorded_usage(self, model_stat):
         # check with tolerance as gpu usage obtained is overall usage
         self.assertTrue(
             usage_delta * 0.9 <= recorded_gpu_usage <= usage_delta * 1.1,
-            msg=
-            "For model {}, expect recorded usage to be in range [{}, {}], got {}"
-            .format(model_stat["name"], usage_delta * 0.9, usage_delta * 1.1,
-                    recorded_gpu_usage))
+            msg="For model {}, expect recorded usage to be in range [{}, {}], got {}".format(
+                model_stat["name"],
+                usage_delta * 0.9,
+                usage_delta * 1.1,
+                recorded_gpu_usage,
+            ),
+        )
 
     def test_onnx_http(self):
         self.client_ = UnifiedClientProxy(self.http_client_)
diff --git a/qa/L0_device_memory_tracker/test.sh b/qa/L0_device_memory_tracker/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_dlpack_multi_gpu/test.sh b/qa/L0_dlpack_multi_gpu/test.sh
old mode 100644
new mode 100755
index af528a6667..2485bfdb88
--- a/qa/L0_dlpack_multi_gpu/test.sh
+++ b/qa/L0_dlpack_multi_gpu/test.sh
@@ -64,7 +64,7 @@ fi
 
 set +e
 export MODEL_NAME="dlpack_test"
-python3 $CLIENT_PY > $CLIENT_LOG 2>&1 
+python3 $CLIENT_PY > $CLIENT_LOG 2>&1
 
 if [ $? -ne 0 ]; then
     echo -e "\n***\n*** python_unittest.py FAILED. \n***"
diff --git a/qa/L0_doc_links/test.sh b/qa/L0_doc_links/test.sh
old mode 100644
new mode 100755
index 730adee917..be7d291b01
--- a/qa/L0_doc_links/test.sh
+++ b/qa/L0_doc_links/test.sh
@@ -1,4 +1,5 @@
-# Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#!/bin/bash
+# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
diff --git a/qa/L0_dyna_implicit_state/test.sh b/qa/L0_dyna_implicit_state/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_dyna_sequence_batcher/dyna_sequence_batcher_test.py b/qa/L0_dyna_sequence_batcher/dyna_sequence_batcher_test.py
old mode 100644
new mode 100755
index 6fff86948c..f2c709469b
--- a/qa/L0_dyna_sequence_batcher/dyna_sequence_batcher_test.py
+++ b/qa/L0_dyna_sequence_batcher/dyna_sequence_batcher_test.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -28,57 +30,55 @@
 
 sys.path.append("../common")
 
-from builtins import str
 import os
-import time
 import threading
+import time
 import unittest
+from builtins import str
+
 import numpy as np
-import test_util as tu
 import sequence_util as su
+import test_util as tu
 
-_test_system_shared_memory = bool(
-    int(os.environ.get('TEST_SYSTEM_SHARED_MEMORY', 0)))
-_test_cuda_shared_memory = bool(
-    int(os.environ.get('TEST_CUDA_SHARED_MEMORY', 0)))
+_test_system_shared_memory = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0)))
+_test_cuda_shared_memory = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0)))
 
-NO_BATCHING = (int(os.environ.get('NO_BATCHING', 0)) == 1)
+NO_BATCHING = int(os.environ.get("NO_BATCHING", 0)) == 1
 BACKENDS = os.environ.get(
-    'BACKENDS', "graphdef savedmodel libtorch onnx plan custom custom_string")
-IMPLICIT_STATE = (int(os.environ['IMPLICIT_STATE']) == 1)
+    "BACKENDS", "graphdef savedmodel libtorch onnx plan custom custom_string"
+)
+IMPLICIT_STATE = int(os.environ["IMPLICIT_STATE"]) == 1
 
-_trials = BACKENDS.split(' ')
+_trials = BACKENDS.split(" ")
 for backend in BACKENDS.split(" "):
     if NO_BATCHING:
-        if (backend != 'custom') and (backend != 'custom_string'):
+        if (backend != "custom") and (backend != "custom_string"):
             _trials += (backend + "_nobatch",)
 
 _ragged_batch_supported_trials = []
-if 'custom' in BACKENDS.split(' '):
-    _ragged_batch_supported_trials.append('custom')
+if "custom" in BACKENDS.split(" "):
+    _ragged_batch_supported_trials.append("custom")
 
 _protocols = ("http", "grpc")
 _max_sequence_idle_ms = 5000
 
 
 class DynaSequenceBatcherTest(su.SequenceBatcherTestUtil):
-
     def get_datatype(self, trial):
         return np.int32
 
-    def get_expected_result(self,
-                            expected_result,
-                            corrid,
-                            value,
-                            trial,
-                            flag_str=None):
+    def get_expected_result(self, expected_result, corrid, value, trial, flag_str=None):
         # Adjust the expected_result for models that
-        # couldn't implement the full accumulator. See
+        # could not implement the full accumulator. See
         # qa/common/gen_qa_dyna_sequence_models.py for more
         # information.
-        if ((("nobatch" not in trial) and ("custom" not in trial)) or \
-            ("graphdef" in trial) or ("plan" in trial) or ("onnx" in trial) or \
-            ("libtorch" in trial)):
+        if (
+            (("nobatch" not in trial) and ("custom" not in trial))
+            or ("graphdef" in trial)
+            or ("plan" in trial)
+            or ("onnx" in trial)
+            or ("libtorch" in trial)
+        ):
             expected_result = value
             if flag_str is not None:
                 if "start" in flag_str:
@@ -90,12 +90,9 @@ def get_expected_result(self,
                         expected_result += corrid
         return expected_result
 
-    def get_expected_result_implicit(self,
-                                     expected_result,
-                                     corrid,
-                                     value,
-                                     trial,
-                                     flag_str=None):
+    def get_expected_result_implicit(
+        self, expected_result, corrid, value, trial, flag_str=None
+    ):
         return expected_result
 
     def test_simple_sequence(self):
@@ -111,18 +108,22 @@ def test_simple_sequence(self):
 
                     self.check_setup(model_name)
                     self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ)
-                    self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER",
-                                     os.environ)
+                    self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ)
 
                     if "string" in trial:
-                        corrid = '52'
+                        corrid = "52"
                     else:
                         corrid = 52
 
-                    expected_result = self.get_expected_result(
-                        45 + int(corrid), corrid, 9, trial, "end"
-                    ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                        45, corrid, 9, trial, "end")
+                    expected_result = (
+                        self.get_expected_result(
+                            45 + int(corrid), corrid, 9, trial, "end"
+                        )
+                        if not IMPLICIT_STATE
+                        else self.get_expected_result_implicit(
+                            45, corrid, 9, trial, "end"
+                        )
+                    )
 
                     self.check_sequence(
                         trial,
@@ -131,19 +132,26 @@ def test_simple_sequence(self):
                         corrid,
                         (4000, None),
                         # (flag_str, value, (ls_ms, gt_ms), (pre_delay, post_delay))
-                        (("start", 1, None, None), (None, 2, None, None),
-                         (None, 3, None, None), (None, 4, None, None),
-                         (None, 5, None, None), (None, 6, None, None),
-                         (None, 7, None, None), (None, 8, None, None),
-                         ("end", 9, None, None)),
+                        (
+                            ("start", 1, None, None),
+                            (None, 2, None, None),
+                            (None, 3, None, None),
+                            (None, 4, None, None),
+                            (None, 5, None, None),
+                            (None, 6, None, None),
+                            (None, 7, None, None),
+                            (None, 8, None, None),
+                            ("end", 9, None, None),
+                        ),
                         expected_result,
                         protocol,
-                        sequence_name="{}_{}".format(self._testMethodName,
-                                                     protocol))
+                        sequence_name="{}_{}".format(self._testMethodName, protocol),
+                    )
 
                     self.check_deferred_exception()
-                    self.check_status(model_name, {1: 9 * (idx + 1)},
-                                      9 * (idx + 1), 9 * (idx + 1))
+                    self.check_status(
+                        model_name, {1: 9 * (idx + 1)}, 9 * (idx + 1), 9 * (idx + 1)
+                    )
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -160,18 +168,22 @@ def test_length1_sequence(self):
 
                     self.check_setup(model_name)
                     self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ)
-                    self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER",
-                                     os.environ)
+                    self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ)
 
                     if "string" in trial:
-                        corrid = '99'
+                        corrid = "99"
                     else:
                         corrid = 99
 
-                    expected_result = self.get_expected_result(
-                        42 + int(corrid), corrid, 42, trial, "start,end"
-                    ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                        42, corrid, 42, trial, "start,end")
+                    expected_result = (
+                        self.get_expected_result(
+                            42 + int(corrid), corrid, 42, trial, "start,end"
+                        )
+                        if not IMPLICIT_STATE
+                        else self.get_expected_result_implicit(
+                            42, corrid, 42, trial, "start,end"
+                        )
+                    )
 
                     self.check_sequence(
                         trial,
@@ -180,50 +192,60 @@ def test_length1_sequence(self):
                         corrid,
                         (4000, None),
                         # (flag_str, value, (ls_ms, gt_ms), (pre_delay, post_delay))
-                        (
-                            ("start,end", 42, None, None),),
+                        (("start,end", 42, None, None),),
                         expected_result,
                         protocol,
-                        sequence_name="{}_{}".format(self._testMethodName,
-                                                     protocol))
+                        sequence_name="{}_{}".format(self._testMethodName, protocol),
+                    )
 
                     self.check_deferred_exception()
-                    self.check_status(model_name, {1: (idx + 1)}, (idx + 1),
-                                      (idx + 1))
+                    self.check_status(model_name, {1: (idx + 1)}, (idx + 1), (idx + 1))
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
-    def _multi_sequence_impl(self, trials, expected_batch_exec,
-                             expected_exec_cnt, sleep_secs, tensor_shapes):
+    def _multi_sequence_impl(
+        self, trials, expected_batch_exec, expected_exec_cnt, sleep_secs, tensor_shapes
+    ):
         for trial in trials:
             self.clear_deferred_exceptions()
             dtype = self.get_datatype(trial)
             precreated_shm0_handles = self.precreate_register_regions(
-                (1, 3), dtype, 0, tensor_shape=(tensor_shapes[0],))
+                (1, 3), dtype, 0, tensor_shape=(tensor_shapes[0],)
+            )
             precreated_shm1_handles = self.precreate_register_regions(
-                (11, 12, 13), dtype, 1, tensor_shape=(tensor_shapes[1],))
+                (11, 12, 13), dtype, 1, tensor_shape=(tensor_shapes[1],)
+            )
             precreated_shm2_handles = self.precreate_register_regions(
-                (111, 112, 113), dtype, 2, tensor_shape=(tensor_shapes[2],))
+                (111, 112, 113), dtype, 2, tensor_shape=(tensor_shapes[2],)
+            )
             precreated_shm3_handles = self.precreate_register_regions(
-                (1111, 1112, 1113), dtype, 3, tensor_shape=(tensor_shapes[3],))
+                (1111, 1112, 1113), dtype, 3, tensor_shape=(tensor_shapes[3],)
+            )
             try:
                 model_name = tu.get_dyna_sequence_model_name(trial, dtype)
 
                 self.check_setup(model_name)
                 self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ)
-                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER",
-                                 os.environ)
+                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ)
 
                 if "string" in trial:
-                    corrids = ['1001', '1002', '1003', '1004']
+                    corrids = ["1001", "1002", "1003", "1004"]
                 else:
                     corrids = [1001, 1002, 1003, 1004]
 
-                expected_result = self.get_expected_result(
-                    4 * tensor_shapes[0] +
-                    int(corrids[0]), corrids[0], 3, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    4, corrids[0], 3, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        4 * tensor_shapes[0] + int(corrids[0]),
+                        corrids[0],
+                        3,
+                        trial,
+                        "end",
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        4, corrids[0], 3, trial, "end"
+                    )
+                )
 
                 threads = []
                 threads.append(
@@ -238,19 +260,30 @@ def _multi_sequence_impl(self, trials, expected_batch_exec,
                             # (flag_str, value, pre_delay_ms)
                             (("start", 1, None), ("end", 3, None)),
                             expected_result,
-                            precreated_shm0_handles),
+                            precreated_shm0_handles,
+                        ),
                         kwargs={
-                            'sequence_name':
-                                "{}_{}".format(self._testMethodName,
-                                               corrids[0]),
-                            'tensor_shape': (tensor_shapes[0],)
-                        }))
+                            "sequence_name": "{}_{}".format(
+                                self._testMethodName, corrids[0]
+                            ),
+                            "tensor_shape": (tensor_shapes[0],),
+                        },
+                    )
+                )
 
-                expected_result = self.get_expected_result(
-                    36 * tensor_shapes[1] +
-                    int(corrids[1]), corrids[1], 13, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    36, corrids[1], 13, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        36 * tensor_shapes[1] + int(corrids[1]),
+                        corrids[1],
+                        13,
+                        trial,
+                        "end",
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        36, corrids[1], 13, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -261,22 +294,32 @@ def _multi_sequence_impl(self, trials, expected_batch_exec,
                             corrids[1],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 11, None), (None, 12, None), ("end", 13,
-                                                                     None)),
+                            (("start", 11, None), (None, 12, None), ("end", 13, None)),
                             expected_result,
-                            precreated_shm1_handles),
+                            precreated_shm1_handles,
+                        ),
                         kwargs={
-                            'sequence_name':
-                                "{}_{}".format(self._testMethodName,
-                                               corrids[1]),
-                            'tensor_shape': (tensor_shapes[1],)
-                        }))
+                            "sequence_name": "{}_{}".format(
+                                self._testMethodName, corrids[1]
+                            ),
+                            "tensor_shape": (tensor_shapes[1],),
+                        },
+                    )
+                )
 
-                expected_result = self.get_expected_result(
-                    336 * tensor_shapes[2] +
-                    int(corrids[2]), corrids[2], 113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    336, corrids[2], 113, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        336 * tensor_shapes[2] + int(corrids[2]),
+                        corrids[2],
+                        113,
+                        trial,
+                        "end",
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        336, corrids[2], 113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -287,21 +330,35 @@ def _multi_sequence_impl(self, trials, expected_batch_exec,
                             corrids[2],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 111, None), (None, 112, None),
-                             ("end", 113, None)),
+                            (
+                                ("start", 111, None),
+                                (None, 112, None),
+                                ("end", 113, None),
+                            ),
                             expected_result,
-                            precreated_shm2_handles),
+                            precreated_shm2_handles,
+                        ),
                         kwargs={
-                            'sequence_name':
-                                "{}_{}".format(self._testMethodName,
-                                               corrids[2]),
-                            'tensor_shape': (tensor_shapes[2],)
-                        }))
-                expected_result = self.get_expected_result(
-                    3336 * tensor_shapes[3] +
-                    int(corrids[3]), corrids[3], 1113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    3336, corrids[3], 1113, trial, "end")
+                            "sequence_name": "{}_{}".format(
+                                self._testMethodName, corrids[2]
+                            ),
+                            "tensor_shape": (tensor_shapes[2],),
+                        },
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        3336 * tensor_shapes[3] + int(corrids[3]),
+                        corrids[3],
+                        1113,
+                        trial,
+                        "end",
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        3336, corrids[3], 1113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -312,16 +369,22 @@ def _multi_sequence_impl(self, trials, expected_batch_exec,
                             corrids[3],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1111, None), (None, 1112, None),
-                             ("end", 1113, None)),
+                            (
+                                ("start", 1111, None),
+                                (None, 1112, None),
+                                ("end", 1113, None),
+                            ),
                             expected_result,
-                            precreated_shm3_handles),
+                            precreated_shm3_handles,
+                        ),
                         kwargs={
-                            'sequence_name':
-                                "{}_{}".format(self._testMethodName,
-                                               corrids[3]),
-                            'tensor_shape': (tensor_shapes[3],)
-                        }))
+                            "sequence_name": "{}_{}".format(
+                                self._testMethodName, corrids[3]
+                            ),
+                            "tensor_shape": (tensor_shapes[3],),
+                        },
+                    )
+                )
 
                 for t in threads:
                     t.start()
@@ -330,8 +393,9 @@ def _multi_sequence_impl(self, trials, expected_batch_exec,
                 for t in threads:
                     t.join()
                 self.check_deferred_exception()
-                self.check_status(model_name, expected_batch_exec,
-                                  expected_exec_cnt, 11)
+                self.check_status(
+                    model_name, expected_batch_exec, expected_exec_cnt, 11
+                )
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
             finally:
@@ -355,18 +419,18 @@ def test_multi_sequence_different_shape(self):
         # Send four sequences in parallel where the requests in each
         # sequence have different shape. Sequences should not be
         # batched due to input tensor size differences.
-        self._multi_sequence_impl(_ragged_batch_supported_trials, {1: 11}, 11,
-                                  0, (4, 3, 1, 2))
+        self._multi_sequence_impl(
+            _ragged_batch_supported_trials, {1: 11}, 11, 0, (4, 3, 1, 2)
+        )
 
     def test_multi_sequence_different_shape_allow_ragged(self):
         # Send four sequences in parallel where the requests in each
         # sequence have different shape. Input is marked as allowing
         # ragged and so sequences should be batched even with input
         # tensor size differences.
-        self._multi_sequence_impl(_ragged_batch_supported_trials, {
-            4: 2,
-            3: 1
-        }, 3, 1, (4, 3, 1, 2))
+        self._multi_sequence_impl(
+            _ragged_batch_supported_trials, {4: 2, 3: 1}, 3, 1, (4, 3, 1, 2)
+        )
 
     def test_backlog(self):
         # Send 5 equal-length sequences in parallel and make sure they
@@ -376,33 +440,42 @@ def test_backlog(self):
         for trial in _trials:
             self.clear_deferred_exceptions()
             dtype = self.get_datatype(trial)
-            precreated_shm0_handles = self.precreate_register_regions((1, 2, 3),
-                                                                      dtype, 0)
+            precreated_shm0_handles = self.precreate_register_regions(
+                (1, 2, 3), dtype, 0
+            )
             precreated_shm1_handles = self.precreate_register_regions(
-                (11, 12, 13), dtype, 1)
+                (11, 12, 13), dtype, 1
+            )
             precreated_shm2_handles = self.precreate_register_regions(
-                (111, 112, 113), dtype, 2)
+                (111, 112, 113), dtype, 2
+            )
             precreated_shm3_handles = self.precreate_register_regions(
-                (1111, 1112, 1113), dtype, 3)
+                (1111, 1112, 1113), dtype, 3
+            )
             precreated_shm4_handles = self.precreate_register_regions(
-                (11111, 11112, 11113), dtype, 4)
+                (11111, 11112, 11113), dtype, 4
+            )
             try:
                 model_name = tu.get_dyna_sequence_model_name(trial, dtype)
 
                 self.check_setup(model_name)
                 self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ)
-                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER",
-                                 os.environ)
+                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ)
 
                 if "string" in trial:
-                    corrids = ['1001', '1002', '1003', '1004', '1005']
+                    corrids = ["1001", "1002", "1003", "1004", "1005"]
                 else:
                     corrids = [1001, 1002, 1003, 1004, 1005]
 
-                expected_result = self.get_expected_result(
-                    6 + int(corrids[0]), corrids[0], 3, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    6, corrids[0], 3, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        6 + int(corrids[0]), corrids[0], 3, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        6, corrids[0], 3, trial, "end"
+                    )
+                )
 
                 threads = []
                 threads.append(
@@ -415,18 +488,23 @@ def test_backlog(self):
                             corrids[0],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1, None), (None, 2, None), ("end", 3,
-                                                                   None)),
+                            (("start", 1, None), (None, 2, None), ("end", 3, None)),
                             expected_result,
-                            precreated_shm0_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
+                            precreated_shm0_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
 
-                expected_result = self.get_expected_result(
-                    36 + int(corrids[1]), corrids[1], 13, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    36, corrids[1], 13, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        36 + int(corrids[1]), corrids[1], 13, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        36, corrids[1], 13, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -437,18 +515,23 @@ def test_backlog(self):
                             corrids[1],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 11, None), (None, 12, None), ("end", 13,
-                                                                     None)),
+                            (("start", 11, None), (None, 12, None), ("end", 13, None)),
                             expected_result,
-                            precreated_shm1_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
+                            precreated_shm1_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
 
-                expected_result = self.get_expected_result(
-                    336 + int(corrids[2]), corrids[2], 113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    336, corrids[2], 113, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        336 + int(corrids[2]), corrids[2], 113, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        336, corrids[2], 113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -459,18 +542,27 @@ def test_backlog(self):
                             corrids[2],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 111, None), (None, 112, None),
-                             ("end", 113, None)),
+                            (
+                                ("start", 111, None),
+                                (None, 112, None),
+                                ("end", 113, None),
+                            ),
                             expected_result,
-                            precreated_shm2_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
+                            precreated_shm2_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
 
-                expected_result = self.get_expected_result(
-                    3336 + int(corrids[3]), corrids[3], 1113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    3336, corrids[3], 1113, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        3336 + int(corrids[3]), corrids[3], 1113, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        3336, corrids[3], 1113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -481,18 +573,27 @@ def test_backlog(self):
                             corrids[3],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1111, None), (None, 1112, None),
-                             ("end", 1113, None)),
+                            (
+                                ("start", 1111, None),
+                                (None, 1112, None),
+                                ("end", 1113, None),
+                            ),
                             expected_result,
-                            precreated_shm3_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
+                            precreated_shm3_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
 
-                expected_result = self.get_expected_result(
-                    33336 + int(corrids[4]), corrids[4], 11113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    33336, corrids[4], 11113, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        33336 + int(corrids[4]), corrids[4], 11113, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        33336, corrids[4], 11113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -503,13 +604,17 @@ def test_backlog(self):
                             corrids[4],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 11111, None), (None, 11112, None),
-                             ("end", 11113, None)),
+                            (
+                                ("start", 11111, None),
+                                (None, 11112, None),
+                                ("end", 11113, None),
+                            ),
                             expected_result,
-                            precreated_shm4_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
+                            precreated_shm4_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
 
                 for t in threads:
                     t.start()
@@ -534,35 +639,45 @@ def test_backlog_fill(self):
         for trial in _trials:
             self.clear_deferred_exceptions()
             dtype = self.get_datatype(trial)
-            precreated_shm0_handles = self.precreate_register_regions((1, 2, 3),
-                                                                      dtype, 0)
-            precreated_shm1_handles = self.precreate_register_regions((11, 13),
-                                                                      dtype, 1)
+            precreated_shm0_handles = self.precreate_register_regions(
+                (1, 2, 3), dtype, 0
+            )
+            precreated_shm1_handles = self.precreate_register_regions(
+                (11, 13), dtype, 1
+            )
             precreated_shm2_handles = self.precreate_register_regions(
-                (111, 113), dtype, 2)
+                (111, 113), dtype, 2
+            )
             precreated_shm3_handles = self.precreate_register_regions(
-                (1111, 1112, 1113), dtype, 3)
-            precreated_shm4_handles = self.precreate_register_regions((11111,),
-                                                                      dtype, 4)
-            precreated_shm5_handles = self.precreate_register_regions((22222,),
-                                                                      dtype, 5)
+                (1111, 1112, 1113), dtype, 3
+            )
+            precreated_shm4_handles = self.precreate_register_regions(
+                (11111,), dtype, 4
+            )
+            precreated_shm5_handles = self.precreate_register_regions(
+                (22222,), dtype, 5
+            )
             try:
                 model_name = tu.get_dyna_sequence_model_name(trial, dtype)
 
                 self.check_setup(model_name)
                 self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ)
-                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER",
-                                 os.environ)
+                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ)
                 if "string" in trial:
-                    corrids = ['1001', '1002', '1003', '1004', '1005', '1006']
+                    corrids = ["1001", "1002", "1003", "1004", "1005", "1006"]
                 else:
                     corrids = [1001, 1002, 1003, 1004, 1005, 1006]
                 threads = []
 
-                expected_result = self.get_expected_result(
-                    6 + int(corrids[0]), corrids[0], 3, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    6, corrids[0], 3, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        6 + int(corrids[0]), corrids[0], 3, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        6, corrids[0], 3, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -573,17 +688,22 @@ def test_backlog_fill(self):
                             corrids[0],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1, None), (None, 2, None), ("end", 3,
-                                                                   None)),
+                            (("start", 1, None), (None, 2, None), ("end", 3, None)),
                             expected_result,
-                            precreated_shm0_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    24 + int(corrids[1]), corrids[1], 13, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    24, corrids[1], 13, trial, "end")
+                            precreated_shm0_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        24 + int(corrids[1]), corrids[1], 13, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        24, corrids[1], 13, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -596,14 +716,20 @@ def test_backlog_fill(self):
                             # (flag_str, value, pre_delay_ms)
                             (("start", 11, None), ("end", 13, None)),
                             expected_result,
-                            precreated_shm1_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    224 + int(corrids[2]), corrids[2], 113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    224, corrids[2], 113, trial, "end")
+                            precreated_shm1_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        224 + int(corrids[2]), corrids[2], 113, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        224, corrids[2], 113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -616,14 +742,20 @@ def test_backlog_fill(self):
                             # (flag_str, value, pre_delay_ms)
                             (("start", 111, None), ("end", 113, None)),
                             expected_result,
-                            precreated_shm2_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    3336 + int(corrids[3]), corrids[3], 1113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    3336, corrids[3], 1113, trial, "end")
+                            precreated_shm2_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        3336 + int(corrids[3]), corrids[3], 1113, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        3336, corrids[3], 1113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -634,18 +766,26 @@ def test_backlog_fill(self):
                             corrids[3],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1111, None), (None, 1112, 3000),
-                             ("end", 1113, None)),
+                            (
+                                ("start", 1111, None),
+                                (None, 1112, 3000),
+                                ("end", 1113, None),
+                            ),
                             expected_result,
-                            precreated_shm3_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    11111 +
-                    int(corrids[4]), corrids[4], 11111, trial, "start,end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    11111, corrids[4], 11111, trial, "start,end")
+                            precreated_shm3_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        11111 + int(corrids[4]), corrids[4], 11111, trial, "start,end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        11111, corrids[4], 11111, trial, "start,end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -656,18 +796,22 @@ def test_backlog_fill(self):
                             corrids[4],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (
-                                ("start,end", 11111, None),),
+                            (("start,end", 11111, None),),
                             expected_result,
-                            precreated_shm4_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    22222 +
-                    int(corrids[5]), corrids[5], 22222, trial, "start,end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    22222, corrids[5], 22222, trial, "start,end")
+                            precreated_shm4_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        22222 + int(corrids[5]), corrids[5], 22222, trial, "start,end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        22222, corrids[5], 22222, trial, "start,end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -678,13 +822,13 @@ def test_backlog_fill(self):
                             corrids[5],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (
-                                ("start,end", 22222, None),),
+                            (("start,end", 22222, None),),
                             expected_result,
-                            precreated_shm5_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
+                            precreated_shm5_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
 
                 threads[0].start()
                 threads[1].start()
@@ -716,35 +860,45 @@ def test_backlog_fill_no_end(self):
         for trial in _trials:
             self.clear_deferred_exceptions()
             dtype = self.get_datatype(trial)
-            precreated_shm0_handles = self.precreate_register_regions((1, 2, 3),
-                                                                      dtype, 0)
-            precreated_shm1_handles = self.precreate_register_regions((11, 13),
-                                                                      dtype, 1)
+            precreated_shm0_handles = self.precreate_register_regions(
+                (1, 2, 3), dtype, 0
+            )
+            precreated_shm1_handles = self.precreate_register_regions(
+                (11, 13), dtype, 1
+            )
             precreated_shm2_handles = self.precreate_register_regions(
-                (111, 113), dtype, 2)
+                (111, 113), dtype, 2
+            )
             precreated_shm3_handles = self.precreate_register_regions(
-                (1111, 1112, 1113), dtype, 3)
-            precreated_shm4_handles = self.precreate_register_regions((11111,),
-                                                                      dtype, 4)
+                (1111, 1112, 1113), dtype, 3
+            )
+            precreated_shm4_handles = self.precreate_register_regions(
+                (11111,), dtype, 4
+            )
             precreated_shm5_handles = self.precreate_register_regions(
-                (22222, 22223, 22224), dtype, 5)
+                (22222, 22223, 22224), dtype, 5
+            )
             try:
                 model_name = tu.get_dyna_sequence_model_name(trial, dtype)
 
                 self.check_setup(model_name)
                 self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ)
-                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER",
-                                 os.environ)
+                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ)
 
                 if "string" in trial:
-                    corrids = ['1001', '1002', '1003', '1004', '1005', '1006']
+                    corrids = ["1001", "1002", "1003", "1004", "1005", "1006"]
                 else:
                     corrids = [1001, 1002, 1003, 1004, 1005, 1006]
                 threads = []
-                expected_result = self.get_expected_result(
-                    6 + int(corrids[0]), corrids[0], 3, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    6, corrids[0], 3, trial, "end")
+                expected_result = (
+                    self.get_expected_result(
+                        6 + int(corrids[0]), corrids[0], 3, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        6, corrids[0], 3, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -755,17 +909,22 @@ def test_backlog_fill_no_end(self):
                             corrids[0],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1, None), (None, 2, None), ("end", 3,
-                                                                   None)),
+                            (("start", 1, None), (None, 2, None), ("end", 3, None)),
                             expected_result,
-                            precreated_shm0_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    24 + int(corrids[1]), corrids[1], 13, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    24, corrids[1], 13, trial, "end")
+                            precreated_shm0_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        24 + int(corrids[1]), corrids[1], 13, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        24, corrids[1], 13, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -778,14 +937,20 @@ def test_backlog_fill_no_end(self):
                             # (flag_str, value, pre_delay_ms)
                             (("start", 11, None), ("end", 13, None)),
                             expected_result,
-                            precreated_shm1_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    224 + int(corrids[2]), corrids[2], 113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    224, corrids[2], 113, trial, "end")
+                            precreated_shm1_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        224 + int(corrids[2]), corrids[2], 113, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        224, corrids[2], 113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -798,14 +963,20 @@ def test_backlog_fill_no_end(self):
                             # (flag_str, value, pre_delay_ms)
                             (("start", 111, None), ("end", 113, None)),
                             expected_result,
-                            precreated_shm2_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    3336 + int(corrids[3]), corrids[3], 1113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    3336, corrids[3], 1113, trial, "end")
+                            precreated_shm2_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        3336 + int(corrids[3]), corrids[3], 1113, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        3336, corrids[3], 1113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -816,18 +987,26 @@ def test_backlog_fill_no_end(self):
                             corrids[3],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1111, None), (None, 1112, 3000),
-                             ("end", 1113, None)),
+                            (
+                                ("start", 1111, None),
+                                (None, 1112, 3000),
+                                ("end", 1113, None),
+                            ),
                             expected_result,
-                            precreated_shm3_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    11111 +
-                    int(corrids[4]), corrids[4], 11111, trial, "start,end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    11111, corrids[4], 11111, trial, "start,end")
+                            precreated_shm3_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        11111 + int(corrids[4]), corrids[4], 11111, trial, "start,end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        11111, corrids[4], 11111, trial, "start,end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -838,17 +1017,22 @@ def test_backlog_fill_no_end(self):
                             corrids[4],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (
-                                ("start,end", 11111, None),),
+                            (("start,end", 11111, None),),
                             expected_result,
-                            precreated_shm4_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    66669 + int(corrids[5]), corrids[5], 22224, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    66669, corrids[5], 22224, trial, "end")
+                            precreated_shm4_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        66669 + int(corrids[5]), corrids[5], 22224, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        66669, corrids[5], 22224, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -865,10 +1049,11 @@ def test_backlog_fill_no_end(self):
                                 ("end", 22224, 2000),
                             ),
                             expected_result,
-                            precreated_shm5_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
+                            precreated_shm5_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
 
                 threads[0].start()
                 threads[1].start()
@@ -906,33 +1091,40 @@ def test_backlog_sequence_timeout(self):
         for trial in _trials:
             self.clear_deferred_exceptions()
             dtype = self.get_datatype(trial)
-            precreated_shm0_handles = self.precreate_register_regions((1, 3),
-                                                                      dtype, 0)
+            precreated_shm0_handles = self.precreate_register_regions((1, 3), dtype, 0)
             precreated_shm1_handles = self.precreate_register_regions(
-                (11, 12, 12, 13), dtype, 1)
+                (11, 12, 12, 13), dtype, 1
+            )
             precreated_shm2_handles = self.precreate_register_regions(
-                (111, 112, 112, 113), dtype, 2)
+                (111, 112, 112, 113), dtype, 2
+            )
             precreated_shm3_handles = self.precreate_register_regions(
-                (1111, 1112, 1112, 1113), dtype, 3)
+                (1111, 1112, 1112, 1113), dtype, 3
+            )
             precreated_shm4_handles = self.precreate_register_regions(
-                (11111, 11113), dtype, 4)
+                (11111, 11113), dtype, 4
+            )
             try:
                 model_name = tu.get_dyna_sequence_model_name(trial, dtype)
 
                 self.check_setup(model_name)
                 self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ)
-                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER",
-                                 os.environ)
+                self.assertNotIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ)
 
                 if "string" in trial:
-                    corrids = ['1001', '1002', '1003', '1004', '1005']
+                    corrids = ["1001", "1002", "1003", "1004", "1005"]
                 else:
                     corrids = [1001, 1002, 1003, 1004, 1005]
                 threads = []
-                expected_result = self.get_expected_result(
-                    4 + int(corrids[0]), corrids[0], 3, trial, None
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    4, corrids[0], 3, trial, None)
+                expected_result = (
+                    self.get_expected_result(
+                        4 + int(corrids[0]), corrids[0], 3, trial, None
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        4, corrids[0], 3, trial, None
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -943,17 +1135,25 @@ def test_backlog_sequence_timeout(self):
                             corrids[0],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1, None),
-                             (None, 3, _max_sequence_idle_ms + 1000)),
+                            (
+                                ("start", 1, None),
+                                (None, 3, _max_sequence_idle_ms + 1000),
+                            ),
                             expected_result,
-                            precreated_shm0_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    48 + int(corrids[1]), corrids[1], 13, trial, None
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    48, corrids[1], 13, trial, None)
+                            precreated_shm0_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        48 + int(corrids[1]), corrids[1], 13, trial, None
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        48, corrids[1], 13, trial, None
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -964,19 +1164,27 @@ def test_backlog_sequence_timeout(self):
                             corrids[1],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 11, None), (None, 12,
-                                                   _max_sequence_idle_ms / 2),
-                             (None, 12, _max_sequence_idle_ms / 2),
-                             ("end", 13, _max_sequence_idle_ms / 2)),
+                            (
+                                ("start", 11, None),
+                                (None, 12, _max_sequence_idle_ms / 2),
+                                (None, 12, _max_sequence_idle_ms / 2),
+                                ("end", 13, _max_sequence_idle_ms / 2),
+                            ),
                             expected_result,
-                            precreated_shm1_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    448 + int(corrids[2]), corrids[2], 113, trial, None
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    448, corrids[2], 113, trial, None)
+                            precreated_shm1_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        448 + int(corrids[2]), corrids[2], 113, trial, None
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        448, corrids[2], 113, trial, None
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -987,19 +1195,27 @@ def test_backlog_sequence_timeout(self):
                             corrids[2],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 111, None), (None, 112,
-                                                    _max_sequence_idle_ms / 2),
-                             (None, 112, _max_sequence_idle_ms / 2),
-                             ("end", 113, _max_sequence_idle_ms / 2)),
+                            (
+                                ("start", 111, None),
+                                (None, 112, _max_sequence_idle_ms / 2),
+                                (None, 112, _max_sequence_idle_ms / 2),
+                                ("end", 113, _max_sequence_idle_ms / 2),
+                            ),
                             expected_result,
-                            precreated_shm2_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    4448 + int(corrids[3]), corrids[3], 1113, trial, None
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    4448, corrids[3], 1113, trial, None)
+                            precreated_shm2_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        4448 + int(corrids[3]), corrids[3], 1113, trial, None
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        4448, corrids[3], 1113, trial, None
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -1010,19 +1226,27 @@ def test_backlog_sequence_timeout(self):
                             corrids[3],
                             (None, None),
                             # (flag_str, value, pre_delay_ms)
-                            (("start", 1111, None), (None, 1112,
-                                                     _max_sequence_idle_ms / 2),
-                             (None, 1112, _max_sequence_idle_ms / 2),
-                             ("end", 1113, _max_sequence_idle_ms / 2)),
+                            (
+                                ("start", 1111, None),
+                                (None, 1112, _max_sequence_idle_ms / 2),
+                                (None, 1112, _max_sequence_idle_ms / 2),
+                                ("end", 1113, _max_sequence_idle_ms / 2),
+                            ),
                             expected_result,
-                            precreated_shm3_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
-                expected_result = self.get_expected_result(
-                    22224 + int(corrids[4]), corrids[4], 11113, trial, "end"
-                ) if not IMPLICIT_STATE else self.get_expected_result_implicit(
-                    22224, corrids[4], 11113, trial, "end")
+                            precreated_shm3_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
+                expected_result = (
+                    self.get_expected_result(
+                        22224 + int(corrids[4]), corrids[4], 11113, trial, "end"
+                    )
+                    if not IMPLICIT_STATE
+                    else self.get_expected_result_implicit(
+                        22224, corrids[4], 11113, trial, "end"
+                    )
+                )
                 threads.append(
                     threading.Thread(
                         target=self.check_sequence_async,
@@ -1035,10 +1259,11 @@ def test_backlog_sequence_timeout(self):
                             # (flag_str, value, pre_delay_ms)
                             (("start", 11111, None), ("end", 11113, None)),
                             expected_result,
-                            precreated_shm4_handles),
-                        kwargs={
-                            'sequence_name': "{}".format(self._testMethodName)
-                        }))
+                            precreated_shm4_handles,
+                        ),
+                        kwargs={"sequence_name": "{}".format(self._testMethodName)},
+                    )
+                )
 
                 threads[0].start()
                 threads[1].start()
@@ -1052,10 +1277,15 @@ def test_backlog_sequence_timeout(self):
                 self.check_deferred_exception()
                 self.assertTrue(False, "expected error")
             except Exception as ex:
-                self.assertTrue(ex.message().startswith(
-                    str("inference request for sequence 1001 to " +
-                        "model '{}' must specify the START flag on the first " +
-                        "request of the sequence").format(model_name)))
+                self.assertTrue(
+                    ex.message().startswith(
+                        str(
+                            "inference request for sequence 1001 to "
+                            + "model '{}' must specify the START flag on the first "
+                            + "request of the sequence"
+                        ).format(model_name)
+                    )
+                )
             finally:
                 if _test_system_shared_memory or _test_cuda_shared_memory:
                     self.cleanup_shm_regions(precreated_shm0_handles)
@@ -1065,5 +1295,5 @@ def test_backlog_sequence_timeout(self):
                     self.cleanup_shm_regions(precreated_shm4_handles)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
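Editor's note: the hunks above are a mechanical Black/isort reformat of the dynamic-sequence-batcher tests; no control flow changes. For reviewers skimming the noise, every reformatted test follows the same shape: build one thread per sequence (each wrapping the suite's check_sequence_async helper), start them together, join, then surface any deferred exception. A minimal stand-alone sketch of that pattern, with hypothetical names (run_sequence, deferred) in place of the suite's actual helpers:

    import threading

    deferred = []  # stand-in for the suite's deferred-exception list

    def run_sequence(corr_id, values, expected_sum):
        # Worker threads record failures instead of raising, mirroring how the
        # tests defer exceptions and re-raise them after join().
        try:
            if sum(values) != expected_sum:
                raise AssertionError(
                    f"sequence {corr_id}: got {sum(values)}, want {expected_sum}"
                )
        except Exception as ex:
            deferred.append(ex)

    # One thread per sequence, started together and joined, analogous to the
    # reformatted threads.append(threading.Thread(...)) blocks above.
    threads = [
        threading.Thread(target=run_sequence, args=(corr_id, values, expected))
        for corr_id, values, expected in [
            (1001, (1, 2, 3), 6),
            (1002, (11, 12, 13), 36),
        ]
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    if deferred:
        raise deferred[0]  # analogous to check_deferred_exception()
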
diff --git a/qa/L0_dyna_sequence_batcher/test.sh b/qa/L0_dyna_sequence_batcher/test.sh
index 42f732338c..acac8399af 100755
--- a/qa/L0_dyna_sequence_batcher/test.sh
+++ b/qa/L0_dyna_sequence_batcher/test.sh
@@ -65,7 +65,7 @@ fi
 
 RET=0
 
-rm -fr *.log 
+rm -fr *.log
 
 # models
 rm -fr models && mkdir models
diff --git a/qa/L0_grpc/client_plugin_models/client_plugin_test/1/model.py b/qa/L0_grpc/client_plugin_models/client_plugin_test/1/model.py
old mode 100644
new mode 100755
index 89cbf359a7..e03876f981
--- a/qa/L0_grpc/client_plugin_models/client_plugin_test/1/model.py
+++ b/qa/L0_grpc/client_plugin_models/client_plugin_test/1/model.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -25,22 +27,25 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import json
-import triton_python_backend_utils as pb_utils
+
 import numpy as np
+import triton_python_backend_utils as pb_utils
 
 
 class TritonPythonModel:
-
     def execute(self, requests):
         responses = []
 
         for request in requests:
-            json_string = pb_utils.get_input_tensor_by_name(
-                request, "EXPECTED_HEADERS").as_numpy()[0].decode("utf-8")
+            json_string = (
+                pb_utils.get_input_tensor_by_name(request, "EXPECTED_HEADERS")
+                .as_numpy()[0]
+                .decode("utf-8")
+            )
             expected_headers = json.loads(json_string)
 
             success = True
-            if request.parameters() != '':
+            if request.parameters() != "":
                 parameters = json.loads(request.parameters())
                 for key, value in expected_headers.items():
                     if key in parameters:
@@ -49,10 +54,12 @@ def execute(self, requests):
                     else:
                         success = False
 
-            test_success = pb_utils.Tensor("TEST_SUCCESS",
-                                           np.array([success], dtype=bool))
+            test_success = pb_utils.Tensor(
+                "TEST_SUCCESS", np.array([success], dtype=bool)
+            )
             inference_response = pb_utils.InferenceResponse(
-                output_tensors=[test_success])
+                output_tensors=[test_success]
+            )
             responses.append(inference_response)
 
         return responses
diff --git a/qa/L0_grpc/grpc_basic_auth_test.py b/qa/L0_grpc/grpc_basic_auth_test.py
old mode 100644
new mode 100755
index a6408c442d..07d29ef5b7
--- a/qa/L0_grpc/grpc_basic_auth_test.py
+++ b/qa/L0_grpc/grpc_basic_auth_test.py
@@ -24,26 +24,23 @@
 # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import unittest
 import sys
+import unittest
 
 sys.path.append("../common")
 
 import test_util as tu
 import tritonclient.grpc as tritongrpcclient
 import tritonclient.grpc.aio as asynctritongrpcclient
-
-from tritonclient.grpc.auth import BasicAuth
 from tritonclient.grpc.aio.auth import BasicAuth as AsyncBasicAuth
+from tritonclient.grpc.auth import BasicAuth
 
 
 class GRPCBasicAuthTest(tu.TestResultCollector):
-
     def setUp(self):
         # Use the nginx port
-        self._client = tritongrpcclient.InferenceServerClient(
-            url='localhost:8004')
-        self._client.register_plugin(BasicAuth('username', 'password'))
+        self._client = tritongrpcclient.InferenceServerClient(url="localhost:8004")
+        self._client.register_plugin(BasicAuth("username", "password"))
 
     def test_client_call(self):
         self.assertTrue(self._client.is_server_live())
@@ -53,12 +50,10 @@ def tearDown(self):
 
 
 class GRPCBasicAuthAsyncTest(unittest.IsolatedAsyncioTestCase):
-
     async def asyncSetUp(self):
         # Use the nginx port
-        self._client = asynctritongrpcclient.InferenceServerClient(
-            url='localhost:8004')
-        self._client.register_plugin(AsyncBasicAuth('username', 'password'))
+        self._client = asynctritongrpcclient.InferenceServerClient(url="localhost:8004")
+        self._client.register_plugin(AsyncBasicAuth("username", "password"))
 
     async def test_client_call(self):
         self.assertTrue(await self._client.is_server_live())
@@ -67,5 +62,5 @@ async def asyncTearDown(self):
         await self._client.close()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_grpc/grpc_client_plugin_test.py b/qa/L0_grpc/grpc_client_plugin_test.py
old mode 100644
new mode 100755
index 45b6251e3e..1cc8c474ef
--- a/qa/L0_grpc/grpc_client_plugin_test.py
+++ b/qa/L0_grpc/grpc_client_plugin_test.py
@@ -24,23 +24,23 @@
 # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import sys
 import json
+import sys
 
 sys.path.append("../common")
 
 import unittest
+
 import numpy as np
 import test_util as tu
 import tritonclient.grpc as tritongrpcclient
+import tritonclient.grpc.aio as asynctritongrpcclient
 from tritonclient.grpc import InferenceServerClientPlugin
 from tritonclient.utils import np_to_triton_dtype
-import tritonclient.grpc.aio as asynctritongrpcclient
 
 
 # A simple plugin that adds headers to the inference request.
 class TestPlugin(InferenceServerClientPlugin):
-
     def __init__(self, headers):
         self._headers = headers
 
@@ -52,33 +52,35 @@ def prepare_infer_inputs(headers):
     expected_headers = np.array([json.dumps(headers)], dtype=object)
     inputs = []
     inputs.append(
-        tritongrpcclient.InferInput('EXPECTED_HEADERS', expected_headers.shape,
-                                    np_to_triton_dtype(expected_headers.dtype)))
+        tritongrpcclient.InferInput(
+            "EXPECTED_HEADERS",
+            expected_headers.shape,
+            np_to_triton_dtype(expected_headers.dtype),
+        )
+    )
     inputs[0].set_data_from_numpy(expected_headers)
 
     return inputs
 
 
 class GRPCClientPluginAsyncTest(unittest.IsolatedAsyncioTestCase):
-
     async def asyncSetUp(self):
-        self._headers = {'my-key': 'my-value'}
+        self._headers = {"my-key": "my-value"}
         self._plugin = TestPlugin(self._headers)
-        self._client = asynctritongrpcclient.InferenceServerClient(
-            url='localhost:8001')
+        self._client = asynctritongrpcclient.InferenceServerClient(url="localhost:8001")
 
     async def test_simple_infer(self):
         model = "client_plugin_test"
         inputs = prepare_infer_inputs(self._headers)
         self._client.register_plugin(self._plugin)
         response = await self._client.infer(model_name=model, inputs=inputs)
-        test_success = response.as_numpy('TEST_SUCCESS')
+        test_success = response.as_numpy("TEST_SUCCESS")
         self.assertEqual(test_success, True)
 
         self._client.unregister_plugin()
         inputs = prepare_infer_inputs({})
         response = await self._client.infer(model_name=model, inputs=inputs)
-        test_success = response.as_numpy('TEST_SUCCESS')
+        test_success = response.as_numpy("TEST_SUCCESS")
         self.assertEqual(test_success, True)
 
     async def asyncTearDown(self):
@@ -86,12 +88,10 @@ async def asyncTearDown(self):
 
 
 class GRPCClientPluginTest(tu.TestResultCollector):
-
     def setUp(self):
-        self._headers = {'my-key': 'my-value'}
+        self._headers = {"my-key": "my-value"}
         self._plugin = TestPlugin(self._headers)
-        self._client = tritongrpcclient.InferenceServerClient(
-            url='localhost:8001')
+        self._client = tritongrpcclient.InferenceServerClient(url="localhost:8001")
 
     def test_simple_infer(self):
         # Set the binary data to False so that 'Inference-Header-Length' is not
@@ -101,7 +101,7 @@ def test_simple_infer(self):
         self._client.register_plugin(self._plugin)
         self.assertEqual(self._plugin, self._client.plugin())
         response = self._client.infer(model_name=model, inputs=inputs)
-        test_success = response.as_numpy('TEST_SUCCESS')
+        test_success = response.as_numpy("TEST_SUCCESS")
         self.assertEqual(test_success, True)
 
         # Unregister the plugin
@@ -109,12 +109,12 @@ def test_simple_infer(self):
         self._client.unregister_plugin()
         self.assertEqual(None, self._client.plugin())
         response = self._client.infer(model_name=model, inputs=inputs)
-        test_success = response.as_numpy('TEST_SUCCESS')
+        test_success = response.as_numpy("TEST_SUCCESS")
         self.assertEqual(test_success, True)
 
     def tearDown(self):
         self._client.close()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_grpc/python_grpc_aio_test.py b/qa/L0_grpc/python_grpc_aio_test.py
old mode 100644
new mode 100755
index 88c08b8ab6..f5b3a8f958
--- a/qa/L0_grpc/python_grpc_aio_test.py
+++ b/qa/L0_grpc/python_grpc_aio_test.py
@@ -32,13 +32,10 @@
 
 
 class TestGrpcAioClient(unittest.IsolatedAsyncioTestCase):
-    """Test if aio rpc can reach the server
-
-    """
+    """Test if aio rpc can reach the server"""
 
     def setUp(self):
-        self._triton_client = grpcclient.InferenceServerClient(
-            url="localhost:8001")
+        self._triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
 
     async def asyncTearDown(self):
         await self._triton_client.close()
@@ -73,15 +70,15 @@ async def test_get_model_repository_index(self):
 
     async def test_load_model(self):
         with self.assertRaisesRegex(
-                InferenceServerException,
-                "\[StatusCode\.UNAVAILABLE\] explicit model load / unload is not allowed if polling is enabled"
+            InferenceServerException,
+            "\[StatusCode\.UNAVAILABLE\] explicit model load / unload is not allowed if polling is enabled",
         ):
             await self._triton_client.load_model("simple")
 
     async def test_unload_model(self):
         with self.assertRaisesRegex(
-                InferenceServerException,
-                "\[StatusCode\.UNAVAILABLE\] explicit model load / unload is not allowed if polling is enabled"
+            InferenceServerException,
+            "\[StatusCode\.UNAVAILABLE\] explicit model load / unload is not allowed if polling is enabled",
         ):
             await self._triton_client.load_model("simple")
 
@@ -99,8 +96,8 @@ async def test_get_system_shared_memory_status(self):
 
     async def test_register_system_shared_memory(self):
         with self.assertRaisesRegex(
-                InferenceServerException,
-                "\[StatusCode\.INTERNAL\] Unable to open shared memory region: ''"
+            InferenceServerException,
+            "\[StatusCode\.INTERNAL\] Unable to open shared memory region: ''",
         ):
             await self._triton_client.register_system_shared_memory("", "", 0)
 
@@ -112,8 +109,8 @@ async def test_get_cuda_shared_memory_status(self):
 
     async def test_register_cuda_shared_memory(self):
         with self.assertRaisesRegex(
-                InferenceServerException,
-                "\[StatusCode\.INVALID_ARGUMENT\] failed to register CUDA shared memory region '': failed to open CUDA IPC handle: invalid argument"
+            InferenceServerException,
+            "\[StatusCode\.INVALID_ARGUMENT\] failed to register CUDA shared memory region '': failed to open CUDA IPC handle: invalid argument",
         ):
             await self._triton_client.register_cuda_shared_memory("", b"", 0, 0)
 
diff --git a/qa/L0_grpc/python_unit_test.py b/qa/L0_grpc/python_unit_test.py
old mode 100644
new mode 100755
index db2a63f0a5..0fb6d97554
--- a/qa/L0_grpc/python_unit_test.py
+++ b/qa/L0_grpc/python_unit_test.py
@@ -25,20 +25,19 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import unittest
-import numpy as np
+import queue
 import time
-
-import tritonclient.grpc as grpcclient
-from tritonclient.utils import InferenceServerException
+import unittest
 
 # For stream infer test
 from functools import partial
-import queue
 
+import numpy as np
+import tritonclient.grpc as grpcclient
+from tritonclient.utils import InferenceServerException
 
-class UserData:
 
+class UserData:
     def __init__(self):
         self._completed_requests = queue.Queue()
 
@@ -51,7 +50,6 @@ def callback(user_data, result, error):
 
 
 class RestrictedProtocolTest(unittest.TestCase):
-
     def setUp(self):
         self.client_ = grpcclient.InferenceServerClient(url="localhost:8001")
         self.model_name_ = "simple"
@@ -61,55 +59,61 @@ def setUp(self):
     def test_sanity(self):
         self.client_.get_inference_statistics("simple")
         self.client_.get_inference_statistics(
-            "simple", headers={self.prefix_ + "infer-key": "infer-value"})
+            "simple", headers={self.prefix_ + "infer-key": "infer-value"}
+        )
 
     # health, infer, model repository protocols are restricted.
     # health and infer expects "triton-grpc-restricted-infer-key : infer-value" header,
     # model repository expected "triton-grpc-restricted-admin-key : admin-value".
     def test_model_repository(self):
-        with self.assertRaisesRegex(InferenceServerException,
-                                    "This protocol is restricted"):
+        with self.assertRaisesRegex(
+            InferenceServerException, "This protocol is restricted"
+        ):
             self.client_.unload_model(
-                self.model_name_,
-                headers={self.prefix_ + "infer-key": "infer-value"})
+                self.model_name_, headers={self.prefix_ + "infer-key": "infer-value"}
+            )
         # Request go through and get actual transaction error
         with self.assertRaisesRegex(
-                InferenceServerException,
-                "explicit model load / unload is not allowed"):
+            InferenceServerException, "explicit model load / unload is not allowed"
+        ):
             self.client_.unload_model(
-                self.model_name_,
-                headers={self.prefix_ + "admin-key": "admin-value"})
+                self.model_name_, headers={self.prefix_ + "admin-key": "admin-value"}
+            )
 
     def test_health(self):
-        with self.assertRaisesRegex(InferenceServerException,
-                                    "This protocol is restricted"):
+        with self.assertRaisesRegex(
+            InferenceServerException, "This protocol is restricted"
+        ):
             self.client_.is_server_live()
         self.client_.is_server_live({self.prefix_ + "infer-key": "infer-value"})
 
     def test_infer(self):
         # setup
         inputs = [
-            grpcclient.InferInput('INPUT0', [1, 16], "INT32"),
-            grpcclient.InferInput('INPUT1', [1, 16], "INT32")
+            grpcclient.InferInput("INPUT0", [1, 16], "INT32"),
+            grpcclient.InferInput("INPUT1", [1, 16], "INT32"),
         ]
         inputs[0].set_data_from_numpy(np.ones(shape=(1, 16), dtype=np.int32))
         inputs[1].set_data_from_numpy(np.ones(shape=(1, 16), dtype=np.int32))
 
         # This test only care if the request goes through
-        with self.assertRaisesRegex(InferenceServerException,
-                                    "This protocol is restricted"):
-            results = self.client_.infer(model_name=self.model_name_,
-                                         inputs=inputs,
-                                         headers={'test': '1'})
-        self.client_.infer(model_name=self.model_name_,
-                           inputs=inputs,
-                           headers={self.prefix_ + "infer-key": "infer-value"})
+        with self.assertRaisesRegex(
+            InferenceServerException, "This protocol is restricted"
+        ):
+            results = self.client_.infer(
+                model_name=self.model_name_, inputs=inputs, headers={"test": "1"}
+            )
+        self.client_.infer(
+            model_name=self.model_name_,
+            inputs=inputs,
+            headers={self.prefix_ + "infer-key": "infer-value"},
+        )
 
     def test_stream_infer(self):
         # setup
         inputs = [
-            grpcclient.InferInput('INPUT0', [1, 16], "INT32"),
-            grpcclient.InferInput('INPUT1', [1, 16], "INT32")
+            grpcclient.InferInput("INPUT0", [1, 16], "INT32"),
+            grpcclient.InferInput("INPUT1", [1, 16], "INT32"),
         ]
         inputs[0].set_data_from_numpy(np.ones(shape=(1, 16), dtype=np.int32))
         inputs[1].set_data_from_numpy(np.ones(shape=(1, 16), dtype=np.int32))
@@ -120,18 +124,18 @@ def test_stream_infer(self):
         # the stream.
         # So on client side, it will always perceive that the stream is
         # successfully created and can only check its health at a later time.
-        self.client_.start_stream(partial(callback, user_data),
-                                  headers={'test': '1'})
+        self.client_.start_stream(partial(callback, user_data), headers={"test": "1"})
         # wait for sufficient round-trip time
         time.sleep(1)
-        with self.assertRaisesRegex(InferenceServerException,
-                                    "The stream is no longer in valid state"):
-            self.client_.async_stream_infer(model_name=self.model_name_,
-                                            inputs=inputs)
+        with self.assertRaisesRegex(
+            InferenceServerException, "The stream is no longer in valid state"
+        ):
+            self.client_.async_stream_infer(model_name=self.model_name_, inputs=inputs)
         # callback should record error detail
         self.assertFalse(user_data._completed_requests.empty())
-        with self.assertRaisesRegex(InferenceServerException,
-                                    "This protocol is restricted"):
+        with self.assertRaisesRegex(
+            InferenceServerException, "This protocol is restricted"
+        ):
             raise user_data._completed_requests.get()
 
         self.assertTrue(user_data._completed_requests.empty())
@@ -140,14 +144,15 @@ def test_stream_infer(self):
         self.client_.stop_stream()
         self.client_.start_stream(
             partial(callback, user_data),
-            headers={self.prefix_ + "infer-key": "infer-value"})
-        self.client_.async_stream_infer(model_name=self.model_name_,
-                                        inputs=inputs)
+            headers={self.prefix_ + "infer-key": "infer-value"},
+        )
+        self.client_.async_stream_infer(model_name=self.model_name_, inputs=inputs)
         # wait for response
         time.sleep(1)
         self.assertFalse(user_data._completed_requests.empty())
-        self.assertNotEqual(type(user_data._completed_requests.get()),
-                            InferenceServerException)
+        self.assertNotEqual(
+            type(user_data._completed_requests.get()), InferenceServerException
+        )
 
 
 if __name__ == "__main__":
diff --git a/qa/L0_grpc/test.sh b/qa/L0_grpc/test.sh
old mode 100644
new mode 100755
index 923479836d..90d34a8738
--- a/qa/L0_grpc/test.sh
+++ b/qa/L0_grpc/test.sh
@@ -490,7 +490,7 @@ wait $SERVER_PID
 # Run cpp client unit test
 rm -rf unit_test_models && mkdir unit_test_models
 cp -r $DATADIR/qa_model_repository/onnx_int32_int32_int32 unit_test_models/.
-cp -r ${MODELDIR}/simple unit_test_models/. 
+cp -r ${MODELDIR}/simple unit_test_models/.
 
 SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=unit_test_models
             --trace-file=global_unittest.log --trace-level=TIMESTAMPS --trace-rate=1"
diff --git a/qa/L0_http/http_basic_auth_test.py b/qa/L0_http/http_basic_auth_test.py
old mode 100644
new mode 100755
index 21aa96dc5e..5aa1f71d81
--- a/qa/L0_http/http_basic_auth_test.py
+++ b/qa/L0_http/http_basic_auth_test.py
@@ -24,26 +24,23 @@
 # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import unittest
 import sys
+import unittest
 
 sys.path.append("../common")
 
 import test_util as tu
 import tritonclient.http as tritonhttpclient
 import tritonclient.http.aio as asynctritonhttpclient
-
-from tritonclient.http.auth import BasicAuth
 from tritonclient.http.aio.auth import BasicAuth as AsyncBasicAuth
+from tritonclient.http.auth import BasicAuth
 
 
 class HTTPBasicAuthTest(tu.TestResultCollector):
-
     def setUp(self):
         # Use the nginx port
-        self._client = tritonhttpclient.InferenceServerClient(
-            url='localhost:8004')
-        self._client.register_plugin(BasicAuth('username', 'password'))
+        self._client = tritonhttpclient.InferenceServerClient(url="localhost:8004")
+        self._client.register_plugin(BasicAuth("username", "password"))
 
     def test_client_call(self):
         self.assertTrue(self._client.is_server_live())
@@ -53,12 +50,10 @@ def tearDown(self):
 
 
 class HTTPBasicAuthAsyncTest(unittest.IsolatedAsyncioTestCase):
-
     async def asyncSetUp(self):
         # Use the nginx port
-        self._client = asynctritonhttpclient.InferenceServerClient(
-            url='localhost:8004')
-        self._client.register_plugin(AsyncBasicAuth('username', 'password'))
+        self._client = asynctritonhttpclient.InferenceServerClient(url="localhost:8004")
+        self._client.register_plugin(AsyncBasicAuth("username", "password"))
 
     async def test_client_call(self):
         self.assertTrue(await self._client.is_server_live())
@@ -67,5 +62,5 @@ async def asyncTearDown(self):
         await self._client.close()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_http/http_client_plugin_test.py b/qa/L0_http/http_client_plugin_test.py
old mode 100644
new mode 100755
index e110b9fdea..963ea2a81b
--- a/qa/L0_http/http_client_plugin_test.py
+++ b/qa/L0_http/http_client_plugin_test.py
@@ -30,18 +30,18 @@
 sys.path.append("../common")
 
 import unittest
-from unittest.mock import AsyncMock, patch, MagicMock
+from unittest.mock import AsyncMock, MagicMock, patch
+
 import numpy as np
 import test_util as tu
 import tritonclient.http as tritonhttpclient
+import tritonclient.http.aio as asynctritonhttpclient
 from tritonclient.http import InferenceServerClientPlugin
 from tritonclient.utils import np_to_triton_dtype
-import tritonclient.http.aio as asynctritonhttpclient
 
 
 # A simple plugin that adds headers to the inference request.
 class TestPlugin(InferenceServerClientPlugin):
-
     def __init__(self, headers):
         self._headers = headers
 
@@ -50,12 +50,10 @@ def __call__(self, request):
 
 
 class HTTPClientPluginAsyncTest(unittest.IsolatedAsyncioTestCase):
-
     async def asyncSetUp(self):
-        self._headers = {'MY-KEY': 'MY-VALUE'}
+        self._headers = {"MY-KEY": "MY-VALUE"}
         self._plugin = TestPlugin(self._headers)
-        self._client = asynctritonhttpclient.InferenceServerClient(
-            url='localhost:8001')
+        self._client = asynctritonhttpclient.InferenceServerClient(url="localhost:8001")
 
     async def test_server_is_live(self):
         # We are testing is_server_live as an example API that uses GET method
@@ -65,15 +63,15 @@ async def test_server_is_live(self):
         self._client.register_plugin(self._plugin)
         self.assertEqual(self._plugin, self._client.plugin())
         await self._client.is_server_live()
-        self._client._stub.get.assert_awaited_with(url=unittest.mock.ANY,
-                                                   headers=self._headers)
+        self._client._stub.get.assert_awaited_with(
+            url=unittest.mock.ANY, headers=self._headers
+        )
 
         # Make sure unregistering the plugin would no longer add the headers
         self._client.unregister_plugin()
         self.assertEqual(None, self._client.plugin())
         await self._client.is_server_live()
-        self._client._stub.get.assert_awaited_with(url=unittest.mock.ANY,
-                                                   headers={})
+        self._client._stub.get.assert_awaited_with(url=unittest.mock.ANY, headers={})
 
     async def test_simple_infer(self):
         # Only the read function must return async
@@ -87,21 +85,22 @@ async def test_simple_infer(self):
         # Setup inputs
         inputs = []
         inputs.append(
-            tritonhttpclient.InferInput('INPUT0', np_input.shape,
-                                        np_to_triton_dtype(np_input.dtype)))
+            tritonhttpclient.InferInput(
+                "INPUT0", np_input.shape, np_to_triton_dtype(np_input.dtype)
+            )
+        )
 
         # Set the binary data to False so that 'Inference-Header-Length' is not
         # added to the headers.
         inputs[0].set_data_from_numpy(np_input, binary_data=False)
 
         async def run_infer(headers):
-            with patch('tritonclient.http.aio._raise_if_error'):
-                with patch('tritonclient.http.aio.InferResult'):
+            with patch("tritonclient.http.aio._raise_if_error"):
+                with patch("tritonclient.http.aio.InferResult"):
                     await self._client.infer(model_name=model, inputs=inputs)
                     self._client._stub.post.assert_awaited_with(
-                        url=unittest.mock.ANY,
-                        data=unittest.mock.ANY,
-                        headers=headers)
+                        url=unittest.mock.ANY, data=unittest.mock.ANY, headers=headers
+                    )
 
         self._client.register_plugin(self._plugin)
         await run_infer(self._headers)
@@ -114,12 +113,10 @@ async def asyncTearDown(self):
 
 
 class HTTPClientPluginTest(tu.TestResultCollector):
-
     def setUp(self):
-        self._headers = {'MY-KEY': 'MY-VALUE'}
+        self._headers = {"MY-KEY": "MY-VALUE"}
         self._plugin = TestPlugin(self._headers)
-        self._client = tritonhttpclient.InferenceServerClient(
-            url='localhost:8001')
+        self._client = tritonhttpclient.InferenceServerClient(url="localhost:8001")
 
         # Use magic mock for the client stub
         self._client._client_stub = MagicMock()
@@ -129,14 +126,14 @@ def test_server_is_live(self):
         # for communication with the server.
         self._client.register_plugin(self._plugin)
         self._client.is_server_live()
-        self._client._client_stub.get.assert_called_with(unittest.mock.ANY,
-                                                         headers=self._headers)
+        self._client._client_stub.get.assert_called_with(
+            unittest.mock.ANY, headers=self._headers
+        )
 
         # Make sure unregistering the plugin would no longer add the headers
         self._client.unregister_plugin()
         self._client.is_server_live()
-        self._client._client_stub.get.assert_called_with(unittest.mock.ANY,
-                                                         headers={})
+        self._client._client_stub.get.assert_called_with(unittest.mock.ANY, headers={})
 
     def test_simple_infer(self):
         np_input = np.arange(8, dtype=np.float32).reshape(1, -1)
@@ -145,21 +142,24 @@ def test_simple_infer(self):
         # Setup inputs
         inputs = []
         inputs.append(
-            tritonhttpclient.InferInput('INPUT0', np_input.shape,
-                                        np_to_triton_dtype(np_input.dtype)))
+            tritonhttpclient.InferInput(
+                "INPUT0", np_input.shape, np_to_triton_dtype(np_input.dtype)
+            )
+        )
 
         # Set the binary data to False so that 'Inference-Header-Length' is not
         # added to the headers.
         inputs[0].set_data_from_numpy(np_input, binary_data=False)
 
         def run_infer(headers):
-            with patch('tritonclient.http._client._raise_if_error'):
-                with patch('tritonclient.http._client.InferResult'):
+            with patch("tritonclient.http._client._raise_if_error"):
+                with patch("tritonclient.http._client.InferResult"):
                     self._client.infer(model_name=model, inputs=inputs)
                     self._client._client_stub.post.assert_called_with(
                         request_uri=unittest.mock.ANY,
                         body=unittest.mock.ANY,
-                        headers=headers)
+                        headers=headers,
+                    )
 
         self._client.register_plugin(self._plugin)
         run_infer(self._headers)
@@ -171,5 +171,5 @@ def tearDown(self):
         self._client.close()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
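
For reference, the plugin mechanism exercised above is just a callable invoked before every outgoing request; a minimal sketch follows, not part of the patch, mirroring the TestPlugin pattern in this file. The header values and port are illustrative only, and the assumption is that the request object passed to the plugin exposes a mutable headers dict as the test relies on.

# Minimal sketch of an HTTP client plugin that injects headers on every request.
import tritonclient.http as tritonhttpclient
from tritonclient.http import InferenceServerClientPlugin


class HeaderPlugin(InferenceServerClientPlugin):
    def __init__(self, headers):
        self._headers = headers

    def __call__(self, request):
        # Called before each request; add our headers to the outgoing set.
        request.headers.update(self._headers)


client = tritonhttpclient.InferenceServerClient(url="localhost:8000")
client.register_plugin(HeaderPlugin({"MY-KEY": "MY-VALUE"}))
client.is_server_live()  # request carries "MY-KEY: MY-VALUE"
client.unregister_plugin()
client.close()
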
diff --git a/qa/L0_http/http_test.py b/qa/L0_http/http_test.py
old mode 100644
new mode 100755
index 2482227a8f..a6fa0bcccd
--- a/qa/L0_http/http_test.py
+++ b/qa/L0_http/http_test.py
@@ -29,40 +29,39 @@
 
 sys.path.append("../common")
 
-import requests
 import unittest
+
 import numpy as np
+import requests
 import test_util as tu
 import tritonclient.http as tritonhttpclient
-from tritonclient.utils import np_to_triton_dtype, InferenceServerException
+from tritonclient.utils import InferenceServerException, np_to_triton_dtype
 
 
 class HttpTest(tu.TestResultCollector):
-
     def _get_infer_url(self, model_name):
         return "http://localhost:8000/v2/models/{}/infer".format(model_name)
 
-    def _raw_binary_helper(self,
-                           model,
-                           input_bytes,
-                           expected_output_bytes,
-                           extra_headers={}):
+    def _raw_binary_helper(
+        self, model, input_bytes, expected_output_bytes, extra_headers={}
+    ):
         # Select model that satisfies constraints for raw binary request
-        headers = {'Inference-Header-Content-Length': '0'}
+        headers = {"Inference-Header-Content-Length": "0"}
         # Add extra headers (if any) before sending request
         headers.update(extra_headers)
-        r = requests.post(self._get_infer_url(model),
-                          data=input_bytes,
-                          headers=headers)
+        r = requests.post(self._get_infer_url(model), data=input_bytes, headers=headers)
         r.raise_for_status()
 
         # Get the inference header size so we can locate the output binary data
         header_size = int(r.headers["Inference-Header-Content-Length"])
         # Assert input == output since this tests an identity model
         self.assertEqual(
-            expected_output_bytes, r.content[header_size:],
-            "Expected response body contains correct output binary data: {}; got: {}"
-            .format(expected_output_bytes, r.content[header_size:]))
+            expected_output_bytes,
+            r.content[header_size:],
+            "Expected response body contains correct output binary data: {}; got: {}".format(
+                expected_output_bytes, r.content[header_size:]
+            ),
+        )
 
     def test_raw_binary(self):
         model = "onnx_zero_1_float32"
@@ -80,54 +79,61 @@ def test_byte(self):
         # i.e. BYTE type the element count must be 1
         model = "onnx_zero_1_object_1_element"
         input = "427"
-        headers = {'Inference-Header-Content-Length': '0'}
-        r = requests.post(self._get_infer_url(model),
-                          data=input,
-                          headers=headers)
+        headers = {"Inference-Header-Content-Length": "0"}
+        r = requests.post(self._get_infer_url(model), data=input, headers=headers)
         r.raise_for_status()
 
         # Get the inference header size so we can locate the output binary data
         header_size = int(r.headers["Inference-Header-Content-Length"])
         # Triton returns BYTES tensor with byte size prepended
-        output = r.content[header_size + 4:].decode()
+        output = r.content[header_size + 4 :].decode()
         self.assertEqual(
-            input, output,
-            "Expected response body contains correct output binary data: {}; got: {}"
-            .format(input, output))
+            input,
+            output,
+            "Expected response body contains correct output binary data: {}; got: {}".format(
+                input, output
+            ),
+        )
 
     def test_byte_too_many_elements(self):
         # Select model that doesn't satisfy constraints for raw binary request
         # i.e. BYTE type the element count must be 1
         model = "onnx_zero_1_object"
         input = "427"
-        headers = {'Inference-Header-Content-Length': '0'}
-        r = requests.post(self._get_infer_url(model),
-                          data=input,
-                          headers=headers)
+        headers = {"Inference-Header-Content-Length": "0"}
+        r = requests.post(self._get_infer_url(model), data=input, headers=headers)
         self.assertEqual(
-            400, r.status_code,
+            400,
+            r.status_code,
             "Expected error code {} returned for the request; got: {}".format(
-                400, r.status_code))
+                400, r.status_code
+            ),
+        )
         self.assertIn(
             "For BYTE datatype raw input, the model must have input shape [1]",
-            r.content.decode())
+            r.content.decode(),
+        )
 
     def test_multi_variable_dimensions(self):
         # Select model that doesn't satisfy constraints for raw binary request
         # i.e. this model has multiple variable-sized dimensions
         model = "onnx_zero_1_float16"
         input = np.ones([2, 2], dtype=np.float16)
-        headers = {'Inference-Header-Content-Length': '0'}
-        r = requests.post(self._get_infer_url(model),
-                          data=input.tobytes(),
-                          headers=headers)
+        headers = {"Inference-Header-Content-Length": "0"}
+        r = requests.post(
+            self._get_infer_url(model), data=input.tobytes(), headers=headers
+        )
         self.assertEqual(
-            400, r.status_code,
+            400,
+            r.status_code,
             "Expected error code {} returned for the request; got: {}".format(
-                400, r.status_code))
+                400, r.status_code
+            ),
+        )
         self.assertIn(
             "The shape of the raw input 'INPUT0' can not be deduced because there are more than one variable-sized dimension",
-            r.content.decode())
+            r.content.decode(),
+        )
 
     def test_multi_inputs(self):
         # Select model that doesn't satisfy constraints for raw binary request
@@ -136,21 +142,25 @@ def test_multi_inputs(self):
         # Use one numpy array, after tobytes() it can be seen as three inputs
         # each with 8 elements (this ambiguity is why this is not allowed)
         input = np.arange(24, dtype=np.float32)
-        headers = {'Inference-Header-Content-Length': '0'}
-        r = requests.post(self._get_infer_url(model),
-                          data=input.tobytes(),
-                          headers=headers)
+        headers = {"Inference-Header-Content-Length": "0"}
+        r = requests.post(
+            self._get_infer_url(model), data=input.tobytes(), headers=headers
+        )
         self.assertEqual(
-            400, r.status_code,
+            400,
+            r.status_code,
             "Expected error code {} returned for the request; got: {}".format(
-                400, r.status_code))
+                400, r.status_code
+            ),
+        )
         self.assertIn(
             "Raw request must only have 1 input (found 1) to be deduced but got 3 inputs in",
-            r.content.decode())
+            r.content.decode(),
+        )
 
     # This is to test that a properly chunk-encoded request by the caller works,
     # though Triton does not specifically do any special chunk handling outside
-    # of underlying HTTP libaries used
+    # of underlying HTTP libraries used
     # Future Enhancement: Test other encodings as they come up
     def test_content_encoding_chunked_manually(self):
         # Similar to test_raw_binary but test with extra headers
@@ -165,9 +175,8 @@ def test_content_encoding_chunked_manually(self):
         # Chunk bytes and line separator
         chunk_encoded_input += input_bytes + b"\r\n"
         # Final byte (0) and end message
-        chunk_encoded_input += b'0\r\n\r\n'
-        self._raw_binary_helper(model, chunk_encoded_input, input_bytes,
-                                extra_headers)
+        chunk_encoded_input += b"0\r\n\r\n"
+        self._raw_binary_helper(model, chunk_encoded_input, input_bytes, extra_headers)
 
     # Test that Python client rejects any "Transfer-Encoding" HTTP headers
     # as we don't specially handle encoding requests for the user through
@@ -183,20 +192,19 @@ def test_content_encoding_unsupported_client(self):
                 inputs = []
                 inputs.append(
                     tritonhttpclient.InferInput(
-                        'INPUT0', np_input.shape,
-                        np_to_triton_dtype(np_input.dtype)))
+                        "INPUT0", np_input.shape, np_to_triton_dtype(np_input.dtype)
+                    )
+                )
                 inputs[0].set_data_from_numpy(np_input)
 
-                with tritonhttpclient.InferenceServerClient(
-                        "localhost:8000") as client:
+                with tritonhttpclient.InferenceServerClient("localhost:8000") as client:
                     # Python client is expected to raise an exception to reject
                     # 'content-encoding' HTTP headers.
-                    with self.assertRaisesRegex(InferenceServerException,
-                                                "Unsupported HTTP header"):
-                        client.infer(model_name=model,
-                                     inputs=inputs,
-                                     headers=headers)
+                    with self.assertRaisesRegex(
+                        InferenceServerException, "Unsupported HTTP header"
+                    ):
+                        client.infer(model_name=model, inputs=inputs, headers=headers)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
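
For reference, the raw-binary path driven by _raw_binary_helper above can be reproduced with plain requests: send the tensor bytes with "Inference-Header-Content-Length: 0", then use the same header on the response to skip the JSON portion of the reply. A minimal sketch, not part of the patch; the model name comes from the test, while the input shape is an assumption.

# Minimal sketch of a raw binary inference request against an identity model.
import numpy as np
import requests

model = "onnx_zero_1_float32"
url = "http://localhost:8000/v2/models/{}/infer".format(model)
input_bytes = np.arange(8, dtype=np.float32).tobytes()

# Zero header length tells Triton the body is raw tensor data only.
r = requests.post(
    url, data=input_bytes, headers={"Inference-Header-Content-Length": "0"}
)
r.raise_for_status()

# The response prepends a JSON inference header; its length is reported back.
header_size = int(r.headers["Inference-Header-Content-Length"])
output_bytes = r.content[header_size:]
assert output_bytes == input_bytes  # identity model echoes the input
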
diff --git a/qa/L0_http/python_http_aio_test.py b/qa/L0_http/python_http_aio_test.py
old mode 100644
new mode 100755
index d31b9c71f2..bd8d342bb1
--- a/qa/L0_http/python_http_aio_test.py
+++ b/qa/L0_http/python_http_aio_test.py
@@ -32,12 +32,10 @@
 
 
 class TestHttpAioClient(unittest.IsolatedAsyncioTestCase):
-    """Test if aio rpc can reach the server
-    """
+    """Test if aio rpc can reach the server"""
 
     async def asyncSetUp(self):
-        self._triton_client = httpclient.InferenceServerClient(
-            url="localhost:8000")
+        self._triton_client = httpclient.InferenceServerClient(url="localhost:8000")
 
     async def asyncTearDown(self):
         await self._triton_client.close()
@@ -72,15 +70,15 @@ async def test_get_model_repository_index(self):
 
     async def test_load_model(self):
         with self.assertRaisesRegex(
-                InferenceServerException,
-                "explicit model load / unload is not allowed if polling is enabled"
+            InferenceServerException,
+            "explicit model load / unload is not allowed if polling is enabled",
         ):
             await self._triton_client.load_model("simple")
 
     async def test_unload_model(self):
         with self.assertRaisesRegex(
-                InferenceServerException,
-                "explicit model load / unload is not allowed if polling is enabled"
+            InferenceServerException,
+            "explicit model load / unload is not allowed if polling is enabled",
         ):
             await self._triton_client.load_model("simple")
 
diff --git a/qa/L0_http/test.sh b/qa/L0_http/test.sh
old mode 100644
new mode 100755
index 56c1782879..c08a5fba74
--- a/qa/L0_http/test.sh
+++ b/qa/L0_http/test.sh
@@ -251,7 +251,7 @@ fi
 
 # Create a password file with username:password
 echo -n 'username:' > pswd
-echo "password" | openssl passwd -stdin -apr1 >> pswd  
+echo "password" | openssl passwd -stdin -apr1 >> pswd
 nginx -c `pwd`/$NGINX_CONF
 
 python3 $BASIC_AUTH_TEST
@@ -504,7 +504,7 @@ wait $SERVER_PID
 # Run cpp client unit test
 rm -rf unit_test_models && mkdir unit_test_models
 cp -r $DATADIR/qa_model_repository/onnx_int32_int32_int32 unit_test_models/.
-cp -r ${MODELDIR}/simple unit_test_models/. 
+cp -r ${MODELDIR}/simple unit_test_models/.
 
 SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=unit_test_models
             --trace-file=global_unittest.log --trace-level=TIMESTAMPS --trace-rate=1"
diff --git a/qa/L0_http_fuzz/fuzztest.py b/qa/L0_http_fuzz/fuzztest.py
old mode 100644
new mode 100755
index 4c2704ec40..8e84ffffc7
--- a/qa/L0_http_fuzz/fuzztest.py
+++ b/qa/L0_http_fuzz/fuzztest.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -28,28 +30,29 @@
 
 sys.path.append("../common")
 
+import glob
+import os
+import sqlite3
 import unittest
+
 import test_util as tu
-import sqlite3
 from boofuzz import *
-import glob
-import os
 
 
 class FuzzTest(tu.TestResultCollector):
-
     def _run_fuzz(self, url, logger):
         session = Session(
             target=Target(connection=TCPSocketConnection("127.0.0.1", 8000)),
             fuzz_loggers=logger,
-            keep_web_open=False)
+            keep_web_open=False,
+        )
 
         s_initialize(name="Request" + url)
         with s_block("Request-Line"):
-            s_group("Method", [
-                "GET", "HEAD", "POST", "PUT", "DELETE", "CONNECT", "OPTIONS",
-                "TRACE"
-            ])
+            s_group(
+                "Method",
+                ["GET", "HEAD", "POST", "PUT", "DELETE", "CONNECT", "OPTIONS", "TRACE"],
+            )
             s_delim(" ", name="space-1")
             s_string(url, name="Request-URI")
             s_delim(" ", name="space-2")
@@ -62,28 +65,36 @@ def _run_fuzz(self, url, logger):
 
     def test_failures_from_db(self):
         url_list = [
-            "/v2", "/v2/models/simple", "/v2/models/simple/infer",
-            "/v2/models/simple/versions/v1", "/v2/models/simple/config",
-            "/v2/models/simple/stats", "/v2/models/simple/ready",
-            "/v2/health/ready", "/v2/health/live", "/v2/repository/index",
+            "/v2",
+            "/v2/models/simple",
+            "/v2/models/simple/infer",
+            "/v2/models/simple/versions/v1",
+            "/v2/models/simple/config",
+            "/v2/models/simple/stats",
+            "/v2/models/simple/ready",
+            "/v2/health/ready",
+            "/v2/health/live",
+            "/v2/repository/index",
             "/v2/repository/models/simple/unload",
             "/v2/repository/models/simple/load",
-            "/v2/systemsharedmemory/status", "/v2/systemsharedmemory/register",
+            "/v2/systemsharedmemory/status",
+            "/v2/systemsharedmemory/register",
             "/v2/systemsharedmemory/unregister",
             "/v2/systemsharedmemory/region/xx/status",
-            "/v2/cudasharedmemory/status", "/v2/cudasharedmemory/register",
+            "/v2/cudasharedmemory/status",
+            "/v2/cudasharedmemory/register",
             "/v2/cudasharedmemory/unregister",
-            "/v2/cudasharedmemory/region/xx/status"
+            "/v2/cudasharedmemory/region/xx/status",
         ]
 
-        csv_log = open('fuzz_results.csv', 'w')
+        csv_log = open("fuzz_results.csv", "w")
         logger = [FuzzLoggerCsv(file_handle=csv_log)]
 
         for url in url_list:
             self._run_fuzz(url, logger)
 
             # Get latest db file
-            files = glob.glob('boofuzz-results/*')
+            files = glob.glob("boofuzz-results/*")
             dbfile = max(files, key=os.path.getctime)
 
             conn = sqlite3.connect(dbfile)
@@ -91,10 +102,8 @@ def test_failures_from_db(self):
 
             # Get number of failures, should be 0
             self.assertEqual(
-                len([
-                    x for x in c.execute(
-                        "SELECT * FROM steps WHERE type=\"fail\"")
-                ]), 0)
+                len([x for x in c.execute('SELECT * FROM steps WHERE type="fail"')]), 0
+            )
 
 
 if __name__ == "__main__":
diff --git a/qa/L0_http_fuzz/test.sh b/qa/L0_http_fuzz/test.sh
old mode 100644
new mode 100755
index 372fe5a242..f721135698
--- a/qa/L0_http_fuzz/test.sh
+++ b/qa/L0_http_fuzz/test.sh
@@ -53,15 +53,15 @@ FUZZ_LOG=`pwd`/fuzz.log
 DATADIR=`pwd`/models
 SERVER=/opt/tritonserver/bin/tritonserver
 SERVER_ARGS="--model-repository=$DATADIR"
-source ../common/util.sh 
+source ../common/util.sh
 
 # Remove this once foobuzz and tornado packages upgrade to work with python 3.10
-# This test tests the server's ability to handle poor input and not the compatibility 
+# This test tests the server's ability to handle poor input and not the compatibility
 # with python 3.10. Python 3.8 is ok to use here.
 function_install_python38() {
     source ../L0_backend_python/common.sh
     install_conda
-    create_conda_env "3.8" "python-3-8" 
+    create_conda_env "3.8" "python-3-8"
 
     # Install test script dependencies
     pip3 install --upgrade wheel setuptools boofuzz==0.3.0 numpy pillow attrdict future grpcio requests gsutil \
diff --git a/qa/L0_https/test.sh b/qa/L0_https/test.sh
old mode 100644
new mode 100755
index 7e3f4696d1..7fe03b843e
--- a/qa/L0_https/test.sh
+++ b/qa/L0_https/test.sh
@@ -57,23 +57,23 @@ rm -f *.key *.crt ${CLIENT_LOG}.* server.log
 
 # Generate valid CA
 openssl genrsa -passout pass:1234 -des3 -out ca.key 4096
-openssl req -passin pass:1234 -new -x509 -days 365 -key ca.key -out ca.crt -subj  "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Test/CN=Root CA"
+openssl req -passin pass:1234 -new -x509 -days 365 -key ca.key -out ca.crt -subj  "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Test/CN=Root CA"
 
 # Generate valid Server Key/Cert
 openssl genrsa -passout pass:1234 -des3 -out server.key 4096
-openssl req -passin pass:1234 -new -key server.key -out server.csr -subj  "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Server/CN=localhost"
-openssl x509 -req -passin pass:1234 -days 365 -in server.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out server.crt
+openssl req -passin pass:1234 -new -key server.key -out server.csr -subj  "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Server/CN=localhost"
+openssl x509 -req -passin pass:1234 -days 365 -in server.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out server.crt
 
 # Remove passphrase from the Server Key
-openssl rsa -passin pass:1234 -in server.key -out server.key
+openssl rsa -passin pass:1234 -in server.key -out server.key
 
 # Generate valid Client Key/Cert
 openssl genrsa -passout pass:1234 -des3 -out client.key 4096
-openssl req -passin pass:1234 -new -key client.key -out client.csr -subj  "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Client/CN=localhost"
-openssl x509 -passin pass:1234 -req -days 365 -in client.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out client.crt
+openssl req -passin pass:1234 -new -key client.key -out client.csr -subj  "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Client/CN=localhost"
+openssl x509 -passin pass:1234 -req -days 365 -in client.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out client.crt
 
 # Remove passphrase from Client Key
-openssl rsa -passin pass:1234 -in client.key -out client.key
+openssl rsa -passin pass:1234 -in client.key -out client.key
 
 # Create mutated client key (Make first char of each like capital)
 cp client.key client2.key && sed -i "s/\b\(.\)/\u\1/g" client2.key
@@ -135,7 +135,7 @@ if [ $? -ne 0 ]; then
 fi
 
 # Test failure cases for SSL
-# Try without SSL 
+# Try without SSL
 $SIMPLE_INFER_CLIENT_PY -v -u localhost >> ${CLIENT_LOG}.no_ssl_fail_infer 2>&1
 if [ $? -ne 0 ]; then
     cat ${CLIENT_LOG}.no_ssl_fail_infer
diff --git a/qa/L0_implicit_state/implicit_state.py b/qa/L0_implicit_state/implicit_state.py
old mode 100644
new mode 100755
index 147697cf16..db8053dcb1
--- a/qa/L0_implicit_state/implicit_state.py
+++ b/qa/L0_implicit_state/implicit_state.py
@@ -187,4 +187,4 @@ def test_request_output(self):
 
 
 if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/qa/L0_implicit_state/test.sh b/qa/L0_implicit_state/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_infer/infer_test.py b/qa/L0_infer/infer_test.py
old mode 100644
new mode 100755
index 1e0e172a13..d97803b17d
--- a/qa/L0_infer/infer_test.py
+++ b/qa/L0_infer/infer_test.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -28,67 +30,66 @@
 
 sys.path.append("../common")
 
+import os
 import unittest
-import numpy as np
+
 import infer_util as iu
+import numpy as np
 import test_util as tu
-import os
-
 from tritonclient.utils import *
 
-TEST_SYSTEM_SHARED_MEMORY = bool(
-    int(os.environ.get('TEST_SYSTEM_SHARED_MEMORY', 0)))
-TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get('TEST_CUDA_SHARED_MEMORY',
-                                                  0)))
-CPU_ONLY = (os.environ.get('TRITON_SERVER_CPU_ONLY') is not None)
-TEST_VALGRIND = bool(int(os.environ.get('TEST_VALGRIND', 0)))
+TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0)))
+TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0)))
+CPU_ONLY = os.environ.get("TRITON_SERVER_CPU_ONLY") is not None
+TEST_VALGRIND = bool(int(os.environ.get("TEST_VALGRIND", 0)))
 
-USE_GRPC = (os.environ.get('USE_GRPC', 1) != "0")
-USE_HTTP = (os.environ.get('USE_HTTP', 1) != "0")
+USE_GRPC = os.environ.get("USE_GRPC", 1) != "0"
+USE_HTTP = os.environ.get("USE_HTTP", 1) != "0"
 assert USE_GRPC or USE_HTTP, "USE_GRPC or USE_HTTP must be non-zero"
 
 BACKENDS = os.environ.get(
-    'BACKENDS',
-    "graphdef savedmodel onnx libtorch plan python python_dlpack openvino")
-ENSEMBLES = bool(int(os.environ.get('ENSEMBLES', 1)))
-NOBATCH = bool(int(os.environ.get('NOBATCH', 1)))
-BATCH = bool(int(os.environ.get('BATCH', 1)))
+    "BACKENDS", "graphdef savedmodel onnx libtorch plan python python_dlpack openvino"
+)
+ENSEMBLES = bool(int(os.environ.get("ENSEMBLES", 1)))
+NOBATCH = bool(int(os.environ.get("NOBATCH", 1)))
+BATCH = bool(int(os.environ.get("BATCH", 1)))
 
 np_dtype_string = np.dtype(object)
 
 
 class InferTest(tu.TestResultCollector):
-
     def _full_exact(
-            self,
+        self,
+        input_dtype,
+        output0_dtype,
+        output1_dtype,
+        output0_raw,
+        output1_raw,
+        swap,
+        # 60 sec is the default value
+        network_timeout=60.0,
+    ):
+        def _infer_exact_helper(
+            tester,
+            pf,
+            tensor_shape,
+            batch_size,
             input_dtype,
             output0_dtype,
             output1_dtype,
-            output0_raw,
-            output1_raw,
-            swap,
-            # 60 sec is the default value
-            network_timeout=60.0):
-
-        def _infer_exact_helper(tester,
-                                pf,
-                                tensor_shape,
-                                batch_size,
-                                input_dtype,
-                                output0_dtype,
-                                output1_dtype,
-                                output0_raw=True,
-                                output1_raw=True,
-                                model_version=None,
-                                swap=False,
-                                outputs=("OUTPUT0", "OUTPUT1"),
-                                use_http=USE_HTTP,
-                                use_grpc=USE_GRPC,
-                                use_http_json_tensors=True,
-                                skip_request_id_check=True,
-                                use_streaming=True,
-                                correlation_id=0,
-                                network_timeout=60.0):
+            output0_raw=True,
+            output1_raw=True,
+            model_version=None,
+            swap=False,
+            outputs=("OUTPUT0", "OUTPUT1"),
+            use_http=USE_HTTP,
+            use_grpc=USE_GRPC,
+            use_http_json_tensors=True,
+            skip_request_id_check=True,
+            use_streaming=True,
+            correlation_id=0,
+            network_timeout=60.0,
+        ):
             for bs in (1, batch_size):
                 # model that does not support batching
                 if NOBATCH:
@@ -114,13 +115,15 @@ def _infer_exact_helper(tester,
                             correlation_id=correlation_id,
                             use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                             use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
-                            network_timeout=network_timeout)
+                            network_timeout=network_timeout,
+                        )
 
                 if BATCH:
                     # model that supports batching.
                     iu.infer_exact(
                         tester,
-                        pf, (bs,) + tensor_shape,
+                        pf,
+                        (bs,) + tensor_shape,
                         bs,
                         input_dtype,
                         output0_dtype,
@@ -138,7 +141,8 @@ def _infer_exact_helper(tester,
                         correlation_id=correlation_id,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
                         use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
-                        network_timeout=network_timeout)
+                        network_timeout=network_timeout,
+                    )
 
         input_size = 16
 
@@ -146,89 +150,131 @@ def _infer_exact_helper(tester,
         ensemble_prefix = [""]
         if ENSEMBLES:
             for prefix in all_ensemble_prefix:
-                if tu.validate_for_ensemble_model(prefix, input_dtype,
-                                                  output0_dtype, output1_dtype,
-                                                  (input_size,), (input_size,),
-                                                  (input_size,)):
+                if tu.validate_for_ensemble_model(
+                    prefix,
+                    input_dtype,
+                    output0_dtype,
+                    output1_dtype,
+                    (input_size,),
+                    (input_size,),
+                    (input_size,),
+                ):
                     ensemble_prefix.append(prefix)
 
-        if tu.validate_for_tf_model(input_dtype, output0_dtype, output1_dtype,
-                                    (input_size,), (input_size,),
-                                    (input_size,)):
+        if tu.validate_for_tf_model(
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            (input_size,),
+            (input_size,),
+            (input_size,),
+        ):
             for prefix in ensemble_prefix:
                 for pf in ["graphdef", "savedmodel"]:
                     if pf in BACKENDS:
-                        _infer_exact_helper(self,
-                                            prefix + pf, (input_size,),
-                                            8,
-                                            input_dtype,
-                                            output0_dtype,
-                                            output1_dtype,
-                                            output0_raw=output0_raw,
-                                            output1_raw=output1_raw,
-                                            swap=swap,
-                                            network_timeout=network_timeout)
+                        _infer_exact_helper(
+                            self,
+                            prefix + pf,
+                            (input_size,),
+                            8,
+                            input_dtype,
+                            output0_dtype,
+                            output1_dtype,
+                            output0_raw=output0_raw,
+                            output1_raw=output1_raw,
+                            swap=swap,
+                            network_timeout=network_timeout,
+                        )
 
         if not CPU_ONLY and tu.validate_for_trt_model(
-                input_dtype, output0_dtype, output1_dtype, (input_size, 1, 1),
-            (input_size, 1, 1), (input_size, 1, 1)):
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            (input_size, 1, 1),
+            (input_size, 1, 1),
+            (input_size, 1, 1),
+        ):
             for prefix in ensemble_prefix:
-                if 'plan' in BACKENDS:
+                if "plan" in BACKENDS:
                     if input_dtype == np.int8:
-                        _infer_exact_helper(self,
-                                            prefix + 'plan', (input_size, 1, 1),
-                                            8,
-                                            input_dtype,
-                                            output0_dtype,
-                                            output1_dtype,
-                                            output0_raw=output0_raw,
-                                            output1_raw=output1_raw,
-                                            swap=swap)
+                        _infer_exact_helper(
+                            self,
+                            prefix + "plan",
+                            (input_size, 1, 1),
+                            8,
+                            input_dtype,
+                            output0_dtype,
+                            output1_dtype,
+                            output0_raw=output0_raw,
+                            output1_raw=output1_raw,
+                            swap=swap,
+                        )
                     else:
-                        _infer_exact_helper(self,
-                                            prefix + 'plan', (input_size,),
-                                            8,
-                                            input_dtype,
-                                            output0_dtype,
-                                            output1_dtype,
-                                            output0_raw=output0_raw,
-                                            output1_raw=output1_raw,
-                                            swap=swap)
-
-        if tu.validate_for_onnx_model(input_dtype, output0_dtype, output1_dtype,
-                                      (input_size,), (input_size,),
-                                      (input_size,)):
+                        _infer_exact_helper(
+                            self,
+                            prefix + "plan",
+                            (input_size,),
+                            8,
+                            input_dtype,
+                            output0_dtype,
+                            output1_dtype,
+                            output0_raw=output0_raw,
+                            output1_raw=output1_raw,
+                            swap=swap,
+                        )
+
+        if tu.validate_for_onnx_model(
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            (input_size,),
+            (input_size,),
+            (input_size,),
+        ):
             for prefix in ensemble_prefix:
-                if 'onnx' in BACKENDS:
-                    _infer_exact_helper(self,
-                                        prefix + 'onnx', (input_size,),
-                                        8,
-                                        input_dtype,
-                                        output0_dtype,
-                                        output1_dtype,
-                                        output0_raw=output0_raw,
-                                        output1_raw=output1_raw,
-                                        swap=swap)
-
-        if tu.validate_for_libtorch_model(input_dtype, output0_dtype,
-                                          output1_dtype, (input_size,),
-                                          (input_size,), (input_size,)):
+                if "onnx" in BACKENDS:
+                    _infer_exact_helper(
+                        self,
+                        prefix + "onnx",
+                        (input_size,),
+                        8,
+                        input_dtype,
+                        output0_dtype,
+                        output1_dtype,
+                        output0_raw=output0_raw,
+                        output1_raw=output1_raw,
+                        swap=swap,
+                    )
+
+        if tu.validate_for_libtorch_model(
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            (input_size,),
+            (input_size,),
+            (input_size,),
+        ):
             # Due to PyTorch bug
             # https://github.com/pytorch/pytorch/issues/66930 we can't
             # run this test with int8 input and int32 outputs.
-            if ((input_dtype == np.int8) and (output0_dtype == np.int32) and
-                (output1_dtype == np.int32)):
-                print('skipping pytorch test for int8_int32_int32')
+            if (
+                (input_dtype == np.int8)
+                and (output0_dtype == np.int32)
+                and (output1_dtype == np.int32)
+            ):
+                print("skipping pytorch test for int8_int32_int32")
             else:
                 for prefix in ensemble_prefix:
-                    if 'libtorch' in BACKENDS:
+                    if "libtorch" in BACKENDS:
                         # Skip batching for PyTorch String I/O
-                        if ((input_dtype == np_dtype_string) or
-                            (output0_dtype == np_dtype_string) or
-                            (output1_dtype == np_dtype_string)):
+                        if (
+                            (input_dtype == np_dtype_string)
+                            or (output0_dtype == np_dtype_string)
+                            or (output1_dtype == np_dtype_string)
+                        ):
                             iu.infer_exact(
                                 self,
-                                prefix + 'libtorch_nobatch',
+                                prefix + "libtorch_nobatch",
                                 (input_size,),
                                 1,  # batch_size
                                 input_dtype,
@@ -239,239 +285,259 @@ def _infer_exact_helper(tester,
                                 swap=swap,
                                 use_http=USE_HTTP,
                                 use_grpc=USE_GRPC,
-                                use_system_shared_memory=
-                                TEST_SYSTEM_SHARED_MEMORY,
-                                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                            )
                         else:
-                            _infer_exact_helper(self,
-                                                prefix + 'libtorch',
-                                                (input_size,),
-                                                8,
-                                                input_dtype,
-                                                output0_dtype,
-                                                output1_dtype,
-                                                output0_raw=output0_raw,
-                                                output1_raw=output1_raw,
-                                                swap=swap)
+                            _infer_exact_helper(
+                                self,
+                                prefix + "libtorch",
+                                (input_size,),
+                                8,
+                                input_dtype,
+                                output0_dtype,
+                                output1_dtype,
+                                output0_raw=output0_raw,
+                                output1_raw=output1_raw,
+                                swap=swap,
+                            )
 
         for prefix in ensemble_prefix:
             if prefix != "":
                 continue
-            if input_dtype == np.uint8 or output0_dtype == np.uint8 or output1_dtype == np.uint8:
+            if (
+                input_dtype == np.uint8
+                or output0_dtype == np.uint8
+                or output1_dtype == np.uint8
+            ):
                 continue
 
-            if 'python_dlpack' in BACKENDS:
-                _infer_exact_helper(self,
-                                    prefix + 'python_dlpack', (input_size,),
-                                    8,
-                                    input_dtype,
-                                    output0_dtype,
-                                    output1_dtype,
-                                    output0_raw=output0_raw,
-                                    output1_raw=output1_raw,
-                                    swap=swap)
-            elif 'python' in BACKENDS:
-                _infer_exact_helper(self,
-                                    prefix + 'python', (input_size,),
-                                    8,
-                                    input_dtype,
-                                    output0_dtype,
-                                    output1_dtype,
-                                    output0_raw=output0_raw,
-                                    output1_raw=output1_raw,
-                                    swap=swap)
+            if "python_dlpack" in BACKENDS:
+                _infer_exact_helper(
+                    self,
+                    prefix + "python_dlpack",
+                    (input_size,),
+                    8,
+                    input_dtype,
+                    output0_dtype,
+                    output1_dtype,
+                    output0_raw=output0_raw,
+                    output1_raw=output1_raw,
+                    swap=swap,
+                )
+            elif "python" in BACKENDS:
+                _infer_exact_helper(
+                    self,
+                    prefix + "python",
+                    (input_size,),
+                    8,
+                    input_dtype,
+                    output0_dtype,
+                    output1_dtype,
+                    output0_raw=output0_raw,
+                    output1_raw=output1_raw,
+                    swap=swap,
+                )
 
     def test_raw_uuu(self):
-        self._full_exact(np.uint8,
-                         np.uint8,
-                         np.uint8,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=True)
+        self._full_exact(
+            np.uint8, np.uint8, np.uint8, output0_raw=True, output1_raw=True, swap=True
+        )
 
     def test_raw_bbb(self):
-        self._full_exact(np.int8,
-                         np.int8,
-                         np.int8,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=True)
+        self._full_exact(
+            np.int8, np.int8, np.int8, output0_raw=True, output1_raw=True, swap=True
+        )
 
     def test_raw_sss(self):
-        self._full_exact(np.int16,
-                         np.int16,
-                         np.int16,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=True)
+        self._full_exact(
+            np.int16, np.int16, np.int16, output0_raw=True, output1_raw=True, swap=True
+        )
 
     def test_raw_iii(self):
-        self._full_exact(np.int32,
-                         np.int32,
-                         np.int32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=True)
+        self._full_exact(
+            np.int32, np.int32, np.int32, output0_raw=True, output1_raw=True, swap=True
+        )
 
     def test_raw_lll(self):
-        self._full_exact(np.int64,
-                         np.int64,
-                         np.int64,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int64, np.int64, np.int64, output0_raw=True, output1_raw=True, swap=False
+        )
 
     def test_raw_hhh(self):
-        self._full_exact(np.float16,
-                         np.float16,
-                         np.float16,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.float16,
+            np.float16,
+            np.float16,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_fff(self):
-        self._full_exact(np.float32,
-                         np.float32,
-                         np.float32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=True)
+        self._full_exact(
+            np.float32,
+            np.float32,
+            np.float32,
+            output0_raw=True,
+            output1_raw=True,
+            swap=True,
+        )
 
     def test_raw_hff(self):
-        self._full_exact(np.float16,
-                         np.float32,
-                         np.float32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.float16,
+            np.float32,
+            np.float32,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_bii(self):
-        self._full_exact(np.int8,
-                         np.int32,
-                         np.int32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int8, np.int32, np.int32, output0_raw=True, output1_raw=True, swap=False
+        )
 
     def test_raw_ibb(self):
-        self._full_exact(np.int32,
-                         np.int8,
-                         np.int8,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int32, np.int8, np.int8, output0_raw=True, output1_raw=True, swap=False
+        )
 
     def test_raw_ibs(self):
-        self._full_exact(np.int32,
-                         np.int8,
-                         np.int16,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int32, np.int8, np.int16, output0_raw=True, output1_raw=True, swap=False
+        )
 
     def test_raw_fuu(self):
-        self._full_exact(np.float32,
-                         np.uint8,
-                         np.uint8,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.float32,
+            np.uint8,
+            np.uint8,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_uff(self):
-        self._full_exact(np.uint8,
-                         np.float32,
-                         np.float32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.uint8,
+            np.float32,
+            np.float32,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_fuh(self):
-        self._full_exact(np.float32,
-                         np.uint8,
-                         np.float16,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.float32,
+            np.uint8,
+            np.float16,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_iff(self):
-        self._full_exact(np.int32,
-                         np.float32,
-                         np.float32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int32,
+            np.float32,
+            np.float32,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_fii(self):
-        self._full_exact(np.float32,
-                         np.int32,
-                         np.int32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.float32,
+            np.int32,
+            np.int32,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_ihs(self):
-        self._full_exact(np.int32,
-                         np.float16,
-                         np.int16,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int32,
+            np.float16,
+            np.int16,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_ooo(self):
-        self._full_exact(np_dtype_string,
-                         np_dtype_string,
-                         np_dtype_string,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np_dtype_string,
+            np_dtype_string,
+            np_dtype_string,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_oii(self):
-        self._full_exact(np_dtype_string,
-                         np.int32,
-                         np.int32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np_dtype_string,
+            np.int32,
+            np.int32,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_oio(self):
-        self._full_exact(np_dtype_string,
-                         np.int32,
-                         np_dtype_string,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np_dtype_string,
+            np.int32,
+            np_dtype_string,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_ooi(self):
-        self._full_exact(np_dtype_string,
-                         np_dtype_string,
-                         np.int32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np_dtype_string,
+            np_dtype_string,
+            np.int32,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_ioo(self):
-        self._full_exact(np.int32,
-                         np_dtype_string,
-                         np_dtype_string,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int32,
+            np_dtype_string,
+            np_dtype_string,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_iio(self):
-        self._full_exact(np.int32,
-                         np.int32,
-                         np_dtype_string,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int32,
+            np.int32,
+            np_dtype_string,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     def test_raw_ioi(self):
-        self._full_exact(np.int32,
-                         np_dtype_string,
-                         np.int32,
-                         output0_raw=True,
-                         output1_raw=True,
-                         swap=False)
+        self._full_exact(
+            np.int32,
+            np_dtype_string,
+            np.int32,
+            output0_raw=True,
+            output1_raw=True,
+            swap=False,
+        )
 
     # shared memory does not support class output
     if not (TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY):
@@ -486,95 +552,118 @@ def test_class_bbb(self):
                 swap=True,
                 # Increase network_timeout for TensorFlow models for
                 # valgrind test.
-                network_timeout=100.0 if TEST_VALGRIND else 60.0)
+                network_timeout=100.0 if TEST_VALGRIND else 60.0,
+            )
 
         def test_class_sss(self):
-            self._full_exact(np.int16,
-                             np.int16,
-                             np.int16,
-                             output0_raw=False,
-                             output1_raw=False,
-                             swap=True)
+            self._full_exact(
+                np.int16,
+                np.int16,
+                np.int16,
+                output0_raw=False,
+                output1_raw=False,
+                swap=True,
+            )
 
         def test_class_iii(self):
-            self._full_exact(np.int32,
-                             np.int32,
-                             np.int32,
-                             output0_raw=False,
-                             output1_raw=False,
-                             swap=True)
+            self._full_exact(
+                np.int32,
+                np.int32,
+                np.int32,
+                output0_raw=False,
+                output1_raw=False,
+                swap=True,
+            )
 
         def test_class_lll(self):
-            self._full_exact(np.int64,
-                             np.int64,
-                             np.int64,
-                             output0_raw=False,
-                             output1_raw=False,
-                             swap=False)
+            self._full_exact(
+                np.int64,
+                np.int64,
+                np.int64,
+                output0_raw=False,
+                output1_raw=False,
+                swap=False,
+            )
 
         def test_class_fff(self):
-            self._full_exact(np.float32,
-                             np.float32,
-                             np.float32,
-                             output0_raw=False,
-                             output1_raw=False,
-                             swap=True)
+            self._full_exact(
+                np.float32,
+                np.float32,
+                np.float32,
+                output0_raw=False,
+                output1_raw=False,
+                swap=True,
+            )
 
         def test_class_iff(self):
-            self._full_exact(np.int32,
-                             np.float32,
-                             np.float32,
-                             output0_raw=False,
-                             output1_raw=False,
-                             swap=False)
+            self._full_exact(
+                np.int32,
+                np.float32,
+                np.float32,
+                output0_raw=False,
+                output1_raw=False,
+                swap=False,
+            )
 
         def test_mix_bbb(self):
-            self._full_exact(np.int8,
-                             np.int8,
-                             np.int8,
-                             output0_raw=True,
-                             output1_raw=False,
-                             swap=True)
+            self._full_exact(
+                np.int8,
+                np.int8,
+                np.int8,
+                output0_raw=True,
+                output1_raw=False,
+                swap=True,
+            )
 
         def test_mix_sss(self):
-            self._full_exact(np.int16,
-                             np.int16,
-                             np.int16,
-                             output0_raw=False,
-                             output1_raw=True,
-                             swap=True)
+            self._full_exact(
+                np.int16,
+                np.int16,
+                np.int16,
+                output0_raw=False,
+                output1_raw=True,
+                swap=True,
+            )
 
         def test_mix_iii(self):
-            self._full_exact(np.int32,
-                             np.int32,
-                             np.int32,
-                             output0_raw=True,
-                             output1_raw=False,
-                             swap=True)
+            self._full_exact(
+                np.int32,
+                np.int32,
+                np.int32,
+                output0_raw=True,
+                output1_raw=False,
+                swap=True,
+            )
 
         def test_mix_lll(self):
-            self._full_exact(np.int64,
-                             np.int64,
-                             np.int64,
-                             output0_raw=False,
-                             output1_raw=True,
-                             swap=False)
+            self._full_exact(
+                np.int64,
+                np.int64,
+                np.int64,
+                output0_raw=False,
+                output1_raw=True,
+                swap=False,
+            )
 
         def test_mix_fff(self):
-            self._full_exact(np.float32,
-                             np.float32,
-                             np.float32,
-                             output0_raw=True,
-                             output1_raw=False,
-                             swap=True)
+            self._full_exact(
+                np.float32,
+                np.float32,
+                np.float32,
+                output0_raw=True,
+                output1_raw=False,
+                swap=True,
+            )
 
         def test_mix_iff(self):
-            self._full_exact(np.int32,
-                             np.float32,
-                             np.float32,
-                             output0_raw=False,
-                             output1_raw=True,
-                             swap=False)
+            self._full_exact(
+                np.int32,
+                np.float32,
+                np.float32,
+                output0_raw=False,
+                output1_raw=True,
+                swap=False,
+            )
 
     def test_raw_version_latest_1(self):
         input_size = 16
@@ -582,7 +671,7 @@ def test_raw_version_latest_1(self):
 
         # There are 3 versions of graphdef_int8_int8_int8 but
         # only version 3 should be available
-        for platform in ('graphdef', 'savedmodel'):
+        for platform in ("graphdef", "savedmodel"):
             if platform not in BACKENDS:
                 continue
             try:
@@ -599,10 +688,10 @@ def test_raw_version_latest_1(self):
                     use_http=USE_HTTP,
                     use_grpc=USE_GRPC,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             except InferenceServerException as ex:
-                self.assertTrue(
-                    ex.message().startswith("Request for unknown model"))
+                self.assertTrue(ex.message().startswith("Request for unknown model"))
 
             try:
                 iu.infer_exact(
@@ -618,24 +707,26 @@ def test_raw_version_latest_1(self):
                     use_http=USE_HTTP,
                     use_grpc=USE_GRPC,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             except InferenceServerException as ex:
-                self.assertTrue(
-                    ex.message().startswith("Request for unknown model"))
-
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.int8,
-                           np.int8,
-                           np.int8,
-                           model_version=3,
-                           swap=True,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                self.assertTrue(ex.message().startswith("Request for unknown model"))
+
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.int8,
+                np.int8,
+                np.int8,
+                model_version=3,
+                swap=True,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
 
     def test_raw_version_latest_2(self):
         input_size = 16
@@ -643,7 +734,7 @@ def test_raw_version_latest_2(self):
 
         # There are 3 versions of graphdef_int16_int16_int16 but only
         # versions 2 and 3 should be available
-        for platform in ('graphdef', 'savedmodel'):
+        for platform in ("graphdef", "savedmodel"):
             if platform not in BACKENDS:
                 continue
             try:
@@ -660,37 +751,41 @@ def test_raw_version_latest_2(self):
                     use_http=USE_HTTP,
                     use_grpc=USE_GRPC,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             except InferenceServerException as ex:
-                self.assertTrue(
-                    ex.message().startswith("Request for unknown model"))
-
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.int16,
-                           np.int16,
-                           np.int16,
-                           model_version=2,
-                           swap=True,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.int16,
-                           np.int16,
-                           np.int16,
-                           model_version=3,
-                           swap=True,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                self.assertTrue(ex.message().startswith("Request for unknown model"))
+
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.int16,
+                np.int16,
+                np.int16,
+                model_version=2,
+                swap=True,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.int16,
+                np.int16,
+                np.int16,
+                model_version=3,
+                swap=True,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
 
     def test_raw_version_all(self):
         input_size = 16
@@ -698,48 +793,54 @@ def test_raw_version_all(self):
 
         # There are 3 versions of *_int32_int32_int32 and all should
         # be available.
-        for platform in ('graphdef', 'savedmodel'):
+        for platform in ("graphdef", "savedmodel"):
             if platform not in BACKENDS:
                 continue
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.int32,
-                           np.int32,
-                           np.int32,
-                           model_version=1,
-                           swap=False,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.int32,
-                           np.int32,
-                           np.int32,
-                           model_version=2,
-                           swap=True,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.int32,
-                           np.int32,
-                           np.int32,
-                           model_version=3,
-                           swap=True,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.int32,
+                np.int32,
+                np.int32,
+                model_version=1,
+                swap=False,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.int32,
+                np.int32,
+                np.int32,
+                model_version=2,
+                swap=True,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.int32,
+                np.int32,
+                np.int32,
+                model_version=3,
+                swap=True,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
 
     def test_raw_version_specific_1(self):
         input_size = 16
@@ -747,22 +848,24 @@ def test_raw_version_specific_1(self):
 
         # There are 3 versions of *_float16_float16_float16 but only
         # version 1 should be available.
-        for platform in ('graphdef', 'savedmodel'):
+        for platform in ("graphdef", "savedmodel"):
             if platform not in BACKENDS:
                 continue
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.float16,
-                           np.float16,
-                           np.float16,
-                           model_version=1,
-                           swap=False,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.float16,
+                np.float16,
+                np.float16,
+                model_version=1,
+                swap=False,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
 
             try:
                 iu.infer_exact(
@@ -778,10 +881,10 @@ def test_raw_version_specific_1(self):
                     use_http=USE_HTTP,
                     use_grpc=USE_GRPC,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             except InferenceServerException as ex:
-                self.assertTrue(
-                    ex.message().startswith("Request for unknown model"))
+                self.assertTrue(ex.message().startswith("Request for unknown model"))
 
             try:
                 iu.infer_exact(
@@ -797,35 +900,37 @@ def test_raw_version_specific_1(self):
                     use_http=USE_HTTP,
                     use_grpc=USE_GRPC,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             except InferenceServerException as ex:
-                self.assertTrue(
-                    ex.message().startswith("Request for unknown model"))
+                self.assertTrue(ex.message().startswith("Request for unknown model"))
 
     def test_raw_version_specific_1_3(self):
         input_size = 16
 
         # There are 3 versions of *_float32_float32_float32 but only
         # versions 1 and 3 should be available.
-        for platform in ('graphdef', 'savedmodel', 'plan'):
-            if platform == 'plan' and CPU_ONLY:
+        for platform in ("graphdef", "savedmodel", "plan"):
+            if platform == "plan" and CPU_ONLY:
                 continue
             if platform not in BACKENDS:
                 continue
             tensor_shape = (1, input_size)
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           model_version=1,
-                           swap=False,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                model_version=1,
+                swap=False,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
 
             try:
                 iu.infer_exact(
@@ -841,27 +946,29 @@ def test_raw_version_specific_1_3(self):
                     use_http=USE_HTTP,
                     use_grpc=USE_GRPC,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             except InferenceServerException as ex:
-                self.assertTrue(
-                    ex.message().startswith("Request for unknown model"))
-
-            iu.infer_exact(self,
-                           platform,
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           model_version=3,
-                           swap=True,
-                           use_http=USE_HTTP,
-                           use_grpc=USE_GRPC,
-                           use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                           use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                self.assertTrue(ex.message().startswith("Request for unknown model"))
+
+            iu.infer_exact(
+                self,
+                platform,
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                model_version=3,
+                swap=True,
+                use_http=USE_HTTP,
+                use_grpc=USE_GRPC,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
 
     if ENSEMBLES:
-        if all(x in BACKENDS for x in ['graphdef', 'savedmodel']):
+        if all(x in BACKENDS for x in ["graphdef", "savedmodel"]):
 
             def test_ensemble_mix_platform(self):
                 # Skip on CPU only machine as TensorRT model is used in this ensemble
@@ -870,7 +977,8 @@ def test_ensemble_mix_platform(self):
                 for bs in (1, 8):
                     iu.infer_exact(
                         self,
-                        "mix_platform", (bs, 16),
+                        "mix_platform",
+                        (bs, 16),
                         bs,
                         np.float32,
                         np.float32,
@@ -878,7 +986,8 @@ def test_ensemble_mix_platform(self):
                         use_http=USE_HTTP,
                         use_grpc=USE_GRPC,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
         if "graphdef" in BACKENDS:
 
@@ -886,7 +995,8 @@ def test_ensemble_mix_type(self):
                 for bs in (1, 8):
                     iu.infer_exact(
                         self,
-                        "mix_type", (bs, 16),
+                        "mix_type",
+                        (bs, 16),
                         bs,
                         np.int32,
                         np.float32,
@@ -894,15 +1004,17 @@ def test_ensemble_mix_type(self):
                         use_http=USE_HTTP,
                         use_grpc=USE_GRPC,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
-        if all(x in BACKENDS for x in ['graphdef', 'savedmodel']):
+        if all(x in BACKENDS for x in ["graphdef", "savedmodel"]):
 
             def test_ensemble_mix_ensemble(self):
                 for bs in (1, 8):
                     iu.infer_exact(
                         self,
-                        "mix_ensemble", (bs, 16),
+                        "mix_ensemble",
+                        (bs, 16),
                         bs,
                         np.int32,
                         np.float32,
@@ -910,11 +1022,15 @@ def test_ensemble_mix_ensemble(self):
                         use_http=USE_HTTP,
                         use_grpc=USE_GRPC,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
-        if all(x in BACKENDS for x in [
-                'graphdef',
-        ]):
+        if all(
+            x in BACKENDS
+            for x in [
+                "graphdef",
+            ]
+        ):
 
             def test_ensemble_mix_batch_nobatch(self):
                 base_names = ["batch_to_nobatch", "nobatch_to_batch"]
@@ -922,7 +1038,8 @@ def test_ensemble_mix_batch_nobatch(self):
                     for bs in (1, 8):
                         iu.infer_exact(
                             self,
-                            name, (bs, 16),
+                            name,
+                            (bs, 16),
                             bs,
                             np.float32,
                             np.float32,
@@ -930,10 +1047,12 @@ def test_ensemble_mix_batch_nobatch(self):
                             use_http=USE_HTTP,
                             use_grpc=USE_GRPC,
                             use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                        )
                     iu.infer_exact(
                         self,
-                        name + "_nobatch", (8, 16),
+                        name + "_nobatch",
+                        (8, 16),
                         1,
                         np.float32,
                         np.float32,
@@ -941,13 +1060,15 @@ def test_ensemble_mix_batch_nobatch(self):
                         use_http=USE_HTTP,
                         use_grpc=USE_GRPC,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
                 # batch -> nobatch -> batch
                 for bs in (1, 8):
                     iu.infer_exact(
                         self,
-                        "mix_nobatch_batch", (bs, 16),
+                        "mix_nobatch_batch",
+                        (bs, 16),
                         bs,
                         np.float32,
                         np.float32,
@@ -955,17 +1076,19 @@ def test_ensemble_mix_batch_nobatch(self):
                         use_http=USE_HTTP,
                         use_grpc=USE_GRPC,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
         if not (TEST_SYSTEM_SHARED_MEMORY or TEST_CUDA_SHARED_MEMORY):
 
             def test_ensemble_label_lookup(self):
-                if all(x in BACKENDS for x in ['graphdef', 'savedmodel']):
+                if all(x in BACKENDS for x in ["graphdef", "savedmodel"]):
                     # Ensemble needs to look up the label from the actual model
                     for bs in (1, 8):
                         iu.infer_exact(
                             self,
-                            "mix_platform", (bs, 16),
+                            "mix_platform",
+                            (bs, 16),
                             bs,
                             np.float32,
                             np.float32,
@@ -975,14 +1098,16 @@ def test_ensemble_label_lookup(self):
                             use_http=USE_HTTP,
                             use_grpc=USE_GRPC,
                             use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                        )
 
-                if all(x in BACKENDS for x in ['graphdef', 'savedmodel']):
+                if all(x in BACKENDS for x in ["graphdef", "savedmodel"]):
                     # Label from the actual model will be passed along through the nested ensemble
                     for bs in (1, 8):
                         iu.infer_exact(
                             self,
-                            "mix_ensemble", (bs, 16),
+                            "mix_ensemble",
+                            (bs, 16),
                             bs,
                             np.int32,
                             np.float32,
@@ -992,14 +1117,16 @@ def test_ensemble_label_lookup(self):
                             use_http=USE_HTTP,
                             use_grpc=USE_GRPC,
                             use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                        )
 
                 if "graphdef" in BACKENDS:
                     # If a label file is provided, it will be used directly
                     try:
                         iu.infer_exact(
                             self,
-                            "wrong_label", (1, 16),
+                            "wrong_label",
+                            (1, 16),
                             1,
                             np.int32,
                             np.float32,
@@ -1009,7 +1136,8 @@ def test_ensemble_label_lookup(self):
                             use_http=USE_HTTP,
                             use_grpc=USE_GRPC,
                             use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                        )
                     except AssertionError:
                         # Sanity check that infer_exact failed since this ensemble is provided
                         # with unexpected labels
@@ -1019,7 +1147,8 @@ def test_ensemble_label_lookup(self):
                     for bs in (1, 8):
                         iu.infer_exact(
                             self,
-                            "label_override", (bs, 16),
+                            "label_override",
+                            (bs, 16),
                             bs,
                             np.int32,
                             np.float32,
@@ -1029,8 +1158,9 @@ def test_ensemble_label_lookup(self):
                             use_http=USE_HTTP,
                             use_grpc=USE_GRPC,
                             use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                            use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_infer/install_and_test.sh b/qa/L0_infer/install_and_test.sh
index f488f510f4..28e5dad52e 100755
--- a/qa/L0_infer/install_and_test.sh
+++ b/qa/L0_infer/install_and_test.sh
@@ -25,7 +25,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-# Note: This script is to be used with customized triton containers that need 
+# Note: This script is to be used with customized triton containers that need
 # dependencies to run L0_infer tests
 apt-get update && \
     apt-get install -y --no-install-recommends \
diff --git a/qa/L0_infer_reshape/infer_reshape_test.py b/qa/L0_infer_reshape/infer_reshape_test.py
old mode 100644
new mode 100755
index 0c3117131e..e77dcbecaf
--- a/qa/L0_infer_reshape/infer_reshape_test.py
+++ b/qa/L0_infer_reshape/infer_reshape_test.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -28,119 +30,139 @@
 
 sys.path.append("../common")
 
+import os
 import unittest
-import numpy as np
+
 import infer_util as iu
+import numpy as np
 import test_util as tu
-import os
 
 np_dtype_string = np.dtype(object)
 
-TEST_SYSTEM_SHARED_MEMORY = bool(
-    int(os.environ.get('TEST_SYSTEM_SHARED_MEMORY', 0)))
-TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get('TEST_CUDA_SHARED_MEMORY',
-                                                  0)))
+TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0)))
+TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0)))
 
 
 class InferReshapeTest(tu.TestResultCollector):
-
-    def _full_reshape(self,
-                      dtype,
-                      input_shapes,
-                      output_shapes=None,
-                      no_batch=True):
+    def _full_reshape(self, dtype, input_shapes, output_shapes=None, no_batch=True):
         # 'shapes' is list of shapes, one for each input.
         if output_shapes is None:
             output_shapes = input_shapes
 
         # For validation assume any shape can be used...
-        if tu.validate_for_tf_model(dtype, dtype, dtype, input_shapes[0],
-                                    input_shapes[0], input_shapes[0]):
+        if tu.validate_for_tf_model(
+            dtype, dtype, dtype, input_shapes[0], input_shapes[0], input_shapes[0]
+        ):
             # model that supports batching
             for bs in (1, 8):
-                full_shapes = [[
-                    bs,
-                ] + input_shape for input_shape in input_shapes]
-                full_output_shapes = [[
-                    bs,
-                ] + output_shape for output_shape in output_shapes]
+                full_shapes = [
+                    [
+                        bs,
+                    ]
+                    + input_shape
+                    for input_shape in input_shapes
+                ]
+                full_output_shapes = [
+                    [
+                        bs,
+                    ]
+                    + output_shape
+                    for output_shape in output_shapes
+                ]
                 iu.infer_zero(
                     self,
-                    'graphdef',
+                    "graphdef",
                     bs,
                     dtype,
                     full_shapes,
                     full_output_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
                 iu.infer_zero(
                     self,
-                    'savedmodel',
+                    "savedmodel",
                     bs,
                     dtype,
                     full_shapes,
                     full_output_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             # model that does not support batching
             if no_batch:
                 iu.infer_zero(
                     self,
-                    'graphdef_nobatch',
+                    "graphdef_nobatch",
                     1,
                     dtype,
                     input_shapes,
                     output_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
                 iu.infer_zero(
                     self,
-                    'savedmodel_nobatch',
+                    "savedmodel_nobatch",
                     1,
                     dtype,
                     input_shapes,
                     output_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
 
-        if tu.validate_for_onnx_model(dtype, dtype, dtype, input_shapes[0],
-                                      input_shapes[0], input_shapes[0]):
+        if tu.validate_for_onnx_model(
+            dtype, dtype, dtype, input_shapes[0], input_shapes[0], input_shapes[0]
+        ):
             # model that supports batching
             for bs in (1, 8):
-                full_shapes = [[
-                    bs,
-                ] + input_shape for input_shape in input_shapes]
-                full_output_shapes = [[
-                    bs,
-                ] + output_shape for output_shape in output_shapes]
+                full_shapes = [
+                    [
+                        bs,
+                    ]
+                    + input_shape
+                    for input_shape in input_shapes
+                ]
+                full_output_shapes = [
+                    [
+                        bs,
+                    ]
+                    + output_shape
+                    for output_shape in output_shapes
+                ]
                 iu.infer_zero(
                     self,
-                    'onnx',
+                    "onnx",
                     bs,
                     dtype,
                     full_shapes,
                     full_output_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             # model that does not support batching
             if no_batch:
                 iu.infer_zero(
                     self,
-                    'onnx_nobatch',
+                    "onnx_nobatch",
                     1,
                     dtype,
                     input_shapes,
                     output_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
 
-        if tu.validate_for_libtorch_model(dtype,
-                                          dtype,
-                                          dtype,
-                                          input_shapes[0],
-                                          input_shapes[0],
-                                          input_shapes[0],
-                                          reshape=True):
+        if tu.validate_for_libtorch_model(
+            dtype,
+            dtype,
+            dtype,
+            input_shapes[0],
+            input_shapes[0],
+            input_shapes[0],
+            reshape=True,
+        ):
             # Skip variable-size reshape on libtorch for now;
             # see "gen_qa_reshape_model.py" for details.
             if dtype != np.int32:
@@ -149,48 +171,72 @@ def _full_reshape(self,
                 if no_batch and (dtype != np_dtype_string):
                     iu.infer_zero(
                         self,
-                        'libtorch_nobatch',
+                        "libtorch_nobatch",
                         1,
                         dtype,
                         input_shapes,
                         output_shapes,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
                 # model that supports batching
                 for bs in (1, 8):
-                    full_shapes = [[
-                        bs,
-                    ] + input_shape for input_shape in input_shapes]
-                    full_output_shapes = [[
-                        bs,
-                    ] + output_shape for output_shape in output_shapes]
+                    full_shapes = [
+                        [
+                            bs,
+                        ]
+                        + input_shape
+                        for input_shape in input_shapes
+                    ]
+                    full_output_shapes = [
+                        [
+                            bs,
+                        ]
+                        + output_shape
+                        for output_shape in output_shapes
+                    ]
                     iu.infer_zero(
                         self,
-                        'libtorch',
+                        "libtorch",
                         bs,
                         dtype,
                         full_shapes,
                         full_output_shapes,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
         for name in ["simple_reshape", "sequence_reshape", "fan_reshape"]:
             # [TODO] Skip variable-size reshape on ensemble for now.
             # Needs rework on how ensembles for reshape are generated.
             if dtype == np.int32:
                 break
-            if tu.validate_for_ensemble_model(name, dtype, dtype, dtype,
-                                              input_shapes[0], input_shapes[0],
-                                              input_shapes[0]):
+            if tu.validate_for_ensemble_model(
+                name,
+                dtype,
+                dtype,
+                dtype,
+                input_shapes[0],
+                input_shapes[0],
+                input_shapes[0],
+            ):
                 # model that supports batching
                 for bs in (1, 8):
-                    full_shapes = [[
-                        bs,
-                    ] + input_shape for input_shape in input_shapes]
-                    full_output_shapes = [[
-                        bs,
-                    ] + output_shape for output_shape in output_shapes]
+                    full_shapes = [
+                        [
+                            bs,
+                        ]
+                        + input_shape
+                        for input_shape in input_shapes
+                    ]
+                    full_output_shapes = [
+                        [
+                            bs,
+                        ]
+                        + output_shape
+                        for output_shape in output_shapes
+                    ]
                     iu.infer_zero(
                         self,
                         name,
@@ -199,58 +245,67 @@ def _full_reshape(self,
                         full_shapes,
                         full_output_shapes,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
                 # model that does not support batching
                 if no_batch:
                     iu.infer_zero(
                         self,
-                        name + '_nobatch',
+                        name + "_nobatch",
                         1,
                         dtype,
                         input_shapes,
                         output_shapes,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
-    def _trt_reshape(self,
-                     dtype,
-                     input_shapes,
-                     output_shapes=None,
-                     no_batch=True):
+    def _trt_reshape(self, dtype, input_shapes, output_shapes=None, no_batch=True):
         # 'input_shapes' is a list of shapes, one for each input.
         if output_shapes is None:
             output_shapes = input_shapes
 
-        if tu.validate_for_trt_model(dtype, dtype, dtype, input_shapes[0],
-                                     input_shapes[0], input_shapes[0]):
+        if tu.validate_for_trt_model(
+            dtype, dtype, dtype, input_shapes[0], input_shapes[0], input_shapes[0]
+        ):
             # model that supports batching
             for bs in (1, 8):
-                full_shapes = [[
-                    bs,
-                ] + input_shape for input_shape in input_shapes]
-                full_output_shapes = [[
-                    bs,
-                ] + output_shape for output_shape in output_shapes]
+                full_shapes = [
+                    [
+                        bs,
+                    ]
+                    + input_shape
+                    for input_shape in input_shapes
+                ]
+                full_output_shapes = [
+                    [
+                        bs,
+                    ]
+                    + output_shape
+                    for output_shape in output_shapes
+                ]
                 iu.infer_zero(
                     self,
-                    'plan',
+                    "plan",
                     bs,
                     dtype,
                     full_shapes,
                     full_output_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             # model that does not support batching
             if no_batch:
                 iu.infer_zero(
                     self,
-                    'plan_nobatch',
+                    "plan_nobatch",
                     1,
                     dtype,
                     input_shapes,
                     output_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
 
     def test_ff1(self):
         self._full_reshape(np.float32, input_shapes=([1],), no_batch=False)
@@ -263,21 +318,24 @@ def test_ff3(self):
         self._full_reshape(np.float32, input_shapes=([4, 4], [2], [2, 2, 3]))
 
     def test_ff4(self):
-        self._full_reshape(np.float32,
-                           input_shapes=([4, 4], [2], [2, 2, 3], [1]),
-                           output_shapes=([16], [1, 2], [3, 2, 2], [1]))
-        self._trt_reshape(np.float32,
-                          input_shapes=([4, 4], [2], [2, 2, 3], [1]),
-                          output_shapes=([2, 2, 4], [1, 2, 1], [3, 2,
-                                                                2], [1, 1, 1]))
+        self._full_reshape(
+            np.float32,
+            input_shapes=([4, 4], [2], [2, 2, 3], [1]),
+            output_shapes=([16], [1, 2], [3, 2, 2], [1]),
+        )
+        self._trt_reshape(
+            np.float32,
+            input_shapes=([4, 4], [2], [2, 2, 3], [1]),
+            output_shapes=([2, 2, 4], [1, 2, 1], [3, 2, 2], [1, 1, 1]),
+        )
 
     def test_ii1(self):
         self._full_reshape(np.int32, input_shapes=([2, 4, 5, 6],))
 
     def test_ii2(self):
-        self._full_reshape(np.int32,
-                           input_shapes=([4, 1], [2]),
-                           output_shapes=([1, 4], [1, 2]))
+        self._full_reshape(
+            np.int32, input_shapes=([4, 1], [2]), output_shapes=([1, 4], [1, 2])
+        )
 
     def test_ii3(self):
         self._full_reshape(np.int32, input_shapes=([1, 4, 1], [8], [2, 2, 3]))
@@ -286,5 +344,5 @@ def test_oo1(self):
         self._full_reshape(np.object_, input_shapes=([1],), no_batch=False)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
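
Throughout infer_reshape_test.py the batched cases build their request shapes by prepending the batch size to every per-input shape; the reformatted list comprehensions above are just a multi-line spelling of that. A minimal standalone sketch of the pattern (shape values chosen for illustration):

    # Prepend a batch dimension to each per-input shape, as the comprehensions
    # above do with `[bs,] + input_shape`.
    input_shapes = [[4, 4], [2], [2, 2, 3]]

    for bs in (1, 8):
        full_shapes = [[bs] + shape for shape in input_shapes]
        print(bs, full_shapes)
    # bs=1 -> [[1, 4, 4], [1, 2], [1, 2, 2, 3]]
    # bs=8 -> [[8, 4, 4], [8, 2], [8, 2, 2, 3]]
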
diff --git a/qa/L0_infer_variable/infer_variable_test.py b/qa/L0_infer_variable/infer_variable_test.py
old mode 100644
new mode 100755
index 3769e30d4e..e5e6470a3c
--- a/qa/L0_infer_variable/infer_variable_test.py
+++ b/qa/L0_infer_variable/infer_variable_test.py
@@ -1,4 +1,6 @@
-# Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -30,48 +32,49 @@
 
 import os
 import unittest
-import numpy as np
+
 import infer_util as iu
+import numpy as np
 import test_util as tu
 
 np_dtype_string = np.dtype(object)
 
-TEST_SYSTEM_SHARED_MEMORY = bool(
-    int(os.environ.get('TEST_SYSTEM_SHARED_MEMORY', 0)))
-TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get('TEST_CUDA_SHARED_MEMORY',
-                                                  0)))
+TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0)))
+TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0)))
 
 
 class InferVariableTest(tu.TestResultCollector):
-
-    def _full_exact(self,
-                    input_dtype,
-                    output0_dtype,
-                    output1_dtype,
-                    input_shape,
-                    output0_shape,
-                    output1_shape,
-                    output0_raw=True,
-                    output1_raw=True,
-                    swap=False):
-
-        def _infer_exact_helper(tester,
-                                pf,
-                                tensor_shape,
-                                batch_size,
-                                input_dtype,
-                                output0_dtype,
-                                output1_dtype,
-                                output0_raw=True,
-                                output1_raw=True,
-                                model_version=None,
-                                swap=False,
-                                outputs=("OUTPUT0", "OUTPUT1"),
-                                use_http=True,
-                                use_grpc=True,
-                                skip_request_id_check=False,
-                                use_streaming=True,
-                                correlation_id=0):
+    def _full_exact(
+        self,
+        input_dtype,
+        output0_dtype,
+        output1_dtype,
+        input_shape,
+        output0_shape,
+        output1_shape,
+        output0_raw=True,
+        output1_raw=True,
+        swap=False,
+    ):
+        def _infer_exact_helper(
+            tester,
+            pf,
+            tensor_shape,
+            batch_size,
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            output0_raw=True,
+            output1_raw=True,
+            model_version=None,
+            swap=False,
+            outputs=("OUTPUT0", "OUTPUT1"),
+            use_http=True,
+            use_grpc=True,
+            skip_request_id_check=False,
+            use_streaming=True,
+            correlation_id=0,
+        ):
             for bs in (1, batch_size):
                 # model that does not support batching
                 if bs == 1:
@@ -94,15 +97,23 @@ def _infer_exact_helper(tester,
                         use_streaming=use_streaming,
                         correlation_id=correlation_id,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
                 # model that supports batching. Skip for libtorch string I/O
-                elif pf == 'libtorch' and tu.validate_for_libtorch_model(
-                        input_dtype, output0_dtype, output1_dtype, tensor_shape,
-                        tensor_shape, tensor_shape, bs):
+                elif pf == "libtorch" and tu.validate_for_libtorch_model(
+                    input_dtype,
+                    output0_dtype,
+                    output1_dtype,
+                    tensor_shape,
+                    tensor_shape,
+                    tensor_shape,
+                    bs,
+                ):
                     iu.infer_exact(
                         tester,
-                        pf, (bs,) + tensor_shape,
+                        pf,
+                        (bs,) + tensor_shape,
                         bs,
                         input_dtype,
                         output0_dtype,
@@ -118,91 +129,128 @@ def _infer_exact_helper(tester,
                         use_streaming=use_streaming,
                         correlation_id=correlation_id,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
 
         all_ensemble_prefix = ["simple_", "sequence_", "fan_"]
         ensemble_prefix = [""]
         for prefix in all_ensemble_prefix:
-            if tu.validate_for_ensemble_model(prefix, input_dtype,
-                                              output0_dtype, output1_dtype,
-                                              input_shape, input_shape,
-                                              input_shape):
+            if tu.validate_for_ensemble_model(
+                prefix,
+                input_dtype,
+                output0_dtype,
+                output1_dtype,
+                input_shape,
+                input_shape,
+                input_shape,
+            ):
                 ensemble_prefix.append(prefix)
 
-        if tu.validate_for_tf_model(input_dtype, output0_dtype, output1_dtype,
-                                    input_shape, output0_shape, output1_shape):
+        if tu.validate_for_tf_model(
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            input_shape,
+            output0_shape,
+            output1_shape,
+        ):
             for prefix in ensemble_prefix:
                 for pf in ["graphdef", "savedmodel"]:
-                    _infer_exact_helper(self,
-                                        prefix + pf,
-                                        input_shape,
-                                        8,
-                                        input_dtype,
-                                        output0_dtype,
-                                        output1_dtype,
-                                        output0_raw=output0_raw,
-                                        output1_raw=output1_raw,
-                                        swap=swap)
-
-        if tu.validate_for_trt_model(input_dtype, output0_dtype, output1_dtype,
-                                     input_shape, output0_shape, output1_shape):
+                    _infer_exact_helper(
+                        self,
+                        prefix + pf,
+                        input_shape,
+                        8,
+                        input_dtype,
+                        output0_dtype,
+                        output1_dtype,
+                        output0_raw=output0_raw,
+                        output1_raw=output1_raw,
+                        swap=swap,
+                    )
+
+        if tu.validate_for_trt_model(
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            input_shape,
+            output0_shape,
+            output1_shape,
+        ):
             for prefix in ensemble_prefix:
                 if input_dtype == np.int8:
-                    _infer_exact_helper(self,
-                                        prefix + 'plan',
-                                        input_shape + (1, 1),
-                                        8,
-                                        input_dtype,
-                                        output0_dtype,
-                                        output1_dtype,
-                                        output0_raw=output0_raw,
-                                        output1_raw=output1_raw,
-                                        swap=swap)
+                    _infer_exact_helper(
+                        self,
+                        prefix + "plan",
+                        input_shape + (1, 1),
+                        8,
+                        input_dtype,
+                        output0_dtype,
+                        output1_dtype,
+                        output0_raw=output0_raw,
+                        output1_raw=output1_raw,
+                        swap=swap,
+                    )
                 else:
-                    _infer_exact_helper(self,
-                                        prefix + 'plan',
-                                        input_shape,
-                                        8,
-                                        input_dtype,
-                                        output0_dtype,
-                                        output1_dtype,
-                                        output0_raw=output0_raw,
-                                        output1_raw=output1_raw,
-                                        swap=swap)
-
-        if tu.validate_for_onnx_model(input_dtype, output0_dtype, output1_dtype,
-                                      input_shape, output0_shape,
-                                      output1_shape):
+                    _infer_exact_helper(
+                        self,
+                        prefix + "plan",
+                        input_shape,
+                        8,
+                        input_dtype,
+                        output0_dtype,
+                        output1_dtype,
+                        output0_raw=output0_raw,
+                        output1_raw=output1_raw,
+                        swap=swap,
+                    )
+
+        if tu.validate_for_onnx_model(
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            input_shape,
+            output0_shape,
+            output1_shape,
+        ):
             # No basic ensemble models are created against custom models [TODO]
-            _infer_exact_helper(self,
-                                'onnx',
-                                input_shape,
-                                8,
-                                input_dtype,
-                                output0_dtype,
-                                output1_dtype,
-                                output0_raw=output0_raw,
-                                output1_raw=output1_raw,
-                                swap=swap)
-
-        if tu.validate_for_libtorch_model(input_dtype, output0_dtype,
-                                          output1_dtype, input_shape,
-                                          output0_shape, output1_shape):
+            _infer_exact_helper(
+                self,
+                "onnx",
+                input_shape,
+                8,
+                input_dtype,
+                output0_dtype,
+                output1_dtype,
+                output0_raw=output0_raw,
+                output1_raw=output1_raw,
+                swap=swap,
+            )
+
+        if tu.validate_for_libtorch_model(
+            input_dtype,
+            output0_dtype,
+            output1_dtype,
+            input_shape,
+            output0_shape,
+            output1_shape,
+        ):
             # No basic ensemble models are created against custom models [TODO]
-            _infer_exact_helper(self,
-                                'libtorch',
-                                input_shape,
-                                8,
-                                input_dtype,
-                                output0_dtype,
-                                output1_dtype,
-                                output0_raw=output0_raw,
-                                output1_raw=output1_raw,
-                                swap=swap)
+            _infer_exact_helper(
+                self,
+                "libtorch",
+                input_shape,
+                8,
+                input_dtype,
+                output0_dtype,
+                output1_dtype,
+                output0_raw=output0_raw,
+                output1_raw=output1_raw,
+                swap=swap,
+            )
 
     def test_raw_fff(self):
-        self._full_exact(np.float32, np.float32, np.float32, (16,), (16,),
-                         (16,))
+        self._full_exact(np.float32, np.float32, np.float32, (16,), (16,), (16,))
 
     def test_raw_fii(self):
         self._full_exact(np.float32, np.int32, np.int32, (2, 8), (2, 8), (2, 8))
@@ -211,8 +259,9 @@ def test_raw_fll(self):
         self._full_exact(np.float32, np.int64, np.int64, (8, 4), (8, 4), (8, 4))
 
     def test_raw_fil(self):
-        self._full_exact(np.float32, np.int32, np.int64, (2, 8, 2), (2, 8, 2),
-                         (2, 8, 2))
+        self._full_exact(
+            np.float32, np.int32, np.int64, (2, 8, 2), (2, 8, 2), (2, 8, 2)
+        )
 
     def test_raw_ffi(self):
         self._full_exact(np.float32, np.float32, np.int32, (16,), (16,), (16,))
@@ -221,95 +270,148 @@ def test_raw_iii(self):
         self._full_exact(np.int32, np.int32, np.int32, (2, 8), (2, 8), (2, 8))
 
     def test_faw_iif(self):
-        self._full_exact(np.int32, np.int32, np.float32, (2, 8, 2), (2, 8, 2),
-                         (2, 8, 2))
+        self._full_exact(
+            np.int32, np.int32, np.float32, (2, 8, 2), (2, 8, 2), (2, 8, 2)
+        )
 
     def test_raw_ooo(self):
-        self._full_exact(np_dtype_string, np_dtype_string, np_dtype_string,
-                         (16,), (16,), (16,))
+        self._full_exact(
+            np_dtype_string, np_dtype_string, np_dtype_string, (16,), (16,), (16,)
+        )
 
     def test_raw_oii(self):
-        self._full_exact(np_dtype_string, np.int32, np.int32, (2, 8), (2, 8),
-                         (2, 8))
+        self._full_exact(np_dtype_string, np.int32, np.int32, (2, 8), (2, 8), (2, 8))
 
     def test_raw_ooi(self):
-        self._full_exact(np_dtype_string, np_dtype_string, np.int32, (8, 4),
-                         (8, 4), (8, 4))
+        self._full_exact(
+            np_dtype_string, np_dtype_string, np.int32, (8, 4), (8, 4), (8, 4)
+        )
 
     def test_raw_oio(self):
-        self._full_exact(np_dtype_string, np.int32, np_dtype_string, (2, 8, 2),
-                         (2, 8, 2), (2, 8, 2))
+        self._full_exact(
+            np_dtype_string, np.int32, np_dtype_string, (2, 8, 2), (2, 8, 2), (2, 8, 2)
+        )
 
     def test_class_fff(self):
-        self._full_exact(np.float32,
-                         np.float32,
-                         np.float32, (16,), (16,), (16,),
-                         output0_raw=False,
-                         output1_raw=False)
+        self._full_exact(
+            np.float32,
+            np.float32,
+            np.float32,
+            (16,),
+            (16,),
+            (16,),
+            output0_raw=False,
+            output1_raw=False,
+        )
 
     def test_class_fii(self):
-        self._full_exact(np.float32,
-                         np.int32,
-                         np.int32, (2, 8), (2, 8), (2, 8),
-                         output0_raw=False,
-                         output1_raw=False)
+        self._full_exact(
+            np.float32,
+            np.int32,
+            np.int32,
+            (2, 8),
+            (2, 8),
+            (2, 8),
+            output0_raw=False,
+            output1_raw=False,
+        )
 
     def test_class_fll(self):
-        self._full_exact(np.float32,
-                         np.int64,
-                         np.int64, (8, 4), (8, 4), (8, 4),
-                         output0_raw=False,
-                         output1_raw=False)
+        self._full_exact(
+            np.float32,
+            np.int64,
+            np.int64,
+            (8, 4),
+            (8, 4),
+            (8, 4),
+            output0_raw=False,
+            output1_raw=False,
+        )
 
     def test_class_fil(self):
-        self._full_exact(np.float32,
-                         np.int32,
-                         np.int64, (2, 8, 2), (2, 8, 2), (2, 8, 2),
-                         output0_raw=False,
-                         output1_raw=False)
+        self._full_exact(
+            np.float32,
+            np.int32,
+            np.int64,
+            (2, 8, 2),
+            (2, 8, 2),
+            (2, 8, 2),
+            output0_raw=False,
+            output1_raw=False,
+        )
 
     def test_class_ffi(self):
-        self._full_exact(np.float32,
-                         np.float32,
-                         np.int32, (16,), (16,), (16,),
-                         output0_raw=False,
-                         output1_raw=False)
+        self._full_exact(
+            np.float32,
+            np.float32,
+            np.int32,
+            (16,),
+            (16,),
+            (16,),
+            output0_raw=False,
+            output1_raw=False,
+        )
 
     def test_class_iii(self):
-        self._full_exact(np.int32,
-                         np.int32,
-                         np.int32, (2, 8), (2, 8), (2, 8),
-                         output0_raw=False,
-                         output1_raw=False)
+        self._full_exact(
+            np.int32,
+            np.int32,
+            np.int32,
+            (2, 8),
+            (2, 8),
+            (2, 8),
+            output0_raw=False,
+            output1_raw=False,
+        )
 
     def test_class_iif(self):
-        self._full_exact(np.int32,
-                         np.int32,
-                         np.float32, (2, 8, 2), (2, 8, 2), (2, 8, 2),
-                         output0_raw=False,
-                         output1_raw=False)
+        self._full_exact(
+            np.int32,
+            np.int32,
+            np.float32,
+            (2, 8, 2),
+            (2, 8, 2),
+            (2, 8, 2),
+            output0_raw=False,
+            output1_raw=False,
+        )
 
     def test_mix_ffi(self):
-        self._full_exact(np.float32,
-                         np.float32,
-                         np.int32, (16,), (16,), (16,),
-                         output0_raw=True,
-                         output1_raw=False)
+        self._full_exact(
+            np.float32,
+            np.float32,
+            np.int32,
+            (16,),
+            (16,),
+            (16,),
+            output0_raw=True,
+            output1_raw=False,
+        )
 
     def test_mix_iii(self):
-        self._full_exact(np.int32,
-                         np.int32,
-                         np.int32, (2, 8), (2, 8), (2, 8),
-                         output0_raw=False,
-                         output1_raw=True)
+        self._full_exact(
+            np.int32,
+            np.int32,
+            np.int32,
+            (2, 8),
+            (2, 8),
+            (2, 8),
+            output0_raw=False,
+            output1_raw=True,
+        )
 
     def test_mix_iif(self):
-        self._full_exact(np.int32,
-                         np.int32,
-                         np.float32, (2, 8, 2), (2, 8, 2), (2, 8, 2),
-                         output0_raw=True,
-                         output1_raw=False)
-
-
-if __name__ == '__main__':
+        self._full_exact(
+            np.int32,
+            np.int32,
+            np.float32,
+            (2, 8, 2),
+            (2, 8, 2),
+            (2, 8, 2),
+            output0_raw=True,
+            output1_raw=False,
+        )
+
+
+if __name__ == "__main__":
     unittest.main()
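
The shared-memory switches at the top of these tests all follow the same pattern: an environment variable holding "0" or "1" is converted to int and then to bool. A small sketch of how that conversion behaves, run outside the test harness (the variable name is reused only for illustration):

    import os

    # bool(int(...)) maps "0" -> False and "1" -> True; an unset variable falls
    # back to the default 0, and a non-numeric value raises ValueError.
    os.environ["TEST_SYSTEM_SHARED_MEMORY"] = "1"
    flag = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0)))
    print(flag)  # True

    os.environ.pop("TEST_SYSTEM_SHARED_MEMORY", None)
    flag = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0)))
    print(flag)  # False
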
diff --git a/qa/L0_infer_zero/infer_zero_test.py b/qa/L0_infer_zero/infer_zero_test.py
old mode 100644
new mode 100755
index de00635450..9e9b0f4625
--- a/qa/L0_infer_zero/infer_zero_test.py
+++ b/qa/L0_infer_zero/infer_zero_test.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -28,103 +30,125 @@
 
 sys.path.append("../common")
 
+import os
 import unittest
-import numpy as np
+
 import infer_util as iu
+import numpy as np
 import test_util as tu
-import os
 
 np_dtype_string = np.dtype(object)
 
-TEST_SYSTEM_SHARED_MEMORY = bool(
-    int(os.environ.get('TEST_SYSTEM_SHARED_MEMORY', 0)))
-TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get('TEST_CUDA_SHARED_MEMORY',
-                                                  0)))
+TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0)))
+TEST_CUDA_SHARED_MEMORY = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0)))
 
 
 class InferZeroTest(tu.TestResultCollector):
-
     def _full_zero(self, dtype, shapes):
         # 'shapes' is a list of shapes, one for each input.
 
         # For validation assume any shape can be used...
-        if tu.validate_for_tf_model(dtype, dtype, dtype, shapes[0], shapes[0],
-                                    shapes[0]):
+        if tu.validate_for_tf_model(
+            dtype, dtype, dtype, shapes[0], shapes[0], shapes[0]
+        ):
             # model that supports batching
             for bs in (1, 8):
-                batch_shapes = [[
-                    bs,
-                ] + shape for shape in shapes]
+                batch_shapes = [
+                    [
+                        bs,
+                    ]
+                    + shape
+                    for shape in shapes
+                ]
                 iu.infer_zero(
                     self,
-                    'graphdef',
+                    "graphdef",
                     bs,
                     dtype,
                     batch_shapes,
                     batch_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
                 iu.infer_zero(
                     self,
-                    'savedmodel',
+                    "savedmodel",
                     bs,
                     dtype,
                     batch_shapes,
                     batch_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             # model that does not support batching
-            iu.infer_zero(self,
-                          'graphdef_nobatch',
-                          1,
-                          dtype,
-                          shapes,
-                          shapes,
-                          use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                          use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
-            iu.infer_zero(self,
-                          'savedmodel_nobatch',
-                          1,
-                          dtype,
-                          shapes,
-                          shapes,
-                          use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                          use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
-
-        if tu.validate_for_onnx_model(dtype, dtype, dtype, shapes[0], shapes[0],
-                                      shapes[0]):
+            iu.infer_zero(
+                self,
+                "graphdef_nobatch",
+                1,
+                dtype,
+                shapes,
+                shapes,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
+            iu.infer_zero(
+                self,
+                "savedmodel_nobatch",
+                1,
+                dtype,
+                shapes,
+                shapes,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
+
+        if tu.validate_for_onnx_model(
+            dtype, dtype, dtype, shapes[0], shapes[0], shapes[0]
+        ):
             # model that supports batching
             for bs in (1, 8):
-                batch_shapes = [[
-                    bs,
-                ] + shape for shape in shapes]
+                batch_shapes = [
+                    [
+                        bs,
+                    ]
+                    + shape
+                    for shape in shapes
+                ]
                 iu.infer_zero(
                     self,
-                    'onnx',
+                    "onnx",
                     bs,
                     dtype,
                     batch_shapes,
                     batch_shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
             # model that does not support batching
-            iu.infer_zero(self,
-                          'onnx_nobatch',
-                          1,
-                          dtype,
-                          shapes,
-                          shapes,
-                          use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                          use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+            iu.infer_zero(
+                self,
+                "onnx_nobatch",
+                1,
+                dtype,
+                shapes,
+                shapes,
+                use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
+                use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+            )
 
         for name in ["simple_zero", "sequence_zero", "fan_zero"]:
-            if tu.validate_for_ensemble_model(name, dtype, dtype, dtype,
-                                              shapes[0], shapes[0], shapes[0]):
+            if tu.validate_for_ensemble_model(
+                name, dtype, dtype, dtype, shapes[0], shapes[0], shapes[0]
+            ):
                 # model that supports batching
                 for bs in (1, 8):
-                    batch_shapes = [[
-                        bs,
-                    ] + shape for shape in shapes]
+                    batch_shapes = [
+                        [
+                            bs,
+                        ]
+                        + shape
+                        for shape in shapes
+                    ]
                     iu.infer_zero(
                         self,
                         name,
@@ -133,81 +157,135 @@ def _full_zero(self, dtype, shapes):
                         batch_shapes,
                         batch_shapes,
                         use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                        use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                    )
                 # model that does not support batching
                 iu.infer_zero(
                     self,
-                    name + '_nobatch',
+                    name + "_nobatch",
                     1,
                     dtype,
                     shapes,
                     shapes,
                     use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY,
-                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY)
+                    use_cuda_shared_memory=TEST_CUDA_SHARED_MEMORY,
+                )
 
     def test_ff1_sanity(self):
-        self._full_zero(np.float32, ([
-            1,
-        ],))
+        self._full_zero(
+            np.float32,
+            (
+                [
+                    1,
+                ],
+            ),
+        )
 
     def test_ff1(self):
-        self._full_zero(np.float32, ([
-            0,
-        ],))
+        self._full_zero(
+            np.float32,
+            (
+                [
+                    0,
+                ],
+            ),
+        )
 
     def test_ff3_sanity(self):
-        self._full_zero(np.float32, ([
-            1,
-        ], [
-            2,
-        ], [
-            1,
-        ]))
+        self._full_zero(
+            np.float32,
+            (
+                [
+                    1,
+                ],
+                [
+                    2,
+                ],
+                [
+                    1,
+                ],
+            ),
+        )
 
     def test_ff3_0(self):
-        self._full_zero(np.float32, ([
-            0,
-        ], [
-            0,
-        ], [
-            0,
-        ]))
+        self._full_zero(
+            np.float32,
+            (
+                [
+                    0,
+                ],
+                [
+                    0,
+                ],
+                [
+                    0,
+                ],
+            ),
+        )
 
     def test_ff3_1(self):
-        self._full_zero(np.float32, ([
-            0,
-        ], [
-            0,
-        ], [
-            1,
-        ]))
+        self._full_zero(
+            np.float32,
+            (
+                [
+                    0,
+                ],
+                [
+                    0,
+                ],
+                [
+                    1,
+                ],
+            ),
+        )
 
     def test_ff3_2(self):
-        self._full_zero(np.float32, ([
-            0,
-        ], [
-            1,
-        ], [
-            0,
-        ]))
+        self._full_zero(
+            np.float32,
+            (
+                [
+                    0,
+                ],
+                [
+                    1,
+                ],
+                [
+                    0,
+                ],
+            ),
+        )
 
     def test_ff3_3(self):
-        self._full_zero(np.float32, ([
-            1,
-        ], [
-            0,
-        ], [
-            0,
-        ]))
+        self._full_zero(
+            np.float32,
+            (
+                [
+                    1,
+                ],
+                [
+                    0,
+                ],
+                [
+                    0,
+                ],
+            ),
+        )
 
     def test_ff3_4(self):
-        self._full_zero(np.float32, ([
-            1,
-        ], [
-            0,
-        ], [
-            1,
-        ]))
+        self._full_zero(
+            np.float32,
+            (
+                [
+                    1,
+                ],
+                [
+                    0,
+                ],
+                [
+                    1,
+                ],
+            ),
+        )
 
     def test_hh1_sanity(self):
         self._full_zero(np.float16, ([2, 2],))
@@ -240,14 +318,24 @@ def test_hh3_4(self):
         self._full_zero(np.float16, ([1, 1], [0, 6], [2, 2]))
 
     def test_oo1_sanity(self):
-        self._full_zero(np_dtype_string, ([
-            2,
-        ],))
+        self._full_zero(
+            np_dtype_string,
+            (
+                [
+                    2,
+                ],
+            ),
+        )
 
     def test_oo1(self):
-        self._full_zero(np_dtype_string, ([
-            0,
-        ],))
+        self._full_zero(
+            np_dtype_string,
+            (
+                [
+                    0,
+                ],
+            ),
+        )
 
     def test_oo3_sanity(self):
         self._full_zero(np_dtype_string, ([2, 2], [2, 2], [1, 1]))
@@ -268,15 +356,25 @@ def test_oo3_4(self):
         self._full_zero(np_dtype_string, ([1, 1], [0, 6], [2, 2]))
 
     def test_bb1_sanity(self):
-        self._full_zero(bool, ([
-            10,
-        ],))
+        self._full_zero(
+            bool,
+            (
+                [
+                    10,
+                ],
+            ),
+        )
 
     def test_bb1_0(self):
-        self._full_zero(bool, ([
-            0,
-        ],))
+        self._full_zero(
+            bool,
+            (
+                [
+                    0,
+                ],
+            ),
+        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
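
infer_zero_test.py exercises tensors in which one dimension is 0, so the request and response carry no data. A short numpy sketch of what such shapes look like, independent of the Triton client code:

    import numpy as np

    # A shape containing 0 yields an array with zero elements but a fixed dtype,
    # which is what the *_zero models above are asked to echo back.
    empty = np.zeros((0,), dtype=np.float32)
    print(empty.shape, empty.size, empty.nbytes)   # (0,) 0 0

    batched = np.zeros((8, 0), dtype=np.float32)   # batch of 8 zero-length rows
    print(batched.shape, batched.size)             # (8, 0) 0
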
diff --git a/qa/L0_inferentia_perf_analyzer/test.sh b/qa/L0_inferentia_perf_analyzer/test.sh
old mode 100644
new mode 100755
index 21e361ee6c..1881e07f87
--- a/qa/L0_inferentia_perf_analyzer/test.sh
+++ b/qa/L0_inferentia_perf_analyzer/test.sh
@@ -25,21 +25,21 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-# First need to set up enviroment
+# First need to set up environment
 if [ ${USE_TENSORFLOW} == "1" ] && [ ${USE_PYTORCH} == "1" ] ; then
     echo " Unsupported test configuration. Only one of USE_TENSORFLOW and USE_PYTORCH can be set to 1."
     exit 0
 elif [ ${USE_TENSORFLOW} == "1" ] ; then
-    echo "Setting up enviroment with tensorflow 1"
+    echo "Setting up environment with tensorflow 1"
     source ${TRITON_PATH}/python_backend/inferentia/scripts/setup.sh -t --tensorflow-version 1
 elif [ ${USE_PYTORCH} == "1" ] ; then
-    echo "Setting up enviroment with pytorch"
+    echo "Setting up environment with pytorch"
     source ${TRITON_PATH}/python_backend/inferentia/scripts/setup.sh -p
-else 
+else
     echo " Unsupported test configuration. USE_TENSORFLOW flag is: ${USE_TENSORFLOW} and USE_PYTORCH flag is: ${USE_PYTORCH}. Only one of them can be set to 1."
     exit 0
 fi
-echo "done setting up enviroment"
+echo "done setting up environment"
 
 REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION}
 if [ "$#" -ge 1 ]; then
@@ -80,32 +80,32 @@ function create_inferentia_models () {
     for DISABLE_DEFAULT_BATCHING_FLAG in ${DISABLE_DEFAULT_BATCHING_FLAGS}; do
         for BATCHED_FLAG in ${BATCHED_FLAGS}; do
             for TEST_TYPE in ${TEST_TYPES}; do
-                CURR_GEN_SCRIPT="${GEN_SCRIPT} --model_type ${MODEL_TYPE}  
-                --triton_model_dir ${TRITON_PATH}/models_${TEST_TYPE}${BATCHED_FLAG}${TEST_FRAMEWORK}${DISABLE_DEFAULT_BATCHING_FLAG}/add-sub-1x4 
+                CURR_GEN_SCRIPT="${GEN_SCRIPT} --model_type ${MODEL_TYPE}
+                --triton_model_dir ${TRITON_PATH}/models_${TEST_TYPE}${BATCHED_FLAG}${TEST_FRAMEWORK}${DISABLE_DEFAULT_BATCHING_FLAG}/add-sub-1x4
                 --compiled_model ${COMPILED_MODEL}"
                 if [ ${DISABLE_DEFAULT_BATCHING_FLAG} == "_no_batch" ]; then
-                    CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT} 
+                    CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT}
                     --disable_batch_requests_to_neuron"
                 fi
                 if [ ${BATCHED_FLAG} == "_batched_" ]; then
                     CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT}
-                    --triton_input INPUT__0,INT64,4 INPUT__1,INT64,4 
-                    --triton_output OUTPUT__0,INT64,4 OUTPUT__1,INT64,4          
-                    --enable_dynamic_batching 
-                    --max_batch_size 1000 
-                    --preferred_batch_size 8 
+                    --triton_input INPUT__0,INT64,4 INPUT__1,INT64,4
+                    --triton_output OUTPUT__0,INT64,4 OUTPUT__1,INT64,4
+                    --enable_dynamic_batching
+                    --max_batch_size 1000
+                    --preferred_batch_size 8
                     --max_queue_delay_microseconds 100"
                 else
                     CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT}
-                    --triton_input INPUT__0,INT64,-1x4 INPUT__1,INT64,-1x4 
+                    --triton_input INPUT__0,INT64,-1x4 INPUT__1,INT64,-1x4
                     --triton_output OUTPUT__0,INT64,-1x4 OUTPUT__1,INT64,-1x4"
                 fi
                 if [ ${TEST_TYPE} == "single" ]; then
-                    CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT}   
+                    CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT}
                     --neuron_core_range 0:0"
                 elif [ ${TEST_TYPE} == "multiple" ]; then
-                    CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT} 
-                    --triton_model_instance_count 3 
+                    CURR_GEN_SCRIPT="${CURR_GEN_SCRIPT}
+                    --triton_model_instance_count 3
                     --neuron_core_range 0:7"
                 fi
                 echo ${CURR_GEN_SCRIPT}
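
The only change to this script is the removal of trailing whitespace inside the multi-line command strings, which the newly added whitespace hook flags. A minimal sketch of doing the same cleanup by hand, assuming a single file and default newline handling ("test.sh" is a placeholder path):

    # Strip trailing whitespace from every line of a file, roughly what the
    # trailing-whitespace hook does.
    from pathlib import Path

    path = Path("test.sh")
    lines = path.read_text().splitlines()
    path.write_text("\n".join(line.rstrip() for line in lines) + "\n")
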
diff --git a/qa/L0_io/test.sh b/qa/L0_io/test.sh
index fc49a4d537..1f7d77ffcc 100755
--- a/qa/L0_io/test.sh
+++ b/qa/L0_io/test.sh
@@ -156,7 +156,7 @@ cp $MODELSDIR/libtorch_multi_device/config.pbtxt $MODELSDIR/libtorch_multi_gpu/.
     sed -i "s/name: \"libtorch_multi_device\"/name: \"libtorch_multi_gpu\"/" config.pbtxt)
 
 set +e
-python3 gen_libtorch_model.py >> $CLIENT_LOG 2>&1 
+python3 gen_libtorch_model.py >> $CLIENT_LOG 2>&1
 if [ $? -ne 0 ]; then
     echo -e "\n***\n*** Error when generating libtorch models. \n***"
     cat $CLIENT_LOG
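
The shell snippet above appends the generator's output to $CLIENT_LOG and then checks $? for failure. For comparison, an equivalent check written in Python with the standard library; the script name mirrors the shell snippet and "client.log" stands in for $CLIENT_LOG:

    # Run the generator, append stdout/stderr to the log, and fail on a
    # non-zero exit code, mirroring the shell logic above.
    import subprocess
    import sys

    with open("client.log", "a") as log:
        result = subprocess.run(
            ["python3", "gen_libtorch_model.py"], stdout=log, stderr=subprocess.STDOUT
        )
    if result.returncode != 0:
        print("Error when generating libtorch models.", file=sys.stderr)
        sys.exit(1)
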
diff --git a/qa/L0_java_memory_growth/MemoryGrowthTest.java b/qa/L0_java_memory_growth/MemoryGrowthTest.java
index d5a8092872..3060b6542c 100644
--- a/qa/L0_java_memory_growth/MemoryGrowthTest.java
+++ b/qa/L0_java_memory_growth/MemoryGrowthTest.java
@@ -24,880 +24,833 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import static org.bytedeco.tritonserver.global.tritonserver.*;
+
+import com.google.gson.*;
 import java.io.*;
 import java.util.*;
 import java.util.concurrent.*;
-import com.google.gson.*;
 import org.bytedeco.javacpp.*;
 import org.bytedeco.tritonserver.tritonserver.*;
-import static org.bytedeco.tritonserver.global.tritonserver.*;
 
 public class MemoryGrowthTest {
-    static final double TRITON_MIN_COMPUTE_CAPABILITY = 6.0;
-    private static boolean done = false;
-    static float max_growth_allowed = .10f;
-    static int max_mem_allowed = 30;
-
-    static void FAIL(String MSG) {
-        System.err.println("failure: " + MSG);
-        System.exit(1);
-    }
-
-    static void FAIL_IF_ERR(TRITONSERVER_Error err__, String MSG) {
-        if (err__ != null) {
-            System.err.println("error: " + MSG + ":"
-                             + TRITONSERVER_ErrorCodeString(err__) + " - "
-                             + TRITONSERVER_ErrorMessage(err__));
-            TRITONSERVER_ErrorDelete(err__);
-            System.exit(1);
-        }
-    }
-
-    static boolean enforce_memory_type = false;
-    static int requested_memory_type;
-    // Parameters for percentile range to include (exclude outliers)
-    static final int max_percentile = 90;
-    static final int min_percentile = 10;
-
-    static class TRITONSERVER_ServerDeleter extends TRITONSERVER_Server {
-        public TRITONSERVER_ServerDeleter(TRITONSERVER_Server p) { super(p); deallocator(new DeleteDeallocator(this)); }
-        protected static class DeleteDeallocator extends TRITONSERVER_Server implements Deallocator {
-            DeleteDeallocator(Pointer p) { super(p); }
-            @Override public void deallocate() { TRITONSERVER_ServerDelete(this); }
-        }
-    }
-
-    static void
-    Usage(String msg)
-    {
-      if (msg != null) {
-        System.err.println(msg);
-      }
-
-      System.err.println("Usage: java " + MemoryGrowthTest.class.getSimpleName() + " [options]");
-      System.err.println("\t-i Set number of iterations");
-      System.err.println("\t-m <\"system\"|\"pinned\"|gpu>"
-                       + " Enforce the memory type for input and output tensors."
-                       + " If not specified, inputs will be in system memory and outputs"
-                       + " will be based on the model's preferred type.");
-      System.err.println("\t-v Enable verbose logging");
-      System.err.println("\t-r [model repository absolute path]");
-      System.err.println("\t--max-growth Specify maximum allowed memory growth (%)");
-      System.err.println("\t--max-memory Specify maximum allowed memory (MB)");
-
+  static final double TRITON_MIN_COMPUTE_CAPABILITY = 6.0;
+  private static boolean done = false;
+  static float max_growth_allowed = .10f;
+  static int max_mem_allowed = 30;
+
+  static void FAIL(String MSG)
+  {
+    System.err.println("failure: " + MSG);
+    System.exit(1);
+  }
+
+  static void FAIL_IF_ERR(TRITONSERVER_Error err__, String MSG)
+  {
+    if (err__ != null) {
+      System.err.println(
+          "error: " + MSG + ":" + TRITONSERVER_ErrorCodeString(err__) + " - "
+          + TRITONSERVER_ErrorMessage(err__));
+      TRITONSERVER_ErrorDelete(err__);
       System.exit(1);
     }
+  }
 
-    static class ResponseAlloc extends TRITONSERVER_ResponseAllocatorAllocFn_t {
-        @Override public TRITONSERVER_Error call (
-            TRITONSERVER_ResponseAllocator allocator, String tensor_name,
-            long byte_size, int preferred_memory_type,
-            long preferred_memory_type_id, Pointer userp, PointerPointer buffer,
-            PointerPointer buffer_userp, IntPointer actual_memory_type,
-            LongPointer actual_memory_type_id)
-        {
-          // Initially attempt to make the actual memory type and id that we
-          // allocate be the same as preferred memory type
-          actual_memory_type.put(0, preferred_memory_type);
-          actual_memory_type_id.put(0, preferred_memory_type_id);
-
-          // If 'byte_size' is zero just return 'buffer' == nullptr, we don't
-          // need to do any other book-keeping.
-          if (byte_size == 0) {
-            buffer.put(0, null);
-            buffer_userp.put(0, null);
-          } else {
-            Pointer allocated_ptr = new Pointer();
-            if (enforce_memory_type) {
-              actual_memory_type.put(0, requested_memory_type);
-            }
-
-            actual_memory_type.put(0, TRITONSERVER_MEMORY_CPU);
-            allocated_ptr = Pointer.malloc(byte_size);
-
-            // Pass the tensor name with buffer_userp so we can show it when
-            // releasing the buffer.
-            if (!allocated_ptr.isNull()) {
-              buffer.put(0, allocated_ptr);
-              buffer_userp.put(0, Loader.newGlobalRef(tensor_name));
-            }
-          }
+  static boolean enforce_memory_type = false;
+  static int requested_memory_type;
+  // Parameters for percentile range to include (exclude outliers)
+  static final int max_percentile = 90;
+  static final int min_percentile = 10;
 
-          return null;  // Success
-        }
+  static class TRITONSERVER_ServerDeleter extends TRITONSERVER_Server {
+    public TRITONSERVER_ServerDeleter(TRITONSERVER_Server p)
+    {
+      super(p);
+      deallocator(new DeleteDeallocator(this));
     }
-
-    static class ResponseRelease extends TRITONSERVER_ResponseAllocatorReleaseFn_t {
-        @Override public TRITONSERVER_Error call (
-            TRITONSERVER_ResponseAllocator allocator, Pointer buffer, Pointer buffer_userp,
-            long byte_size, int memory_type, long memory_type_id)
-        {
-          String name = null;
-          if (buffer_userp != null) {
-            name = (String)Loader.accessGlobalRef(buffer_userp);
-          } else {
-            name = "";
-          }
-          Pointer.free(buffer);
-          Loader.deleteGlobalRef(buffer_userp);
-
-          return null;  // Success
-        }
+    protected static class DeleteDeallocator extends TRITONSERVER_Server implements Deallocator {
+      DeleteDeallocator(Pointer p) { super(p); }
+      @Override public void deallocate() { TRITONSERVER_ServerDelete(this); }
     }
+  }
 
-    static class InferRequestComplete extends TRITONSERVER_InferenceRequestReleaseFn_t {
-        @Override public void call (
-            TRITONSERVER_InferenceRequest request, int flags, Pointer userp)
-        {
-          // We reuse the request so we don't delete it here.
-        }
+  static void Usage(String msg)
+  {
+    if (msg != null) {
+      System.err.println(msg);
     }
 
-    static class InferResponseComplete extends TRITONSERVER_InferenceResponseCompleteFn_t {
-        @Override public void call (
-            TRITONSERVER_InferenceResponse response, int flags, Pointer userp)
-        {
-          if (response != null) {
-            // Send 'response' to the future.
-            futures.get(userp).complete(response);
-          }
+    System.err.println("Usage: java " + MemoryGrowthTest.class.getSimpleName() + " [options]");
+    System.err.println("\t-i Set number of iterations");
+    System.err.println(
+        "\t-m <\"system\"|\"pinned\"|gpu>"
+        + " Enforce the memory type for input and output tensors."
+        + " If not specified, inputs will be in system memory and outputs"
+        + " will be based on the model's preferred type.");
+    System.err.println("\t-v Enable verbose logging");
+    System.err.println("\t-r [model repository absolute path]");
+    System.err.println("\t--max-growth Specify maximum allowed memory growth (%)");
+    System.err.println("\t--max-memory Specify maximum allowed memory (MB)");
+
+    System.exit(1);
+  }
+
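For reference, the options listed by Usage() can be exercised with an invocation along these lines (the classpath, model repository path, and limits are placeholders chosen for illustration, not values taken from this change):

java -cp <triton-java-bindings-and-gson-classpath> MemoryGrowthTest -r /workspace/models -i 100000 -c --max-growth 10 --max-memory 30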
+  static class ResponseAlloc extends TRITONSERVER_ResponseAllocatorAllocFn_t {
+    @Override
+    public TRITONSERVER_Error call(
+        TRITONSERVER_ResponseAllocator allocator, String tensor_name, long byte_size,
+        int preferred_memory_type, long preferred_memory_type_id, Pointer userp,
+        PointerPointer buffer, PointerPointer buffer_userp, IntPointer actual_memory_type,
+        LongPointer actual_memory_type_id)
+    {
+      // Initially attempt to make the actual memory type and id that we
+      // allocate be the same as preferred memory type
+      actual_memory_type.put(0, preferred_memory_type);
+      actual_memory_type_id.put(0, preferred_memory_type_id);
+
+      // If 'byte_size' is zero just return 'buffer' == nullptr, we don't
+      // need to do any other book-keeping.
+      if (byte_size == 0) {
+        buffer.put(0, null);
+        buffer_userp.put(0, null);
+      } else {
+        Pointer allocated_ptr = new Pointer();
+        if (enforce_memory_type) {
+          actual_memory_type.put(0, requested_memory_type);
         }
-    }
 
-    static ConcurrentHashMap<Pointer, CompletableFuture<TRITONSERVER_InferenceResponse>> futures = new ConcurrentHashMap<>();
-    static ResponseAlloc responseAlloc = new ResponseAlloc();
-    static ResponseRelease responseRelease = new ResponseRelease();
-    static InferRequestComplete inferRequestComplete = new InferRequestComplete();
-    static InferResponseComplete inferResponseComplete = new InferResponseComplete();
+        actual_memory_type.put(0, TRITONSERVER_MEMORY_CPU);
+        allocated_ptr = Pointer.malloc(byte_size);
 
-    static TRITONSERVER_Error
-    ParseModelMetadata(
-        JsonObject model_metadata, boolean[] is_int,
-        boolean[] is_torch_model)
-    {
-      String seen_data_type = null;
-      for (JsonElement input_element : model_metadata.get("inputs").getAsJsonArray()) {
-        JsonObject input = input_element.getAsJsonObject();
-        if (!input.get("datatype").getAsString().equals("INT32") &&
-            !input.get("datatype").getAsString().equals("FP32")) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_UNSUPPORTED,
-              "simple lib example only supports model with data type INT32 or " +
-              "FP32");
-        }
-        if (seen_data_type == null) {
-          seen_data_type = input.get("datatype").getAsString();
-        } else if (!seen_data_type.equals(input.get("datatype").getAsString())) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_INVALID_ARG,
-              "the inputs and outputs of 'simple' model must have the data type");
-        }
-      }
-      for (JsonElement output_element : model_metadata.get("outputs").getAsJsonArray()) {
-        JsonObject output = output_element.getAsJsonObject();
-        if (!output.get("datatype").getAsString().equals("INT32") &&
-            !output.get("datatype").getAsString().equals("FP32")) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_UNSUPPORTED,
-              "simple lib example only supports model with data type INT32 or " +
-              "FP32");
-        } else if (!seen_data_type.equals(output.get("datatype").getAsString())) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_INVALID_ARG,
-              "the inputs and outputs of 'simple' model must have the data type");
+        // Pass the tensor name with buffer_userp so we can show it when
+        // releasing the buffer.
+        if (!allocated_ptr.isNull()) {
+          buffer.put(0, allocated_ptr);
+          buffer_userp.put(0, Loader.newGlobalRef(tensor_name));
         }
       }
 
-      is_int[0] = seen_data_type.equals("INT32");
-      is_torch_model[0] =
-          model_metadata.get("platform").getAsString().equals("pytorch_libtorch");
-      return null;
+      return null; // Success
     }
+  }
 
-    static void
-    GenerateInputData(
-        IntPointer[] input0_data, IntPointer[] input1_data)
+  static class ResponseRelease extends TRITONSERVER_ResponseAllocatorReleaseFn_t {
+    @Override
+    public TRITONSERVER_Error call(
+        TRITONSERVER_ResponseAllocator allocator, Pointer buffer, Pointer buffer_userp,
+        long byte_size, int memory_type, long memory_type_id)
     {
-      input0_data[0] = new IntPointer(16);
-      input1_data[0] = new IntPointer(16);
-      for (int i = 0; i < 16; ++i) {
-        input0_data[0].put(i, i);
-        input1_data[0].put(i, 1);
+      String name = null;
+      if (buffer_userp != null) {
+        name = (String) Loader.accessGlobalRef(buffer_userp);
+      } else {
+        name = "";
       }
+      Pointer.free(buffer);
+      Loader.deleteGlobalRef(buffer_userp);
+
+      return null; // Success
     }
+  }
 
-    static void
-    GenerateInputData(
-        FloatPointer[] input0_data, FloatPointer[] input1_data)
+  static class InferRequestComplete extends TRITONSERVER_InferenceRequestReleaseFn_t {
+    @Override public void call(TRITONSERVER_InferenceRequest request, int flags, Pointer userp)
     {
-      input0_data[0] = new FloatPointer(16);
-      input1_data[0] = new FloatPointer(16);
-      for (int i = 0; i < 16; ++i) {
-        input0_data[0].put(i, i);
-        input1_data[0].put(i, 1);
-      }
+      // We reuse the request so we don't delete it here.
     }
+  }
 
-    static void
-    CompareResult(
-        String output0_name, String output1_name,
-        IntPointer input0, IntPointer input1, IntPointer output0,
-        IntPointer output1)
+  static class InferResponseComplete extends TRITONSERVER_InferenceResponseCompleteFn_t {
+    @Override public void call(TRITONSERVER_InferenceResponse response, int flags, Pointer userp)
     {
-      for (int i = 0; i < 16; ++i) {
-        if ((input0.get(i) + input1.get(i)) != output0.get(i)) {
-          FAIL("incorrect sum in " + output0_name);
-        }
-        if ((input0.get(i) - input1.get(i)) != output1.get(i)) {
-          FAIL("incorrect difference in " + output1_name);
-        }
+      if (response != null) {
+        // Send 'response' to the future.
+        futures.get(userp).complete(response);
+      }
+    }
+  }
+
+  static ConcurrentHashMap<Pointer, CompletableFuture<TRITONSERVER_InferenceResponse>> futures =
+      new ConcurrentHashMap<>();
+  static ResponseAlloc responseAlloc = new ResponseAlloc();
+  static ResponseRelease responseRelease = new ResponseRelease();
+  static InferRequestComplete inferRequestComplete = new InferRequestComplete();
+  static InferResponseComplete inferResponseComplete = new InferResponseComplete();
+
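The futures map and the callback instances above implement a promise-per-request wait for asynchronous inference: the request pointer is the map key, the same pointer is passed as userp so InferResponseComplete can complete the matching promise, and the caller blocks on the future. A hypothetical helper distilling that pattern (the Triton calls are the same ones used in RunInference below; the helper itself is not part of this change):

  static TRITONSERVER_InferenceResponse inferAndWait(
      TRITONSERVER_ServerDeleter server, TRITONSERVER_InferenceRequest irequest,
      TRITONSERVER_ResponseAllocator allocator) throws Exception
  {
    CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
    futures.put(irequest, completed); // keyed by the request pointer, looked up via userp
    FAIL_IF_ERR(
        TRITONSERVER_InferenceRequestSetResponseCallback(
            irequest, allocator, null /* response_allocator_userp */, inferResponseComplete,
            irequest),
        "setting response callback");
    FAIL_IF_ERR(
        TRITONSERVER_ServerInferAsync(server, irequest, null /* trace */), "running inference");
    TRITONSERVER_InferenceResponse completed_response = completed.get(); // blocks until callback
    futures.remove(irequest);
    return completed_response;
  }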
+  static TRITONSERVER_Error ParseModelMetadata(
+      JsonObject model_metadata, boolean[] is_int, boolean[] is_torch_model)
+  {
+    String seen_data_type = null;
+    for (JsonElement input_element : model_metadata.get("inputs").getAsJsonArray()) {
+      JsonObject input = input_element.getAsJsonObject();
+      if (!input.get("datatype").getAsString().equals("INT32")
+          && !input.get("datatype").getAsString().equals("FP32")) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_UNSUPPORTED,
+            "simple lib example only supports model with data type INT32 or "
+                + "FP32");
+      }
+      if (seen_data_type == null) {
+        seen_data_type = input.get("datatype").getAsString();
+      } else if (!seen_data_type.equals(input.get("datatype").getAsString())) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INVALID_ARG,
+            "the inputs and outputs of 'simple' model must have the data type");
+      }
+    }
+    for (JsonElement output_element : model_metadata.get("outputs").getAsJsonArray()) {
+      JsonObject output = output_element.getAsJsonObject();
+      if (!output.get("datatype").getAsString().equals("INT32")
+          && !output.get("datatype").getAsString().equals("FP32")) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_UNSUPPORTED,
+            "simple lib example only supports model with data type INT32 or "
+                + "FP32");
+      } else if (!seen_data_type.equals(output.get("datatype").getAsString())) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INVALID_ARG,
+            "the inputs and outputs of 'simple' model must have the data type");
       }
     }
 
-    static void
-    CompareResult(
-        String output0_name, String output1_name,
-        FloatPointer input0, FloatPointer input1, FloatPointer output0,
-        FloatPointer output1)
-    {
-      for (int i = 0; i < 16; ++i) {
-        if ((input0.get(i) + input1.get(i)) != output0.get(i)) {
-          FAIL("incorrect sum in " + output0_name);
-        }
-        if ((input0.get(i) - input1.get(i)) != output1.get(i)) {
-          FAIL("incorrect difference in " + output1_name);
-        }
+    is_int[0] = seen_data_type.equals("INT32");
+    is_torch_model[0] = model_metadata.get("platform").getAsString().equals("pytorch_libtorch");
+    return null;
+  }
+
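ParseModelMetadata above consumes Triton's standard model-metadata JSON: a name, a platform, and inputs/outputs arrays with a datatype per tensor. As a rough illustration of the shape it expects, a hand-written metadata string for an INT32 "simple" model can be fed through it as below (the literal is invented for illustration; real metadata comes from TRITONSERVER_ServerModelMetadata, as in main() further down):

  String example_metadata =
      "{\"name\":\"simple\",\"platform\":\"tensorflow_graphdef\","
      + "\"inputs\":[{\"name\":\"INPUT0\",\"datatype\":\"INT32\",\"shape\":[1,16]},"
      + "{\"name\":\"INPUT1\",\"datatype\":\"INT32\",\"shape\":[1,16]}],"
      + "\"outputs\":[{\"name\":\"OUTPUT0\",\"datatype\":\"INT32\",\"shape\":[1,16]},"
      + "{\"name\":\"OUTPUT1\",\"datatype\":\"INT32\",\"shape\":[1,16]}]}";
  boolean[] is_int = {false}, is_torch_model = {false};
  TRITONSERVER_Error err = ParseModelMetadata(
      new JsonParser().parse(example_metadata).getAsJsonObject(), is_int, is_torch_model);
  // Expected for this input: err == null, is_int[0] == true, is_torch_model[0] == false.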
+  static void GenerateInputData(IntPointer[] input0_data, IntPointer[] input1_data)
+  {
+    input0_data[0] = new IntPointer(16);
+    input1_data[0] = new IntPointer(16);
+    for (int i = 0; i < 16; ++i) {
+      input0_data[0].put(i, i);
+      input1_data[0].put(i, 1);
+    }
+  }
+
+  static void GenerateInputData(FloatPointer[] input0_data, FloatPointer[] input1_data)
+  {
+    input0_data[0] = new FloatPointer(16);
+    input1_data[0] = new FloatPointer(16);
+    for (int i = 0; i < 16; ++i) {
+      input0_data[0].put(i, i);
+      input1_data[0].put(i, 1);
+    }
+  }
+
+  static void CompareResult(
+      String output0_name, String output1_name, IntPointer input0, IntPointer input1,
+      IntPointer output0, IntPointer output1)
+  {
+    for (int i = 0; i < 16; ++i) {
+      if ((input0.get(i) + input1.get(i)) != output0.get(i)) {
+        FAIL("incorrect sum in " + output0_name);
+      }
+      if ((input0.get(i) - input1.get(i)) != output1.get(i)) {
+        FAIL("incorrect difference in " + output1_name);
+      }
+    }
+  }
+
+  static void CompareResult(
+      String output0_name, String output1_name, FloatPointer input0, FloatPointer input1,
+      FloatPointer output0, FloatPointer output1)
+  {
+    for (int i = 0; i < 16; ++i) {
+      if ((input0.get(i) + input1.get(i)) != output0.get(i)) {
+        FAIL("incorrect sum in " + output0_name);
+      }
+      if ((input0.get(i) - input1.get(i)) != output1.get(i)) {
+        FAIL("incorrect difference in " + output1_name);
       }
     }
+  }
+
+  static void Check(
+      TRITONSERVER_InferenceResponse response, Pointer input0_data, Pointer input1_data,
+      String output0, String output1, long expected_byte_size, int expected_datatype,
+      boolean is_int)
+  {
+    HashMap<String, BytePointer> output_data = new HashMap<>();
+
+    int[] output_count = {0};
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceResponseOutputCount(response, output_count),
+        "getting number of response outputs");
+    if (output_count[0] != 2) {
+      FAIL("expecting 2 response outputs, got " + output_count[0]);
+    }
 
-    static void
-    Check(
-        TRITONSERVER_InferenceResponse response,
-        Pointer input0_data, Pointer input1_data,
-        String output0, String output1,
-        long expected_byte_size,
-        int expected_datatype, boolean is_int)
-    {
-      HashMap<String, BytePointer> output_data = new HashMap<>();
+    for (int idx = 0; idx < output_count[0]; ++idx) {
+      BytePointer cname = new BytePointer((Pointer) null);
+      IntPointer datatype = new IntPointer(1);
+      LongPointer shape = new LongPointer((Pointer) null);
+      LongPointer dim_count = new LongPointer(1);
+      Pointer base = new Pointer();
+      SizeTPointer byte_size = new SizeTPointer(1);
+      IntPointer memory_type = new IntPointer(1);
+      LongPointer memory_type_id = new LongPointer(1);
+      Pointer userp = new Pointer();
 
-      int[] output_count = {0};
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceResponseOutputCount(response, output_count),
-          "getting number of response outputs");
-      if (output_count[0] != 2) {
-        FAIL("expecting 2 response outputs, got " + output_count[0]);
-      }
-
-      for (int idx = 0; idx < output_count[0]; ++idx) {
-        BytePointer cname = new BytePointer((Pointer)null);
-        IntPointer datatype = new IntPointer(1);
-        LongPointer shape = new LongPointer((Pointer)null);
-        LongPointer dim_count = new LongPointer(1);
-        Pointer base = new Pointer();
-        SizeTPointer byte_size = new SizeTPointer(1);
-        IntPointer memory_type = new IntPointer(1);
-        LongPointer memory_type_id = new LongPointer(1);
-        Pointer userp = new Pointer();
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseOutput(
-                response, idx, cname, datatype, shape, dim_count, base,
-                byte_size, memory_type, memory_type_id, userp),
-            "getting output info");
-
-        if (cname.isNull()) {
-          FAIL("unable to get output name");
-        }
-
-        String name = cname.getString();
-        if ((!name.equals(output0)) && (!name.equals(output1))) {
-          FAIL("unexpected output '" + name + "'");
-        }
-
-        if ((dim_count.get() != 2) || (shape.get(0) != 1) || (shape.get(1) != 16)) {
-          FAIL("unexpected shape for '" + name + "'");
-        }
+          TRITONSERVER_InferenceResponseOutput(
+              response, idx, cname, datatype, shape, dim_count, base, byte_size, memory_type,
+              memory_type_id, userp),
+          "getting output info");
 
-        if (datatype.get() != expected_datatype) {
-          FAIL(
-              "unexpected datatype '" +
-              TRITONSERVER_DataTypeString(datatype.get()) + "' for '" +
-              name + "'");
-        }
-
-        if (byte_size.get() != expected_byte_size) {
-          FAIL(
-              "unexpected byte-size, expected " +
-              expected_byte_size + ", got " +
-              byte_size.get() + " for " + name);
-        }
-
-        if (enforce_memory_type && (memory_type.get() != requested_memory_type)) {
-          FAIL(
-              "unexpected memory type, expected to be allocated in " +
-              TRITONSERVER_MemoryTypeString(requested_memory_type) +
-              ", got " + TRITONSERVER_MemoryTypeString(memory_type.get()) +
-              ", id " + memory_type_id.get() + " for " + name);
-        }
+      if (cname.isNull()) {
+        FAIL("unable to get output name");
+      }
 
-        // We make a copy of the data here... which we could avoid for
-        // performance reasons but ok for this simple example.
-        BytePointer odata = new BytePointer(byte_size.get());
-        output_data.put(name, odata);
-        odata.put(base.limit(byte_size.get()));
+      String name = cname.getString();
+      if ((!name.equals(output0)) && (!name.equals(output1))) {
+        FAIL("unexpected output '" + name + "'");
       }
 
-      if (is_int) {
-        CompareResult(
-            output0, output1, new IntPointer(input0_data), new IntPointer(input1_data),
-            new IntPointer(output_data.get(output0)), new IntPointer(output_data.get(output1)));
-      } else {
-        CompareResult(
-            output0, output1, new FloatPointer(input0_data), new FloatPointer(input1_data),
-            new FloatPointer(output_data.get(output0)), new FloatPointer(output_data.get(output1)));
-      }
-    }
-
-    /**
-    Returns whether the memory growth is within the acceptable range
-    @param  max_float_allowed     Maximum allowed memory growth (%)
-    @param  max_mem_allowed       Maximum allowed memory (MB)
-     */
-    static boolean
-    ValidateMemoryGrowth(float max_growth_allowed, int max_mem_allowed){
-      // Allocate list starting capacity to hold up to 24 hours worth of snapshots.
-      List<Double> memory_snapshots = new ArrayList<Double>(20000);
-      while(!done){
-        try {
-          Thread.sleep(5000);
-        } catch (InterruptedException e){
-          System.out.println("Memory growth validation interrupted.");
-        }
-        System.gc();
-        double snapshot = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory();
-        memory_snapshots.add(snapshot);
-        System.out.println("Memory allocated (MB):" + snapshot/1E6);
+      if ((dim_count.get() != 2) || (shape.get(0) != 1) || (shape.get(1) != 16)) {
+        FAIL("unexpected shape for '" + name + "'");
       }
-      if(memory_snapshots.size() < 5){
-        System.out.println("Error: Not enough snapshots, found " + memory_snapshots.size()
-        + " snapshots");
-        return false;
+
+      if (datatype.get() != expected_datatype) {
+        FAIL(
+            "unexpected datatype '" + TRITONSERVER_DataTypeString(datatype.get()) + "' for '" + name
+            + "'");
       }
 
-      // Measure memory growth without outliers by taking difference
-      // between 90th percentile and 10th percentile memory usage.
-      final double bytes_in_mb = 1E6;
-      Collections.sort(memory_snapshots);
-      int index_max = ((int) Math.ceil(max_percentile / 100.0 * memory_snapshots.size())) - 1;
-      int index_min = ((int) Math.ceil(min_percentile / 100.0 * memory_snapshots.size())) - 1;
-      double memory_allocation_delta = memory_snapshots.get(index_max) - memory_snapshots.get(index_min);
-      double memory_allocation_delta_mb = memory_allocation_delta / bytes_in_mb;
-      double memory_allocation_delta_percent = memory_allocation_delta / memory_snapshots.get(index_max);
+      if (byte_size.get() != expected_byte_size) {
+        FAIL(
+            "unexpected byte-size, expected " + expected_byte_size + ", got " + byte_size.get()
+            + " for " + name);
+      }
 
-      System.out.println("Change in memory allocation (MB): " +
-          memory_allocation_delta_mb + ", " +
-          (memory_allocation_delta_percent * 100) + "%");
+      if (enforce_memory_type && (memory_type.get() != requested_memory_type)) {
+        FAIL(
+            "unexpected memory type, expected to be allocated in "
+            + TRITONSERVER_MemoryTypeString(requested_memory_type) + ", got "
+            + TRITONSERVER_MemoryTypeString(memory_type.get()) + ", id " + memory_type_id.get()
+            + " for " + name);
+      }
 
-      boolean passed = true;
+      // We make a copy of the data here... which we could avoid for
+      // performance reasons but ok for this simple example.
+      BytePointer odata = new BytePointer(byte_size.get());
+      output_data.put(name, odata);
+      odata.put(base.limit(byte_size.get()));
+    }
 
-      if(memory_allocation_delta_percent >= max_growth_allowed){
-        passed = false;
-        System.out.println("Exceeded allowed memory growth (" +
-          (max_growth_allowed * 100) + "%)");
+    if (is_int) {
+      CompareResult(
+          output0, output1, new IntPointer(input0_data), new IntPointer(input1_data),
+          new IntPointer(output_data.get(output0)), new IntPointer(output_data.get(output1)));
+    } else {
+      CompareResult(
+          output0, output1, new FloatPointer(input0_data), new FloatPointer(input1_data),
+          new FloatPointer(output_data.get(output0)), new FloatPointer(output_data.get(output1)));
+    }
+  }
+
+  /**
+  Returns whether the memory growth is within the acceptable range
+  @param  max_growth_allowed    Maximum allowed memory growth (%)
+  @param  max_mem_allowed       Maximum allowed memory (MB)
+   */
+  static boolean ValidateMemoryGrowth(float max_growth_allowed, int max_mem_allowed)
+  {
+    // Allocate list starting capacity to hold up to 24 hours worth of snapshots.
+    List<Double> memory_snapshots = new ArrayList<Double>(20000);
+    while (!done) {
+      try {
+        Thread.sleep(5000);
       }
-
-      if((memory_snapshots.get(index_max) / bytes_in_mb) >= max_mem_allowed){
-        passed = false;
-        System.out.println("Exceeded allowed memory (" + max_mem_allowed + 
-          "MB), got " + (memory_snapshots.get(index_max) / bytes_in_mb) + "MB");
+      catch (InterruptedException e) {
+        System.out.println("Memory growth validation interrupted.");
       }
-      return passed;
+      System.gc();
+      double snapshot = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory();
+      memory_snapshots.add(snapshot);
+      System.out.println("Memory allocated (MB):" + snapshot / 1E6);
+    }
+    if (memory_snapshots.size() < 5) {
+      System.out.println(
+          "Error: Not enough snapshots, found " + memory_snapshots.size() + " snapshots");
+      return false;
     }
 
-    static void
-    RunInference(TRITONSERVER_ServerDeleter server, String model_name, boolean[] is_int, boolean[] is_torch_model, boolean check_accuracy)
-    throws Exception
-    {
-      // Create the allocator that will be used to allocate buffers for
-      // the result tensors.
-      TRITONSERVER_ResponseAllocator allocator = new TRITONSERVER_ResponseAllocator(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_ResponseAllocatorNew(
-              allocator, responseAlloc, responseRelease, null /* start_fn */),
-          "creating response allocator");
-
-      // Inference
-      TRITONSERVER_InferenceRequest irequest = new TRITONSERVER_InferenceRequest(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestNew(
-              irequest, server, model_name, -1 /* model_version */),
-          "creating inference request");
+    // Measure memory growth without outliers by taking difference
+    // between 90th percentile and 10th percentile memory usage.
+    final double bytes_in_mb = 1E6;
+    Collections.sort(memory_snapshots);
+    int index_max = ((int) Math.ceil(max_percentile / 100.0 * memory_snapshots.size())) - 1;
+    int index_min = ((int) Math.ceil(min_percentile / 100.0 * memory_snapshots.size())) - 1;
+    double memory_allocation_delta =
+        memory_snapshots.get(index_max) - memory_snapshots.get(index_min);
+    double memory_allocation_delta_mb = memory_allocation_delta / bytes_in_mb;
+    double memory_allocation_delta_percent =
+        memory_allocation_delta / memory_snapshots.get(index_max);
+
+    System.out.println(
+        "Change in memory allocation (MB): " + memory_allocation_delta_mb + ", "
+        + (memory_allocation_delta_percent * 100) + "%");
+
+    boolean passed = true;
+
+    if (memory_allocation_delta_percent >= max_growth_allowed) {
+      passed = false;
+      System.out.println("Exceeded allowed memory growth (" + (max_growth_allowed * 100) + "%)");
+    }
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestSetId(irequest, "my_request_id"),
-          "setting ID for the request");
+    if ((memory_snapshots.get(index_max) / bytes_in_mb) >= max_mem_allowed) {
+      passed = false;
+      System.out.println(
+          "Exceeded allowed memory (" + max_mem_allowed + "MB), got "
+          + (memory_snapshots.get(index_max) / bytes_in_mb) + "MB");
+    }
+    return passed;
+  }
+
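To make the percentile arithmetic above concrete, here is a hypothetical run with ten sorted snapshots (the values are invented, in bytes):

  // 90th/10th percentile indices for 10 samples: ceil(0.9 * 10) - 1 = 8, ceil(0.1 * 10) - 1 = 0.
  double[] snapshots = {
      20.0e6, 20.1e6, 20.2e6, 20.3e6, 20.4e6, 20.5e6, 20.6e6, 20.8e6, 21.0e6, 30.0e6};
  double delta = snapshots[8] - snapshots[0]; // 1.0e6 bytes = 1 MB
  double growth = delta / snapshots[8];       // ~0.048, i.e. roughly 4.8% growth
  // With the defaults above (10% growth, 30 MB) this run passes: 4.8% < 10% and 21 MB < 30 MB.
  // The 30 MB outlier at the top of the list is excluded by the 90th-percentile cut-off.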
+  static void RunInference(
+      TRITONSERVER_ServerDeleter server, String model_name, boolean[] is_int,
+      boolean[] is_torch_model, boolean check_accuracy) throws Exception
+  {
+    // Create the allocator that will be used to allocate buffers for
+    // the result tensors.
+    TRITONSERVER_ResponseAllocator allocator = new TRITONSERVER_ResponseAllocator(null);
+    FAIL_IF_ERR(
+        TRITONSERVER_ResponseAllocatorNew(
+            allocator, responseAlloc, responseRelease, null /* start_fn */),
+        "creating response allocator");
+
+    // Inference
+    TRITONSERVER_InferenceRequest irequest = new TRITONSERVER_InferenceRequest(null);
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestNew(irequest, server, model_name, -1 /* model_version */),
+        "creating inference request");
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestSetId(irequest, "my_request_id"),
+        "setting ID for the request");
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestSetReleaseCallback(
+            irequest, inferRequestComplete, null /* request_release_userp */),
+        "setting request release callback");
+
+    // Inputs
+    String input0 = is_torch_model[0] ? "INPUT__0" : "INPUT0";
+    String input1 = is_torch_model[0] ? "INPUT__1" : "INPUT1";
+
+    long[] input0_shape = {1, 16};
+    long[] input1_shape = {1, 16};
+
+    int datatype = (is_int[0]) ? TRITONSERVER_TYPE_INT32 : TRITONSERVER_TYPE_FP32;
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAddInput(
+            irequest, input0, datatype, input0_shape, input0_shape.length),
+        "setting input 0 meta-data for the request");
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAddInput(
+            irequest, input1, datatype, input1_shape, input1_shape.length),
+        "setting input 1 meta-data for the request");
+
+    String output0 = is_torch_model[0] ? "OUTPUT__0" : "OUTPUT0";
+    String output1 = is_torch_model[0] ? "OUTPUT__1" : "OUTPUT1";
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output0),
+        "requesting output 0 for the request");
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output1),
+        "requesting output 1 for the request");
+
+    // Create the data for the two input tensors. Initialize the first
+    // to unique values and the second to all ones.
+    BytePointer input0_data;
+    BytePointer input1_data;
+    if (is_int[0]) {
+      IntPointer[] p0 = {null}, p1 = {null};
+      GenerateInputData(p0, p1);
+      input0_data = p0[0].getPointer(BytePointer.class);
+      input1_data = p1[0].getPointer(BytePointer.class);
+    } else {
+      FloatPointer[] p0 = {null}, p1 = {null};
+      GenerateInputData(p0, p1);
+      input0_data = p0[0].getPointer(BytePointer.class);
+      input1_data = p1[0].getPointer(BytePointer.class);
+    }
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestSetReleaseCallback(
-              irequest, inferRequestComplete, null /* request_release_userp */),
-          "setting request release callback");
+    long input0_size = input0_data.limit();
+    long input1_size = input1_data.limit();
 
-      // Inputs
-      String input0 = is_torch_model[0] ? "INPUT__0" : "INPUT0";
-      String input1 = is_torch_model[0] ? "INPUT__1" : "INPUT1";
+    Pointer input0_base = input0_data;
+    Pointer input1_base = input1_data;
 
-      long[] input0_shape = {1, 16};
-      long[] input1_shape = {1, 16};
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAppendInputData(
+            irequest, input0, input0_base, input0_size, requested_memory_type,
+            0 /* memory_type_id */),
+        "assigning INPUT0 data");
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAppendInputData(
+            irequest, input1, input1_base, input1_size, requested_memory_type,
+            0 /* memory_type_id */),
+        "assigning INPUT1 data");
 
-      int datatype =
-          (is_int[0]) ? TRITONSERVER_TYPE_INT32 : TRITONSERVER_TYPE_FP32;
+    // Perform inference...
+    {
+      CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
+      futures.put(irequest, completed);
 
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAddInput(
-              irequest, input0, datatype, input0_shape, input0_shape.length),
-          "setting input 0 meta-data for the request");
+          TRITONSERVER_InferenceRequestSetResponseCallback(
+              irequest, allocator, null /* response_allocator_userp */, inferResponseComplete,
+              irequest),
+          "setting response callback");
+
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAddInput(
-              irequest, input1, datatype, input1_shape, input1_shape.length),
-          "setting input 1 meta-data for the request");
+          TRITONSERVER_ServerInferAsync(server, irequest, null /* trace */), "running inference");
 
-      String output0 = is_torch_model[0] ? "OUTPUT__0" : "OUTPUT0";
-      String output1 = is_torch_model[0] ? "OUTPUT__1" : "OUTPUT1";
+      // Wait for the inference to complete.
+      TRITONSERVER_InferenceResponse completed_response = completed.get();
+      futures.remove(irequest);
 
+      FAIL_IF_ERR(TRITONSERVER_InferenceResponseError(completed_response), "response status");
+      if (check_accuracy) {
+        Check(
+            completed_response, input0_data, input1_data, output0, output1, input0_size, datatype,
+            is_int[0]);
+      }
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output0),
-          "requesting output 0 for the request");
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output1),
-          "requesting output 1 for the request");
+          TRITONSERVER_InferenceResponseDelete(completed_response), "deleting inference response");
+    }
 
-      // Create the data for the two input tensors. Initialize the first
-      // to unique values and the second to all ones.
-      BytePointer input0_data;
-      BytePointer input1_data;
+    // Modify some input data in place and then reuse the request
+    // object. For simplicity we only do this when the input tensors are
+    // in non-pinned system memory.
+    if (!enforce_memory_type || (requested_memory_type == TRITONSERVER_MEMORY_CPU)) {
       if (is_int[0]) {
-        IntPointer[] p0 = {null}, p1 = {null};
-        GenerateInputData(p0, p1);
-        input0_data = p0[0].getPointer(BytePointer.class);
-        input1_data = p1[0].getPointer(BytePointer.class);
+        new IntPointer(input0_data).put(0, 27);
       } else {
-        FloatPointer[] p0 = {null}, p1 = {null};
-        GenerateInputData(p0, p1);
-        input0_data = p0[0].getPointer(BytePointer.class);
-        input1_data = p1[0].getPointer(BytePointer.class);
+        new FloatPointer(input0_data).put(0, 27.0f);
       }
 
-      long input0_size = input0_data.limit();
-      long input1_size = input1_data.limit();
+      CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
+      futures.put(irequest, completed);
 
-      Pointer input0_base = input0_data;
-      Pointer input1_base = input1_data;
+      // Using a new promise so have to re-register the callback to set
+      // the promise as the userp.
+      FAIL_IF_ERR(
+          TRITONSERVER_InferenceRequestSetResponseCallback(
+              irequest, allocator, null /* response_allocator_userp */, inferResponseComplete,
+              irequest),
+          "setting response callback");
 
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAppendInputData(
-              irequest, input0, input0_base, input0_size, requested_memory_type,
-              0 /* memory_type_id */),
-          "assigning INPUT0 data");
+          TRITONSERVER_ServerInferAsync(server, irequest, null /* trace */), "running inference");
+
+      // Wait for the inference to complete.
+      TRITONSERVER_InferenceResponse completed_response = completed.get();
+      futures.remove(irequest);
+      FAIL_IF_ERR(TRITONSERVER_InferenceResponseError(completed_response), "response status");
+      if (check_accuracy) {
+        Check(
+            completed_response, input0_data, input1_data, output0, output1, input0_size, datatype,
+            is_int[0]);
+      }
+
+      FAIL_IF_ERR(
+          TRITONSERVER_InferenceResponseDelete(completed_response), "deleting inference response");
+    }
+
+    // Remove input data and then add back different data.
+    {
+      FAIL_IF_ERR(
+          TRITONSERVER_InferenceRequestRemoveAllInputData(irequest, input0),
+          "removing INPUT0 data");
       FAIL_IF_ERR(
           TRITONSERVER_InferenceRequestAppendInputData(
-              irequest, input1, input1_base, input1_size, requested_memory_type,
+              irequest, input0, input1_base, input1_size, requested_memory_type,
               0 /* memory_type_id */),
-          "assigning INPUT1 data");
-
-      // Perform inference...
-      {
-        CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
-        futures.put(irequest, completed);
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceRequestSetResponseCallback(
-                irequest, allocator, null /* response_allocator_userp */,
-                inferResponseComplete, irequest),
-            "setting response callback");
-
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerInferAsync(
-                server, irequest, null /* trace */),
-            "running inference");
-
-        // Wait for the inference to complete.
-        TRITONSERVER_InferenceResponse completed_response = completed.get();
-        futures.remove(irequest);
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseError(completed_response),
-            "response status");
-        if (check_accuracy) {
-          Check(
-              completed_response, input0_data, input1_data, output0, output1,
-              input0_size, datatype, is_int[0]);
-        }
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseDelete(completed_response),
-            "deleting inference response");
-      }
-
-      // Modify some input data in place and then reuse the request
-      // object. For simplicity we only do this when the input tensors are
-      // in non-pinned system memory.
-      if (!enforce_memory_type ||
-          (requested_memory_type == TRITONSERVER_MEMORY_CPU)) {
-        if (is_int[0]) {
-          new IntPointer(input0_data).put(0, 27);
-        } else {
-          new FloatPointer(input0_data).put(0, 27.0f);
-        }
-
-        CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
-        futures.put(irequest, completed);
-
-        // Using a new promise so have to re-register the callback to set
-        // the promise as the userp.
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceRequestSetResponseCallback(
-                irequest, allocator, null /* response_allocator_userp */,
-                inferResponseComplete, irequest),
-            "setting response callback");
-
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerInferAsync(
-                server, irequest, null /* trace */),
-            "running inference");
-
-        // Wait for the inference to complete.
-        TRITONSERVER_InferenceResponse completed_response = completed.get();
-        futures.remove(irequest);
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseError(completed_response),
-            "response status");
-        if (check_accuracy) {
-          Check(
-              completed_response, input0_data, input1_data, output0, output1,
-              input0_size, datatype, is_int[0]);
-        }
+          "assigning INPUT1 data to INPUT0");
 
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseDelete(completed_response),
-            "deleting inference response");
-      }
-
-      // Remove input data and then add back different data.
-      {
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceRequestRemoveAllInputData(irequest, input0),
-            "removing INPUT0 data");
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceRequestAppendInputData(
-                irequest, input0, input1_base, input1_size, requested_memory_type,
-                0 /* memory_type_id */),
-            "assigning INPUT1 data to INPUT0");
-
-        CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
-        futures.put(irequest, completed);
-
-        // Using a new promise so have to re-register the callback to set
-        // the promise as the userp.
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceRequestSetResponseCallback(
-                irequest, allocator, null /* response_allocator_userp */,
-                inferResponseComplete, irequest),
-            "setting response callback");
-
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerInferAsync(
-                server, irequest, null /* trace */),
-            "running inference");
-
-        // Wait for the inference to complete.
-        TRITONSERVER_InferenceResponse completed_response = completed.get();
-        futures.remove(irequest);
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseError(completed_response),
-            "response status");
-
-        if (check_accuracy) {
-          // Both inputs are using input1_data...
-          Check(
-              completed_response, input1_data, input1_data, output0, output1,
-              input0_size, datatype, is_int[0]);
-        }
+      CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
+      futures.put(irequest, completed);
 
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseDelete(completed_response),
-            "deleting inference response");
-      }
+      // Using a new promise so have to re-register the callback to set
+      // the promise as the userp.
+      FAIL_IF_ERR(
+          TRITONSERVER_InferenceRequestSetResponseCallback(
+              irequest, allocator, null /* response_allocator_userp */, inferResponseComplete,
+              irequest),
+          "setting response callback");
 
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestDelete(irequest),
-          "deleting inference request");
+          TRITONSERVER_ServerInferAsync(server, irequest, null /* trace */), "running inference");
+
+      // Wait for the inference to complete.
+      TRITONSERVER_InferenceResponse completed_response = completed.get();
+      futures.remove(irequest);
+      FAIL_IF_ERR(TRITONSERVER_InferenceResponseError(completed_response), "response status");
+
+      if (check_accuracy) {
+        // Both inputs are using input1_data...
+        Check(
+            completed_response, input1_data, input1_data, output0, output1, input0_size, datatype,
+            is_int[0]);
+      }
 
       FAIL_IF_ERR(
-          TRITONSERVER_ResponseAllocatorDelete(allocator),
-          "deleting response allocator");
+          TRITONSERVER_InferenceResponseDelete(completed_response), "deleting inference response");
     }
 
-    public static void
-    main(String[] args) throws Exception
-    {
-      int num_iterations = 1000000;
-      String model_repository_path = null;
-      int verbose_level = 0;
-      boolean check_accuracy = false;
-
-      // Parse commandline...
-      for (int i = 0; i < args.length; i++) {
-        switch (args[i]) {
-          case "-i":
-            i++;
-            try {
-              num_iterations = Integer.parseInt(args[i]);
-            } catch (NumberFormatException e){
-              Usage(
-                  "-i must be used to specify number of iterations");
-            }
-            break;
-          case "-m":
-            enforce_memory_type = true;
-            i++;
-            if (args[i].equals("system")) {
-              requested_memory_type = TRITONSERVER_MEMORY_CPU;
-            } else if (args[i].equals("pinned")) {
-              requested_memory_type = TRITONSERVER_MEMORY_CPU_PINNED;
-            } else if (args[i].equals("gpu")) {
-              requested_memory_type = TRITONSERVER_MEMORY_GPU;
-            } else {
-              Usage(
-                  "-m must be used to specify one of the following types:" +
-                  " <\"system\"|\"pinned\"|gpu>");
-            }
-            break;
-          case "-r":
-            model_repository_path = args[++i];
-            break;
-          case "-v":
-            verbose_level = 1;
-            break;
-          case "-c":
-            check_accuracy = true;
-            break;
-          case "-?":
-            Usage(null);
-            break;
-          case "--max-growth":
-            i++;
-            try {
-              max_growth_allowed = Integer.parseInt(args[i]) / 100.0f;
-            } catch (NumberFormatException e){
-              Usage(
-                  "--max-growth must be an integer value specifying allowed memory growth (%)");
-            }
-            break;
-          case "--max-memory":
-            i++;
-            try {
-              max_mem_allowed = Integer.parseInt(args[i]);
-            } catch (NumberFormatException e){
-              Usage(
-                  "--max-memory must be an integer value specifying maximum allowed memory (MB)");
-            }
-            break;
-        }
-      }
+    FAIL_IF_ERR(TRITONSERVER_InferenceRequestDelete(irequest), "deleting inference request");
+
+    FAIL_IF_ERR(TRITONSERVER_ResponseAllocatorDelete(allocator), "deleting response allocator");
+  }
+
+  public static void main(String[] args) throws Exception
+  {
+    int num_iterations = 1000000;
+    String model_repository_path = null;
+    int verbose_level = 0;
+    boolean check_accuracy = false;
 
-      if (model_repository_path == null) {
-        Usage("-r must be used to specify model repository path");
+    // Parse commandline...
+    for (int i = 0; i < args.length; i++) {
+      switch (args[i]) {
+        case "-i":
+          i++;
+          try {
+            num_iterations = Integer.parseInt(args[i]);
+          }
+          catch (NumberFormatException e) {
+            Usage("-i must be used to specify number of iterations");
+          }
+          break;
+        case "-m":
+          enforce_memory_type = true;
+          i++;
+          if (args[i].equals("system")) {
+            requested_memory_type = TRITONSERVER_MEMORY_CPU;
+          } else if (args[i].equals("pinned")) {
+            requested_memory_type = TRITONSERVER_MEMORY_CPU_PINNED;
+          } else if (args[i].equals("gpu")) {
+            requested_memory_type = TRITONSERVER_MEMORY_GPU;
+          } else {
+            Usage(
+                "-m must be used to specify one of the following types:"
+                + " <\"system\"|\"pinned\"|gpu>");
+          }
+          break;
+        case "-r":
+          model_repository_path = args[++i];
+          break;
+        case "-v":
+          verbose_level = 1;
+          break;
+        case "-c":
+          check_accuracy = true;
+          break;
+        case "-?":
+          Usage(null);
+          break;
+        case "--max-growth":
+          i++;
+          try {
+            max_growth_allowed = Integer.parseInt(args[i]) / 100.0f;
+          }
+          catch (NumberFormatException e) {
+            Usage("--max-growth must be an integer value specifying allowed memory growth (%)");
+          }
+          break;
+        case "--max-memory":
+          i++;
+          try {
+            max_mem_allowed = Integer.parseInt(args[i]);
+          }
+          catch (NumberFormatException e) {
+            Usage("--max-memory must be an integer value specifying maximum allowed memory (MB)");
+          }
+          break;
       }
-      if (enforce_memory_type && requested_memory_type != TRITONSERVER_MEMORY_CPU) {
-        Usage("-m can only be set to \"system\" without enabling GPU");
+    }
+
+    if (model_repository_path == null) {
+      Usage("-r must be used to specify model repository path");
+    }
+    if (enforce_memory_type && requested_memory_type != TRITONSERVER_MEMORY_CPU) {
+      Usage("-m can only be set to \"system\" without enabling GPU");
+    }
+
+    // Check API version.
+    int[] api_version_major = {0}, api_version_minor = {0};
+    FAIL_IF_ERR(
+        TRITONSERVER_ApiVersion(api_version_major, api_version_minor),
+        "getting Triton API version");
+    if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major[0])
+        || (TRITONSERVER_API_VERSION_MINOR > api_version_minor[0])) {
+      FAIL("triton server API version mismatch");
+    }
+
+    // Create the server...
+    TRITONSERVER_ServerOptions server_options = new TRITONSERVER_ServerOptions(null);
+    FAIL_IF_ERR(TRITONSERVER_ServerOptionsNew(server_options), "creating server options");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetModelRepositoryPath(server_options, model_repository_path),
+        "setting model repository path");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetLogVerbose(server_options, verbose_level),
+        "setting verbose logging level");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetBackendDirectory(server_options, "/opt/tritonserver/backends"),
+        "setting backend directory");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
+            server_options, "/opt/tritonserver/repoagents"),
+        "setting repository agent directory");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetStrictModelConfig(server_options, true),
+        "setting strict model configuration");
+    double min_compute_capability = TRITON_MIN_COMPUTE_CAPABILITY;
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability(
+            server_options, min_compute_capability),
+        "setting minimum supported CUDA compute capability");
+
+    TRITONSERVER_Server server_ptr = new TRITONSERVER_Server(null);
+    FAIL_IF_ERR(TRITONSERVER_ServerNew(server_ptr, server_options), "creating server");
+    FAIL_IF_ERR(TRITONSERVER_ServerOptionsDelete(server_options), "deleting server options");
+
+    TRITONSERVER_ServerDeleter server = new TRITONSERVER_ServerDeleter(server_ptr);
+
+    // Wait until the server is both live and ready.
+    int health_iters = 0;
+    while (true) {
+      boolean[] live = {false}, ready = {false};
+      FAIL_IF_ERR(TRITONSERVER_ServerIsLive(server, live), "unable to get server liveness");
+      FAIL_IF_ERR(TRITONSERVER_ServerIsReady(server, ready), "unable to get server readiness");
+      System.out.println("Server Health: live " + live[0] + ", ready " + ready[0]);
+      if (live[0] && ready[0]) {
+        break;
       }
 
-      // Check API version.
-      int[] api_version_major = {0}, api_version_minor = {0};
-      FAIL_IF_ERR(
-          TRITONSERVER_ApiVersion(api_version_major, api_version_minor),
-          "getting Triton API version");
-      if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major[0]) ||
-          (TRITONSERVER_API_VERSION_MINOR > api_version_minor[0])) {
-        FAIL("triton server API version mismatch");
+      if (++health_iters >= 10) {
+        FAIL("failed to find healthy inference server");
       }
 
-      // Create the server...
-      TRITONSERVER_ServerOptions server_options = new TRITONSERVER_ServerOptions(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsNew(server_options),
-          "creating server options");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetModelRepositoryPath(
-              server_options, model_repository_path),
-          "setting model repository path");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetLogVerbose(server_options, verbose_level),
-          "setting verbose logging level");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetBackendDirectory(
-              server_options, "/opt/tritonserver/backends"),
-          "setting backend directory");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
-              server_options, "/opt/tritonserver/repoagents"),
-          "setting repository agent directory");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetStrictModelConfig(server_options, true),
-          "setting strict model configuration");
-      double min_compute_capability = TRITON_MIN_COMPUTE_CAPABILITY;
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability(
-              server_options, min_compute_capability),
-          "setting minimum supported CUDA compute capability");
+      Thread.sleep(500);
+    }
 
-      TRITONSERVER_Server server_ptr = new TRITONSERVER_Server(null);
+    // Print status of the server.
+    {
+      TRITONSERVER_Message server_metadata_message = new TRITONSERVER_Message(null);
       FAIL_IF_ERR(
-          TRITONSERVER_ServerNew(server_ptr, server_options), "creating server");
+          TRITONSERVER_ServerMetadata(server, server_metadata_message),
+          "unable to get server metadata message");
+      BytePointer buffer = new BytePointer((Pointer) null);
+      SizeTPointer byte_size = new SizeTPointer(1);
       FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsDelete(server_options),
-          "deleting server options");
-
-      TRITONSERVER_ServerDeleter server = new TRITONSERVER_ServerDeleter(server_ptr);
-
-      // Wait until the server is both live and ready.
-      int health_iters = 0;
-      while (true) {
-        boolean[] live = {false}, ready = {false};
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerIsLive(server, live),
-            "unable to get server liveness");
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerIsReady(server, ready),
-            "unable to get server readiness");
-        System.out.println("Server Health: live " + live[0] + ", ready " + ready[0]);
-        if (live[0] && ready[0]) {
-          break;
-        }
+          TRITONSERVER_MessageSerializeToJson(server_metadata_message, buffer, byte_size),
+          "unable to serialize server metadata message");
+
+      System.out.println("Server Status:");
+      System.out.println(buffer.limit(byte_size.get()).getString());
+
+      FAIL_IF_ERR(TRITONSERVER_MessageDelete(server_metadata_message), "deleting status metadata");
+    }
+
+    String model_name = "simple";
 
+    // Wait for the model to become available.
+    boolean[] is_torch_model = {false};
+    boolean[] is_int = {true};
+    boolean[] is_ready = {false};
+    health_iters = 0;
+    while (!is_ready[0]) {
+      FAIL_IF_ERR(
+          TRITONSERVER_ServerModelIsReady(server, model_name, 1, is_ready),
+          "unable to get model readiness");
+      if (!is_ready[0]) {
         if (++health_iters >= 10) {
-          FAIL("failed to find healthy inference server");
+          FAIL("model failed to be ready in 10 iterations");
         }
-
         Thread.sleep(500);
+        continue;
       }
 
-      // Print status of the server.
-      {
-        TRITONSERVER_Message server_metadata_message = new TRITONSERVER_Message(null);
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerMetadata(server, server_metadata_message),
-            "unable to get server metadata message");
-        BytePointer buffer = new BytePointer((Pointer)null);
-        SizeTPointer byte_size = new SizeTPointer(1);
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageSerializeToJson(
-                server_metadata_message, buffer, byte_size),
-            "unable to serialize server metadata message");
-
-        System.out.println("Server Status:");
-        System.out.println(buffer.limit(byte_size.get()).getString());
-
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageDelete(server_metadata_message),
-            "deleting status metadata");
-      }
-
-      String model_name = "simple";
-
-      // Wait for the model to become available.
-      boolean[] is_torch_model = {false};
-      boolean[] is_int = {true};
-      boolean[] is_ready = {false};
-      health_iters = 0;
-      while (!is_ready[0]) {
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerModelIsReady(
-                server, model_name, 1, is_ready),
-            "unable to get model readiness");
-        if (!is_ready[0]) {
-          if (++health_iters >= 10) {
-            FAIL("model failed to be ready in 10 iterations");
-          }
-          Thread.sleep(500);
-          continue;
-        }
+      TRITONSERVER_Message model_metadata_message = new TRITONSERVER_Message(null);
+      FAIL_IF_ERR(
+          TRITONSERVER_ServerModelMetadata(server, model_name, 1, model_metadata_message),
+          "unable to get model metadata message");
+      BytePointer buffer = new BytePointer((Pointer) null);
+      SizeTPointer byte_size = new SizeTPointer(1);
+      FAIL_IF_ERR(
+          TRITONSERVER_MessageSerializeToJson(model_metadata_message, buffer, byte_size),
+          "unable to serialize model status protobuf");
 
-        TRITONSERVER_Message model_metadata_message = new TRITONSERVER_Message(null);
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerModelMetadata(
-                server, model_name, 1, model_metadata_message),
-            "unable to get model metadata message");
-        BytePointer buffer = new BytePointer((Pointer)null);
-        SizeTPointer byte_size = new SizeTPointer(1);
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageSerializeToJson(
-                model_metadata_message, buffer, byte_size),
-            "unable to serialize model status protobuf");
-
-        JsonParser parser = new JsonParser();
-        JsonObject model_metadata = null;
-        try {
-          model_metadata = parser.parse(buffer.limit(byte_size.get()).getString()).getAsJsonObject();
-        } catch (Exception e) {
-          FAIL("error: failed to parse model metadata from JSON: " + e);
-        }
+      JsonParser parser = new JsonParser();
+      JsonObject model_metadata = null;
+      try {
+        model_metadata = parser.parse(buffer.limit(byte_size.get()).getString()).getAsJsonObject();
+      }
+      catch (Exception e) {
+        FAIL("error: failed to parse model metadata from JSON: " + e);
+      }
 
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageDelete(model_metadata_message),
-            "deleting status protobuf");
+      FAIL_IF_ERR(TRITONSERVER_MessageDelete(model_metadata_message), "deleting status protobuf");
 
-        if (!model_metadata.get("name").getAsString().equals(model_name)) {
-          FAIL("unable to find metadata for model");
-        }
+      if (!model_metadata.get("name").getAsString().equals(model_name)) {
+        FAIL("unable to find metadata for model");
+      }
 
-        boolean found_version = false;
-        if (model_metadata.has("versions")) {
-          for (JsonElement version : model_metadata.get("versions").getAsJsonArray()) {
-            if (version.getAsString().equals("1")) {
-              found_version = true;
-              break;
-            }
+      boolean found_version = false;
+      if (model_metadata.has("versions")) {
+        for (JsonElement version : model_metadata.get("versions").getAsJsonArray()) {
+          if (version.getAsString().equals("1")) {
+            found_version = true;
+            break;
           }
         }
-        if (!found_version) {
-          FAIL("unable to find version 1 status for model");
-        }
-
-        FAIL_IF_ERR(
-            ParseModelMetadata(model_metadata, is_int, is_torch_model),
-            "parsing model metadata");
+      }
+      if (!found_version) {
+        FAIL("unable to find version 1 status for model");
       }
 
-      Runnable runnable =
-        () -> {
-          boolean passed = ValidateMemoryGrowth(max_growth_allowed, max_mem_allowed);
-          
-          // Sleep to give the garbage collector time to free the server.
-          // This avoids race conditions between Triton bindings' printing and
-          // Java's native printing below.
-          try {
-            Thread.sleep(5000);
-          } catch (InterruptedException e){
-            System.out.println("Sleep interrupted: " + e.toString());
-          }
+      FAIL_IF_ERR(
+          ParseModelMetadata(model_metadata, is_int, is_torch_model), "parsing model metadata");
+    }
 
-          if(passed){
-            System.out.println("Memory growth test passed");
-          } else {
-            System.out.println("Memory growth test FAILED");
-          }
-        };
-      Thread memory_thread = new Thread(runnable);
-      memory_thread.start();
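+    // Check memory growth on a background thread while the main thread runs the inference loop.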
+    Runnable runnable = () ->
+    {
+      boolean passed = ValidateMemoryGrowth(max_growth_allowed, max_mem_allowed);
 
-      for(int i = 0; i < num_iterations; i++){
-        try (PointerScope scope = new PointerScope()) {
-          RunInference(server, model_name, is_int, is_torch_model, check_accuracy);
-        }
+      // Sleep to give the garbage collector time to free the server.
+      // This avoids race conditions between Triton bindings' printing and
+      // Java's native printing below.
+      try {
+        Thread.sleep(5000);
+      }
+      catch (InterruptedException e) {
+        System.out.println("Sleep interrupted: " + e.toString());
       }
-      done = true;
-      memory_thread.join();
 
-      System.exit(0);
+      if (passed) {
+        System.out.println("Memory growth test passed");
+      } else {
+        System.out.println("Memory growth test FAILED");
+      }
+    };
+    Thread memory_thread = new Thread(runnable);
+    memory_thread.start();
+
+    for (int i = 0; i < num_iterations; i++) {
+      try (PointerScope scope = new PointerScope()) {
+        RunInference(server, model_name, is_int, is_torch_model, check_accuracy);
+      }
     }
+    done = true;
+    memory_thread.join();
+
+    System.exit(0);
+  }
 }
diff --git a/qa/L0_java_memory_growth/test.sh b/qa/L0_java_memory_growth/test.sh
index 610315d34e..1011ec0633 100755
--- a/qa/L0_java_memory_growth/test.sh
+++ b/qa/L0_java_memory_growth/test.sh
@@ -76,7 +76,7 @@ fi
 LOG_IDX=$((LOG_IDX+1))
 CLIENT_LOG="./client_$LOG_IDX.log"
 
-# Longer-running memory growth test 
+# Longer-running memory growth test
 ITERS=1000000
 MAX_MEM_GROWTH_MB=10
 if [ "$TRITON_PERF_LONG" == 1 ]; then
diff --git a/qa/L0_java_resnet/ResnetTest.java b/qa/L0_java_resnet/ResnetTest.java
index 9bf46b22f7..e9f353cf62 100644
--- a/qa/L0_java_resnet/ResnetTest.java
+++ b/qa/L0_java_resnet/ResnetTest.java
@@ -24,593 +24,563 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import static org.bytedeco.tritonserver.global.tritonserver.*;
+
+import com.google.gson.*;
 import java.io.*;
 import java.util.*;
 import java.util.concurrent.*;
-import com.google.gson.*;
 import org.bytedeco.javacpp.*;
 import org.bytedeco.tritonserver.tritonserver.*;
-import static org.bytedeco.tritonserver.global.tritonserver.*;
 
 public class ResnetTest {
-    // Maximum allowed difference from expected model outputs
-    private static final float ALLOWED_DELTA = .001f;
-    private static final String[] MODELS = {
-      "resnet50_fp32_libtorch",
-      "resnet50_fp32_onnx",
+  // Maximum allowed difference from expected model outputs
+  private static final float ALLOWED_DELTA = .001f;
+  private static final String[] MODELS = {
+      "resnet50_fp32_libtorch", "resnet50_fp32_onnx",
       // TODO: fix build to support GPU only resnet50v1.5_fp16_savedmodel
       //"resnet50v1.5_fp16_savedmodel",
-      };
-    private static final double TRITON_MIN_COMPUTE_CAPABILITY = 6.0;
-    private enum Backend {
-      NONE,
-      ONNX,
-      TF,
-      TORCH,
+  };
+  private static final double TRITON_MIN_COMPUTE_CAPABILITY = 6.0;
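+  // Framework backend inferred from the model name.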
+  private enum Backend {
+    NONE,
+    ONNX,
+    TF,
+    TORCH,
+  }
+
+  static void FAIL(String MSG)
+  {
+    System.err.println("failure: " + MSG);
+    System.exit(1);
+  }
+
+  static void FAIL_IF_ERR(TRITONSERVER_Error err__, String MSG)
+  {
+    if (err__ != null) {
+      System.err.println(
+          "error: " + MSG + ":" + TRITONSERVER_ErrorCodeString(err__) + " - "
+          + TRITONSERVER_ErrorMessage(err__));
+      TRITONSERVER_ErrorDelete(err__);
+      System.exit(1);
     }
+  }
+
+  static boolean enforce_memory_type = false;
+  static int requested_memory_type;
 
-    static void FAIL(String MSG) {
-        System.err.println("failure: " + MSG);
-        System.exit(1);
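+  // Wrapper that registers a deallocator so the native server handle is released
+  // automatically when this object is garbage collected.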
+  static class TRITONSERVER_ServerDeleter extends TRITONSERVER_Server {
+    public TRITONSERVER_ServerDeleter(TRITONSERVER_Server p)
+    {
+      super(p);
+      deallocator(new DeleteDeallocator(this));
+    }
+    protected static class DeleteDeallocator extends TRITONSERVER_Server implements Deallocator {
+      DeleteDeallocator(Pointer p) { super(p); }
+      @Override public void deallocate() { TRITONSERVER_ServerDelete(this); }
     }
+  }
 
-    static void FAIL_IF_ERR(TRITONSERVER_Error err__, String MSG) {
-        if (err__ != null) {
-            System.err.println("error: " + MSG + ":"
-                             + TRITONSERVER_ErrorCodeString(err__) + " - "
-                             + TRITONSERVER_ErrorMessage(err__));
-            TRITONSERVER_ErrorDelete(err__);
-            System.exit(1);
-        }
+  static void Usage(String msg)
+  {
+    if (msg != null) {
+      System.err.println(msg);
     }
 
-    static boolean enforce_memory_type = false;
-    static int requested_memory_type;
+    System.err.println("Usage: java " + ResnetTest.class.getSimpleName() + " [options]");
+    System.err.println(
+        "\t-m <\"system\"|\"pinned\"|gpu>"
+        + " Enforce the memory type for input and output tensors."
+        + " If not specified, inputs will be in system memory and outputs"
+        + " will be based on the model's preferred type.");
+    System.err.println("\t-v Enable verbose logging");
+    System.err.println("\t-r [model repository absolute path]");
+
+    System.exit(1);
+  }
+
+  static class ResponseAlloc extends TRITONSERVER_ResponseAllocatorAllocFn_t {
+    @Override
+    public TRITONSERVER_Error call(
+        TRITONSERVER_ResponseAllocator allocator, String tensor_name, long byte_size,
+        int preferred_memory_type, long preferred_memory_type_id, Pointer userp,
+        PointerPointer buffer, PointerPointer buffer_userp, IntPointer actual_memory_type,
+        LongPointer actual_memory_type_id)
+    {
+      // Initially attempt to make the actual memory type and id that we
+      // allocate be the same as preferred memory type
+      actual_memory_type.put(0, preferred_memory_type);
+      actual_memory_type_id.put(0, preferred_memory_type_id);
+
+      // If 'byte_size' is zero just return 'buffer' == nullptr, we don't
+      // need to do any other book-keeping.
+      if (byte_size == 0) {
+        buffer.put(0, null);
+        buffer_userp.put(0, null);
+        System.out.println("allocated " + byte_size + " bytes for result tensor " + tensor_name);
+      } else {
+        Pointer allocated_ptr = new Pointer();
+        if (enforce_memory_type) {
+          actual_memory_type.put(0, requested_memory_type);
+        }
 
-    static class TRITONSERVER_ServerDeleter extends TRITONSERVER_Server {
-        public TRITONSERVER_ServerDeleter(TRITONSERVER_Server p) { super(p); deallocator(new DeleteDeallocator(this)); }
-        protected static class DeleteDeallocator extends TRITONSERVER_Server implements Deallocator {
-            DeleteDeallocator(Pointer p) { super(p); }
-            @Override public void deallocate() { TRITONSERVER_ServerDelete(this); }
+        actual_memory_type.put(0, TRITONSERVER_MEMORY_CPU);
+        allocated_ptr = Pointer.malloc(byte_size);
+
+        // Pass the tensor name with buffer_userp so we can show it when
+        // releasing the buffer.
+        if (!allocated_ptr.isNull()) {
+          buffer.put(0, allocated_ptr);
+          buffer_userp.put(0, Loader.newGlobalRef(tensor_name));
+          System.out.println(
+              "allocated " + byte_size + " bytes in "
+              + TRITONSERVER_MemoryTypeString(actual_memory_type.get()) + " for result tensor "
+              + tensor_name);
         }
+      }
+
+      return null; // Success
     }
+  }
 
-    static void
-    Usage(String msg)
+  static class ResponseRelease extends TRITONSERVER_ResponseAllocatorReleaseFn_t {
+    @Override
+    public TRITONSERVER_Error call(
+        TRITONSERVER_ResponseAllocator allocator, Pointer buffer, Pointer buffer_userp,
+        long byte_size, int memory_type, long memory_type_id)
     {
-      if (msg != null) {
-        System.err.println(msg);
+      String name = null;
+      if (buffer_userp != null) {
+        name = (String) Loader.accessGlobalRef(buffer_userp);
+      } else {
+        name = "";
       }
 
-      System.err.println("Usage: java " + ResnetTest.class.getSimpleName() + " [options]");
-      System.err.println("\t-m <\"system\"|\"pinned\"|gpu>"
-                       + " Enforce the memory type for input and output tensors."
-                       + " If not specified, inputs will be in system memory and outputs"
-                       + " will be based on the model's preferred type.");
-      System.err.println("\t-v Enable verbose logging");
-      System.err.println("\t-r [model repository absolute path]");
+      Pointer.free(buffer);
+      Loader.deleteGlobalRef(buffer_userp);
 
-      System.exit(1);
+      return null; // Success
     }
+  }
 
-    static class ResponseAlloc extends TRITONSERVER_ResponseAllocatorAllocFn_t {
-        @Override public TRITONSERVER_Error call (
-            TRITONSERVER_ResponseAllocator allocator, String tensor_name,
-            long byte_size, int preferred_memory_type,
-            long preferred_memory_type_id, Pointer userp, PointerPointer buffer,
-            PointerPointer buffer_userp, IntPointer actual_memory_type,
-            LongPointer actual_memory_type_id)
-        {
-          // Initially attempt to make the actual memory type and id that we
-          // allocate be the same as preferred memory type
-          actual_memory_type.put(0, preferred_memory_type);
-          actual_memory_type_id.put(0, preferred_memory_type_id);
-
-          // If 'byte_size' is zero just return 'buffer' == nullptr, we don't
-          // need to do any other book-keeping.
-          if (byte_size == 0) {
-            buffer.put(0, null);
-            buffer_userp.put(0, null);
-            System.out.println("allocated " + byte_size + " bytes for result tensor " + tensor_name);
-          } else {
-            Pointer allocated_ptr = new Pointer();
-            if (enforce_memory_type) {
-              actual_memory_type.put(0, requested_memory_type);
-            }
-
-            actual_memory_type.put(0, TRITONSERVER_MEMORY_CPU);
-            allocated_ptr = Pointer.malloc(byte_size);
-
-            // Pass the tensor name with buffer_userp so we can show it when
-            // releasing the buffer.
-            if (!allocated_ptr.isNull()) {
-              buffer.put(0, allocated_ptr);
-              buffer_userp.put(0, Loader.newGlobalRef(tensor_name));
-              System.out.println("allocated " + byte_size + " bytes in "
-                               + TRITONSERVER_MemoryTypeString(actual_memory_type.get())
-                               + " for result tensor " + tensor_name);
-            }
-          }
-
-          return null;  // Success
-        }
+  static class InferRequestComplete extends TRITONSERVER_InferenceRequestReleaseFn_t {
+    @Override public void call(TRITONSERVER_InferenceRequest request, int flags, Pointer userp)
+    {
+      // We reuse the request so we don't delete it here.
     }
+  }
 
-    static class ResponseRelease extends TRITONSERVER_ResponseAllocatorReleaseFn_t {
-        @Override public TRITONSERVER_Error call (
-            TRITONSERVER_ResponseAllocator allocator, Pointer buffer, Pointer buffer_userp,
-            long byte_size, int memory_type, long memory_type_id)
-        {
-          String name = null;
-          if (buffer_userp != null) {
-            name = (String)Loader.accessGlobalRef(buffer_userp);
-          } else {
-            name = "";
-          }
-          
-          Pointer.free(buffer);
-          Loader.deleteGlobalRef(buffer_userp);
-
-          return null;  // Success
-        }
+  static class InferResponseComplete extends TRITONSERVER_InferenceResponseCompleteFn_t {
+    @Override public void call(TRITONSERVER_InferenceResponse response, int flags, Pointer userp)
+    {
+      if (response != null) {
+        // Send 'response' to the future.
+        futures.get(userp).complete(response);
+      }
     }
-
-    static class InferRequestComplete extends TRITONSERVER_InferenceRequestReleaseFn_t {
-        @Override public void call (
-            TRITONSERVER_InferenceRequest request, int flags, Pointer userp)
-        {
-          // We reuse the request so we don't delete it here.
-        }
+  }
+
+  static ConcurrentHashMap<Pointer, CompletableFuture<TRITONSERVER_InferenceResponse>> futures =
+      new ConcurrentHashMap<>();
+  static ResponseAlloc responseAlloc = new ResponseAlloc();
+  static ResponseRelease responseRelease = new ResponseRelease();
+  static InferRequestComplete inferRequestComplete = new InferRequestComplete();
+  static InferResponseComplete inferResponseComplete = new InferResponseComplete();
+
+  static void GenerateInputData(FloatPointer[] input_data)
+  {
+    // Input size is 3 * 224 * 224
+    input_data[0] = new FloatPointer(150528);
+    for (int i = 0; i < 150528; ++i) {
+      input_data[0].put(i, 1);
     }
-
-    static class InferResponseComplete extends TRITONSERVER_InferenceResponseCompleteFn_t {
-        @Override public void call (
-            TRITONSERVER_InferenceResponse response, int flags, Pointer userp)
-        {
-          if (response != null) {
-            // Send 'response' to the future.
-            futures.get(userp).complete(response);
-          }
-        }
-    }
-
-    static ConcurrentHashMap<Pointer, CompletableFuture<TRITONSERVER_InferenceResponse>> futures = new ConcurrentHashMap<>();
-    static ResponseAlloc responseAlloc = new ResponseAlloc();
-    static ResponseRelease responseRelease = new ResponseRelease();
-    static InferRequestComplete inferRequestComplete = new InferRequestComplete();
-    static InferResponseComplete inferResponseComplete = new InferResponseComplete();
-
-    static void
-    GenerateInputData(
-        FloatPointer[] input_data)
-    {
-      // Input size is 3 * 224 * 224
-      input_data[0] = new FloatPointer(150528);
-      for (int i = 0; i < 150528; ++i) {
-        input_data[0].put(i, 1);
+  }
+
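+  // Compare the model output against the expected values; any element that differs
+  // by more than ALLOWED_DELTA is reported as a failure.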
+  static boolean AreValidResults(
+      String model_name, FloatPointer output, FloatPointer expected_output)
+  {
+    int output_length = model_name.contains("tensorflow") ? 1001 : 1000;
+    for (int i = 0; i < output_length; ++i) {
+      float difference = output.get(i) - expected_output.get(i);
+      if (difference > ALLOWED_DELTA) {
+        System.out.println(
+            model_name + "inference failure: unexpected output "
+            + "in " + model_name + ", index " + i);
+
+        System.out.println("Value: " + output.get(i) + ", expected " + expected_output.get(i));
+
+        return false; // Failure
       }
     }
+    return true; // Success
+  }
+
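+  // Validate a single inference response: verify the output name, shape, datatype and
+  // memory type, then compare the values to the expected output file for the backend.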
+  static void Check(
+      String model_name, Backend backend, TRITONSERVER_InferenceResponse response,
+      Pointer input_data, String output, int expected_datatype) throws Exception
+  {
+    HashMap<String, Pointer> output_data = new HashMap<>();
+
+    int[] output_count = {0};
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceResponseOutputCount(response, output_count),
+        "getting number of response outputs");
+    if (output_count[0] != 1) {
+      FAIL("expecting 1 response output, got " + output_count[0]);
+    }
 
-    static boolean
-    AreValidResults(
-        String model_name, FloatPointer output, FloatPointer expected_output)
-    {
-      int output_length = model_name.contains("tensorflow") ? 1001 : 1000;
-      for (int i = 0; i < output_length; ++i) {
-        float difference = output.get(i) - expected_output.get(i);
-        if (difference > ALLOWED_DELTA) {
-          System.out.println(model_name + "inference failure: unexpected output " +
-          "in " + model_name + ", index " + i);
+    for (int idx = 0; idx < output_count[0]; ++idx) {
+      BytePointer cname = new BytePointer((Pointer) null);
+      IntPointer datatype = new IntPointer(1);
+      LongPointer shape = new LongPointer((Pointer) null);
+      LongPointer dim_count = new LongPointer(1);
+      Pointer base = new Pointer();
+      SizeTPointer byte_size = new SizeTPointer(1);
+      IntPointer memory_type = new IntPointer(1);
+      LongPointer memory_type_id = new LongPointer(1);
+      Pointer userp = new Pointer();
 
-          System.out.println("Value: " + output.get(i) + ", expected " +
-          expected_output.get(i));
+      FAIL_IF_ERR(
+          TRITONSERVER_InferenceResponseOutput(
+              response, idx, cname, datatype, shape, dim_count, base, byte_size, memory_type,
+              memory_type_id, userp),
+          "getting output info");
 
-          return false; // Failure
-        }
+      if (cname.isNull()) {
+        FAIL("unable to get output name");
       }
-      return true; // Success
-    }
-
-    static void
-    Check(
-        String model_name, Backend backend,
-        TRITONSERVER_InferenceResponse response,
-        Pointer input_data, String output,
-        int expected_datatype) throws Exception
-    {
-      HashMap<String, Pointer> output_data = new HashMap<>();
 
-      int[] output_count = {0};
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceResponseOutputCount(response, output_count),
-          "getting number of response outputs");
-      if (output_count[0] != 1) {
-        FAIL("expecting 1 response output, got " + output_count[0]);
+      String name = cname.getString();
+      if (!name.equals(output)) {
+        FAIL("unexpected output '" + name + "'");
       }
 
-      for (int idx = 0; idx < output_count[0]; ++idx) {
-        BytePointer cname = new BytePointer((Pointer)null);
-        IntPointer datatype = new IntPointer(1);
-        LongPointer shape = new LongPointer((Pointer)null);
-        LongPointer dim_count = new LongPointer(1);
-        Pointer base = new Pointer();
-        SizeTPointer byte_size = new SizeTPointer(1);
-        IntPointer memory_type = new IntPointer(1);
-        LongPointer memory_type_id = new LongPointer(1);
-        Pointer userp = new Pointer();
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseOutput(
-                response, idx, cname, datatype, shape, dim_count, base,
-                byte_size, memory_type, memory_type_id, userp),
-            "getting output info");
-
-        if (cname.isNull()) {
-          FAIL("unable to get output name");
-        }
+      int output_length = backend == backend.TF ? 1001 : 1000;
 
-        String name = cname.getString();
-        if (!name.equals(output)) {
-          FAIL("unexpected output '" + name + "'");
-        }
+      if ((dim_count.get() != 2) || (shape.get(0) != 1) || shape.get(1) != output_length) {
+        FAIL("unexpected shape for '" + name + "'");
+      }
 
-        int output_length = backend == backend.TF ? 1001: 1000;
+      if (datatype.get() != expected_datatype) {
+        FAIL(
+            "unexpected datatype '" + TRITONSERVER_DataTypeString(datatype.get()) + "' for '" + name
+            + "'");
+      }
 
-        if ((dim_count.get() != 2) || (shape.get(0) != 1)
-        || shape.get(1) != output_length) {
-          FAIL("unexpected shape for '" + name + "'");
-        }
+      if (enforce_memory_type && (memory_type.get() != requested_memory_type)) {
+        FAIL(
+            "unexpected memory type, expected to be allocated in "
+            + TRITONSERVER_MemoryTypeString(requested_memory_type) + ", got "
+            + TRITONSERVER_MemoryTypeString(memory_type.get()) + ", id " + memory_type_id.get()
+            + " for " + name);
+      }
 
-        if (datatype.get() != expected_datatype) {
-          FAIL(
-              "unexpected datatype '" +
-              TRITONSERVER_DataTypeString(datatype.get()) + "' for '" +
-              name + "'");
-        }
+      // We make a copy of the data here... which we could avoid for
+      // performance reasons but ok for this simple example.
+      BytePointer odata = new BytePointer(byte_size.get());
+      output_data.put(name, odata);
+      odata.put(base.limit(byte_size.get()));
+    }
 
-        if (enforce_memory_type && (memory_type.get() != requested_memory_type)) {
-          FAIL(
-              "unexpected memory type, expected to be allocated in " +
-              TRITONSERVER_MemoryTypeString(requested_memory_type) +
-              ", got " + TRITONSERVER_MemoryTypeString(memory_type.get()) +
-              ", id " + memory_type_id.get() + " for " + name);
-        }
+    // Expected output for model
+    String file_name = "expected_output_data/expected_output_";
+    switch (backend) {
+      case ONNX:
+        file_name += "onnx";
+        break;
+      case TF:
+        file_name += "tensorflow";
+        break;
+      case TORCH:
+        file_name += "pytorch";
+        break;
+      default:
+        FAIL("Unsupported model type");
+        break;
+    }
+    file_name += ".txt";
 
-        // We make a copy of the data here... which we could avoid for
-        // performance reasons but ok for this simple example.
-        BytePointer odata = new BytePointer(byte_size.get());
-        output_data.put(name, odata);
-        odata.put(base.limit(byte_size.get()));
-      }
+    int output_length = backend == backend.TF ? 1001 : 1000;
+    FloatPointer expected_output = new FloatPointer(output_length);
 
-      // Expected output for model
-      String file_name = "expected_output_data/expected_output_";
-      switch (backend) {
-        case ONNX:
-          file_name += "onnx";
-          break;
-        case TF:
-          file_name += "tensorflow";
-          break;
-        case TORCH:
-          file_name += "pytorch";
-          break;
-        default:
-          FAIL("Unsupported model type");
-          break;
-      }
-      file_name += ".txt";
-      
-      int output_length = backend == backend.TF ? 1001: 1000;
-      FloatPointer expected_output = new FloatPointer(output_length);
-
-      try (Scanner scanner = new Scanner(new File(file_name))) {
-        for (int i = 0; i < output_length; ++i) {
-          expected_output.put(i, scanner.nextFloat());
-        } 
+    try (Scanner scanner = new Scanner(new File(file_name))) {
+      for (int i = 0; i < output_length; ++i) {
+        expected_output.put(i, scanner.nextFloat());
       }
+    }
 
-      boolean correct_results = AreValidResults(
-          model_name, new FloatPointer(output_data.get(output)),
-          expected_output);
+    boolean correct_results =
+        AreValidResults(model_name, new FloatPointer(output_data.get(output)), expected_output);
 
-      if(correct_results){
-        System.out.println(backend.name() + " test PASSED");
-      } else {
-        System.out.println(backend.name() + " test FAILED");
-      }
+    if (correct_results) {
+      System.out.println(backend.name() + " test PASSED");
+    } else {
+      System.out.println(backend.name() + " test FAILED");
+    }
+  }
+
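+  // Run one inference end to end for 'model_name': wait for model readiness, build the
+  // request, execute it asynchronously, and validate the response.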
+  static void PerformInference(TRITONSERVER_ServerDeleter server, String model_name)
+      throws Exception
+  {
+    // Get type of model
+    Backend backend = Backend.NONE;
+    if (model_name.contains("onnx")) {
+      backend = Backend.ONNX;
+    } else if (model_name.contains("savedmodel")) {
+      backend = Backend.TF;
+    } else if (model_name.contains("torch")) {
+      backend = Backend.TORCH;
+    } else {
+      FAIL(
+          "Supported model types (Onnx, TensorFlow, Torch) "
+          + "cannot be inferred from model name " + model_name);
     }
 
-    static void
-    PerformInference(
-      TRITONSERVER_ServerDeleter server, String model_name) throws Exception
-    {
-      // Get type of model
-      Backend backend = Backend.NONE;
-      if(model_name.contains("onnx")) {
-        backend = Backend.ONNX;
-      } else if (model_name.contains("savedmodel")) {
-        backend = Backend.TF;
-      } else if (model_name.contains("torch")) {
-        backend = Backend.TORCH;
-      } else {
-        FAIL("Supported model types (Onnx, TensorFlow, Torch) " +
-        "cannot be inferred from model name " + model_name);
-      }
-
-      // Wait for the model to become available.
-      boolean[] is_ready = {false};
-      int health_iters = 0;
-      while (!is_ready[0]) {
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerModelIsReady(
-                server, model_name, 1, is_ready),
-            "unable to get model readiness");
-        if (!is_ready[0]) {
-          if (++health_iters >= 10) {
-            FAIL(model_name + " model failed to be ready in 10 iterations");
-          }
-          Thread.sleep(500);
-          continue;
+    // Wait for the model to become available.
+    boolean[] is_ready = {false};
+    int health_iters = 0;
+    while (!is_ready[0]) {
+      FAIL_IF_ERR(
+          TRITONSERVER_ServerModelIsReady(server, model_name, 1, is_ready),
+          "unable to get model readiness");
+      if (!is_ready[0]) {
+        if (++health_iters >= 10) {
+          FAIL(model_name + " model failed to be ready in 10 iterations");
         }
+        Thread.sleep(500);
+        continue;
       }
+    }
 
-      // Create the allocator that will be used to allocate buffers for
-      // the result tensors.
-      TRITONSERVER_ResponseAllocator allocator = new TRITONSERVER_ResponseAllocator(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_ResponseAllocatorNew(
-              allocator, responseAlloc, responseRelease, null /* start_fn */),
-          "creating response allocator");
+    // Create the allocator that will be used to allocate buffers for
+    // the result tensors.
+    TRITONSERVER_ResponseAllocator allocator = new TRITONSERVER_ResponseAllocator(null);
+    FAIL_IF_ERR(
+        TRITONSERVER_ResponseAllocatorNew(
+            allocator, responseAlloc, responseRelease, null /* start_fn */),
+        "creating response allocator");
+
+    // Inference
+    TRITONSERVER_InferenceRequest irequest = new TRITONSERVER_InferenceRequest(null);
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestNew(irequest, server, model_name, -1 /* model_version */),
+        "creating inference request");
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestSetId(irequest, "my_request_id"),
+        "setting ID for the request");
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestSetReleaseCallback(
+            irequest, inferRequestComplete, null /* request_release_userp */),
+        "setting request release callback");
+
+
+    // Model inputs
+    String input = "";
+    String output = "";
+    long[] input_shape = {1, 224, 224, 3};
+
+    switch (backend) {
+      case ONNX:
+        input = "import/input:0";
+        output = "import/resnet_v1_50/predictions/Softmax:0";
+        break;
+      case TF:
+        input = "input";
+        output = "probabilities";
+        break;
+      case TORCH:
+        input = "INPUT__0";
+        input_shape[1] = 3;
+        input_shape[3] = 224;
+        output = "OUTPUT__0";
+        break;
+      default:
+        FAIL("Unsupported model type");
+        break;
+    }
 
-      // Inference
-      TRITONSERVER_InferenceRequest irequest = new TRITONSERVER_InferenceRequest(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestNew(
-              irequest, server, model_name, -1 /* model_version */),
-          "creating inference request");
+    int datatype = TRITONSERVER_TYPE_FP32;
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestSetId(irequest, "my_request_id"),
-          "setting ID for the request");
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAddInput(
+            irequest, input, datatype, input_shape, input_shape.length),
+        "setting input 0 meta-data for the request");
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestSetReleaseCallback(
-              irequest, inferRequestComplete, null /* request_release_userp */),
-          "setting request release callback");
-
-      
-      // Model inputs
-      String input = "";
-      String output = "";
-      long[] input_shape = {1, 224, 224, 3};
-
-      switch (backend) {
-        case ONNX:
-          input = "import/input:0";
-          output = "import/resnet_v1_50/predictions/Softmax:0";
-          break;
-        case TF:
-          input = "input";
-          output = "probabilities";
-          break;
-        case TORCH:
-          input = "INPUT__0";
-          input_shape[1] = 3;
-          input_shape[3] = 224;
-          output = "OUTPUT__0";
-          break;
-        default:
-          FAIL("Unsupported model type");
-          break;
-      }
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output),
+        "requesting output 0 for the request");
 
-      int datatype = TRITONSERVER_TYPE_FP32;
+    // Create the data for the input tensor; GenerateInputData fills it with ones.
+    BytePointer input_data;
+    FloatPointer[] p0 = {null};
+    GenerateInputData(p0);
+    input_data = p0[0].getPointer(BytePointer.class);
+    long input_size = input_data.limit();
+    Pointer input_base = input_data;
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAddInput(
-              irequest, input, datatype, input_shape, input_shape.length),
-          "setting input 0 meta-data for the request");
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAppendInputData(
+            irequest, input, input_base, input_size, requested_memory_type, 0 /* memory_type_id */),
+        "assigning INPUT data");
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output),
-          "requesting output 0 for the request");
-
-      // Create the data for the two input tensors. Initialize the first
-      // to unique values and the second to all ones.
-      BytePointer input_data;
-      FloatPointer[] p0 = {null};
-      GenerateInputData(p0);
-      input_data = p0[0].getPointer(BytePointer.class);
-      long input_size = input_data.limit();
-      Pointer input_base = input_data;
+    // Perform inference...
+    {
+      CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
+      futures.put(irequest, completed);
 
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAppendInputData(
-              irequest, input, input_base, input_size, requested_memory_type,
-              0 /* memory_type_id */),
-          "assigning INPUT data");
-
-      // Perform inference...
-      {
-        CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
-        futures.put(irequest, completed);
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceRequestSetResponseCallback(
-                irequest, allocator, null /* response_allocator_userp */,
-                inferResponseComplete, irequest),
-            "setting response callback");
-
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerInferAsync(
-                server, irequest, null /* trace */),
-            "running inference");
-
-        // Wait for the inference to complete.
-        TRITONSERVER_InferenceResponse completed_response = completed.get();
-        futures.remove(irequest);
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseError(completed_response),
-            "response status");
-
-        Check(
-            model_name, backend, completed_response, input_data, output, datatype);
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseDelete(completed_response),
-            "deleting inference response");
-      }
+          TRITONSERVER_InferenceRequestSetResponseCallback(
+              irequest, allocator, null /* response_allocator_userp */, inferResponseComplete,
+              irequest),
+          "setting response callback");
 
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestDelete(irequest),
-          "deleting inference request");
+          TRITONSERVER_ServerInferAsync(server, irequest, null /* trace */), "running inference");
+
+      // Wait for the inference to complete.
+      TRITONSERVER_InferenceResponse completed_response = completed.get();
+      futures.remove(irequest);
+
+      FAIL_IF_ERR(TRITONSERVER_InferenceResponseError(completed_response), "response status");
+
+      Check(model_name, backend, completed_response, input_data, output, datatype);
 
       FAIL_IF_ERR(
-          TRITONSERVER_ResponseAllocatorDelete(allocator),
-          "deleting response allocator");
+          TRITONSERVER_InferenceResponseDelete(completed_response), "deleting inference response");
     }
-    
-    public static void
-    main(String[] args) throws Exception
-    {
-      String model_repository_path = null;
-      int verbose_level = 0;
-
-      // Parse commandline...
-      for (int i = 0; i < args.length; i++) {
-        switch (args[i]) {
-          case "-m": {
-            enforce_memory_type = true;
-            i++;
-            if (args[i].equals("system")) {
-              requested_memory_type = TRITONSERVER_MEMORY_CPU;
-            } else if (args[i].equals("pinned")) {
-              requested_memory_type = TRITONSERVER_MEMORY_CPU_PINNED;
-            } else if (args[i].equals("gpu")) {
-              requested_memory_type = TRITONSERVER_MEMORY_GPU;
-            } else {
-              Usage(
-                  "-m must be used to specify one of the following types:" +
-                  " <\"system\"|\"pinned\"|gpu>");
-            }
-            break;
+
+    FAIL_IF_ERR(TRITONSERVER_InferenceRequestDelete(irequest), "deleting inference request");
+
+    FAIL_IF_ERR(TRITONSERVER_ResponseAllocatorDelete(allocator), "deleting response allocator");
+  }
+
+  public static void main(String[] args) throws Exception
+  {
+    String model_repository_path = null;
+    int verbose_level = 0;
+
+    // Parse commandline...
+    for (int i = 0; i < args.length; i++) {
+      switch (args[i]) {
+        case "-m": {
+          enforce_memory_type = true;
+          i++;
+          if (args[i].equals("system")) {
+            requested_memory_type = TRITONSERVER_MEMORY_CPU;
+          } else if (args[i].equals("pinned")) {
+            requested_memory_type = TRITONSERVER_MEMORY_CPU_PINNED;
+          } else if (args[i].equals("gpu")) {
+            requested_memory_type = TRITONSERVER_MEMORY_GPU;
+          } else {
+            Usage(
+                "-m must be used to specify one of the following types:"
+                + " <\"system\"|\"pinned\"|gpu>");
           }
-          case "-r":
-            model_repository_path = args[++i];
-            break;
-          case "-v":
-            verbose_level = 1;
-            break;
-          case "-?":
-            Usage(null);
-            break;
+          break;
         }
+        case "-r":
+          model_repository_path = args[++i];
+          break;
+        case "-v":
+          verbose_level = 1;
+          break;
+        case "-?":
+          Usage(null);
+          break;
       }
+    }
 
-      if (model_repository_path == null) {
-        Usage("-r must be used to specify model repository path");
-      }
-      if (enforce_memory_type && requested_memory_type != TRITONSERVER_MEMORY_CPU) {
-        Usage("-m can only be set to \"system\" without enabling GPU");
+    if (model_repository_path == null) {
+      Usage("-r must be used to specify model repository path");
+    }
+    if (enforce_memory_type && requested_memory_type != TRITONSERVER_MEMORY_CPU) {
+      Usage("-m can only be set to \"system\" without enabling GPU");
+    }
+
+    // Check API version.
+    int[] api_version_major = {0}, api_version_minor = {0};
+    FAIL_IF_ERR(
+        TRITONSERVER_ApiVersion(api_version_major, api_version_minor),
+        "getting Triton API version");
+    if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major[0])
+        || (TRITONSERVER_API_VERSION_MINOR > api_version_minor[0])) {
+      FAIL("triton server API version mismatch");
+    }
+
+    // Create the server...
+    TRITONSERVER_ServerOptions server_options = new TRITONSERVER_ServerOptions(null);
+    FAIL_IF_ERR(TRITONSERVER_ServerOptionsNew(server_options), "creating server options");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetModelRepositoryPath(server_options, model_repository_path),
+        "setting model repository path");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetLogVerbose(server_options, verbose_level),
+        "setting verbose logging level");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetBackendDirectory(server_options, "/opt/tritonserver/backends"),
+        "setting backend directory");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
+            server_options, "/opt/tritonserver/repoagents"),
+        "setting repository agent directory");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetStrictModelConfig(server_options, true),
+        "setting strict model configuration");
+    double min_compute_capability = TRITON_MIN_COMPUTE_CAPABILITY;
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability(
+            server_options, min_compute_capability),
+        "setting minimum supported CUDA compute capability");
+
+    TRITONSERVER_Server server_ptr = new TRITONSERVER_Server(null);
+    FAIL_IF_ERR(TRITONSERVER_ServerNew(server_ptr, server_options), "creating server");
+    FAIL_IF_ERR(TRITONSERVER_ServerOptionsDelete(server_options), "deleting server options");
+
+    TRITONSERVER_ServerDeleter server = new TRITONSERVER_ServerDeleter(server_ptr);
+
+    // Wait until the server is both live and ready.
+    int health_iters = 0;
+    while (true) {
+      boolean[] live = {false}, ready = {false};
+      FAIL_IF_ERR(TRITONSERVER_ServerIsLive(server, live), "unable to get server liveness");
+      FAIL_IF_ERR(TRITONSERVER_ServerIsReady(server, ready), "unable to get server readiness");
+      System.out.println("Server Health: live " + live[0] + ", ready " + ready[0]);
+      if (live[0] && ready[0]) {
+        break;
       }
 
-      // Check API version.
-      int[] api_version_major = {0}, api_version_minor = {0};
-      FAIL_IF_ERR(
-          TRITONSERVER_ApiVersion(api_version_major, api_version_minor),
-          "getting Triton API version");
-      if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major[0]) ||
-          (TRITONSERVER_API_VERSION_MINOR > api_version_minor[0])) {
-        FAIL("triton server API version mismatch");
+      if (++health_iters >= 10) {
+        FAIL("failed to find healthy inference server");
       }
 
-      // Create the server...
-      TRITONSERVER_ServerOptions server_options = new TRITONSERVER_ServerOptions(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsNew(server_options),
-          "creating server options");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetModelRepositoryPath(
-              server_options, model_repository_path),
-          "setting model repository path");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetLogVerbose(server_options, verbose_level),
-          "setting verbose logging level");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetBackendDirectory(
-              server_options, "/opt/tritonserver/backends"),
-          "setting backend directory");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
-              server_options, "/opt/tritonserver/repoagents"),
-          "setting repository agent directory");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetStrictModelConfig(server_options, true),
-          "setting strict model configuration");
-      double min_compute_capability = TRITON_MIN_COMPUTE_CAPABILITY;
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability(
-              server_options, min_compute_capability),
-          "setting minimum supported CUDA compute capability");
+      Thread.sleep(500);
+    }
 
-      TRITONSERVER_Server server_ptr = new TRITONSERVER_Server(null);
+    // Print status of the server.
+    {
+      TRITONSERVER_Message server_metadata_message = new TRITONSERVER_Message(null);
       FAIL_IF_ERR(
-          TRITONSERVER_ServerNew(server_ptr, server_options), "creating server");
+          TRITONSERVER_ServerMetadata(server, server_metadata_message),
+          "unable to get server metadata message");
+      BytePointer buffer = new BytePointer((Pointer) null);
+      SizeTPointer byte_size = new SizeTPointer(1);
       FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsDelete(server_options),
-          "deleting server options");
-
-      TRITONSERVER_ServerDeleter server = new TRITONSERVER_ServerDeleter(server_ptr);
-
-      // Wait until the server is both live and ready.
-      int health_iters = 0;
-      while (true) {
-        boolean[] live = {false}, ready = {false};
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerIsLive(server, live),
-            "unable to get server liveness");
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerIsReady(server, ready),
-            "unable to get server readiness");
-        System.out.println("Server Health: live " + live[0] + ", ready " + ready[0]);
-        if (live[0] && ready[0]) {
-          break;
-        }
-
-        if (++health_iters >= 10) {
-          FAIL("failed to find healthy inference server");
-        }
-
-        Thread.sleep(500);
-      }
+          TRITONSERVER_MessageSerializeToJson(server_metadata_message, buffer, byte_size),
+          "unable to serialize server metadata message");
 
-      // Print status of the server.
-      {
-        TRITONSERVER_Message server_metadata_message = new TRITONSERVER_Message(null);
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerMetadata(server, server_metadata_message),
-            "unable to get server metadata message");
-        BytePointer buffer = new BytePointer((Pointer)null);
-        SizeTPointer byte_size = new SizeTPointer(1);
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageSerializeToJson(
-                server_metadata_message, buffer, byte_size),
-            "unable to serialize server metadata message");
-
-        System.out.println("Server Status:");
-        System.out.println(buffer.limit(byte_size.get()).getString());
-
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageDelete(server_metadata_message),
-            "deleting status metadata");
-      }
+      System.out.println("Server Status:");
+      System.out.println(buffer.limit(byte_size.get()).getString());
 
-      for(String model : MODELS) {
-        PerformInference(server, model);
-      }
+      FAIL_IF_ERR(TRITONSERVER_MessageDelete(server_metadata_message), "deleting status metadata");
+    }
 
-      System.exit(0);
+    for (String model : MODELS) {
+      PerformInference(server, model);
     }
+
+    System.exit(0);
+  }
 }
diff --git a/qa/L0_java_sequence_batcher/SequenceTest.java b/qa/L0_java_sequence_batcher/SequenceTest.java
index 3fdc5d63c1..e74214f695 100644
--- a/qa/L0_java_sequence_batcher/SequenceTest.java
+++ b/qa/L0_java_sequence_batcher/SequenceTest.java
@@ -24,615 +24,576 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import static org.bytedeco.tritonserver.global.tritonserver.*;
+
+import com.google.gson.*;
 import java.io.*;
 import java.util.*;
 import java.util.concurrent.*;
-import com.google.gson.*;
 import org.bytedeco.javacpp.*;
 import org.bytedeco.tritonserver.tritonserver.*;
-import static org.bytedeco.tritonserver.global.tritonserver.*;
 
 public class SequenceTest {
-
-    // Boilerplate code for setting up Triton
-    static void FAIL(String MSG) {
-        System.err.println("Failure: " + MSG);
-        System.exit(1);
-    }
-
-    static void FAIL_IF_ERR(TRITONSERVER_Error err__, String MSG) {
-        if (err__ != null) {
-            System.err.println("error: " + MSG + ":"
-                             + TRITONSERVER_ErrorCodeString(err__) + " - "
-                             + TRITONSERVER_ErrorMessage(err__));
-            TRITONSERVER_ErrorDelete(err__);
-            System.exit(1);
-        }
+  // Boilerplate code for setting up Triton
+  static void FAIL(String MSG)
+  {
+    System.err.println("Failure: " + MSG);
+    System.exit(1);
+  }
+
+  static void FAIL_IF_ERR(TRITONSERVER_Error err__, String MSG)
+  {
+    if (err__ != null) {
+      System.err.println(
+          "error: " + MSG + ":" + TRITONSERVER_ErrorCodeString(err__) + " - "
+          + TRITONSERVER_ErrorMessage(err__));
+      TRITONSERVER_ErrorDelete(err__);
+      System.exit(1);
     }
+  }
 
-    static int requested_memory_type = TRITONSERVER_MEMORY_CPU;
-
-    static class TRITONSERVER_ServerDeleter extends TRITONSERVER_Server {
-        public TRITONSERVER_ServerDeleter(TRITONSERVER_Server p) { super(p); deallocator(new DeleteDeallocator(this)); }
-        protected static class DeleteDeallocator extends TRITONSERVER_Server implements Deallocator {
-            DeleteDeallocator(Pointer p) { super(p); }
-            @Override public void deallocate() { TRITONSERVER_ServerDelete(this); }
-        }
-    }
+  static int requested_memory_type = TRITONSERVER_MEMORY_CPU;
 
-    static void
-    Usage(String msg)
+  static class TRITONSERVER_ServerDeleter extends TRITONSERVER_Server {
+    public TRITONSERVER_ServerDeleter(TRITONSERVER_Server p)
     {
-      if (msg != null) {
-        System.err.println(msg);
-      }
-
-      System.err.println("Usage: java " + SequenceTest.class.getSimpleName() + " [options]");
-      System.err.println("\t-m [model name]");
-      System.err.println("\t-v Enable verbose logging");
-      System.err.println("\t-r [model repository absolute path]");
-
-      System.exit(1);
+      super(p);
+      deallocator(new DeleteDeallocator(this));
     }
-
-    static class ResponseAlloc extends TRITONSERVER_ResponseAllocatorAllocFn_t {
-        @Override public TRITONSERVER_Error call (
-            TRITONSERVER_ResponseAllocator allocator, String tensor_name,
-            long byte_size, int preferred_memory_type,
-            long preferred_memory_type_id, Pointer userp, PointerPointer buffer,
-            PointerPointer buffer_userp, IntPointer actual_memory_type,
-            LongPointer actual_memory_type_id)
-        {
-          // Initially attempt to make the actual memory type and id that we
-          // allocate be the same as preferred memory type
-          actual_memory_type.put(0, preferred_memory_type);
-          actual_memory_type_id.put(0, preferred_memory_type_id);
-
-          // If 'byte_size' is zero just return 'buffer' == nullptr, we don't
-          // need to do any other book-keeping.
-          if (byte_size == 0) {
-            buffer.put(0, null);
-            buffer_userp.put(0, null);
-            System.out.println("allocated " + byte_size + " bytes for result tensor " + tensor_name);
-          } else {
-            Pointer allocated_ptr = new Pointer();
-            actual_memory_type.put(0, requested_memory_type);
-
-            actual_memory_type.put(0, TRITONSERVER_MEMORY_CPU);
-            allocated_ptr = Pointer.malloc(byte_size);
-
-            // Pass the tensor name with buffer_userp so we can show it when
-            // releasing the buffer.
-            if (!allocated_ptr.isNull()) {
-              buffer.put(0, allocated_ptr);
-              buffer_userp.put(0, new BytePointer(tensor_name));
-              System.out.println("allocated " + byte_size + " bytes in "
-                               + TRITONSERVER_MemoryTypeString(actual_memory_type.get())
-                               + " for result tensor " + tensor_name);
-            }
-          }
-
-          return null;  // Success
-        }
+    protected static class DeleteDeallocator extends TRITONSERVER_Server implements Deallocator {
+      DeleteDeallocator(Pointer p) { super(p); }
+      @Override public void deallocate() { TRITONSERVER_ServerDelete(this); }
     }
+  }
 
-    static class ResponseRelease extends TRITONSERVER_ResponseAllocatorReleaseFn_t {
-        @Override public TRITONSERVER_Error call (
-            TRITONSERVER_ResponseAllocator allocator, Pointer buffer, Pointer buffer_userp,
-            long byte_size, int memory_type, long memory_type_id)
-        {
-          BytePointer name = null;
-          if (buffer_userp != null) {
-            name = new BytePointer(buffer_userp);
-          } else {
-            name = new BytePointer("");
-          }
-
-          System.out.println("Releasing buffer " + buffer + " of size " + byte_size
-                           + " in " + TRITONSERVER_MemoryTypeString(memory_type)
-                           + " for result '" + name.getString() + "'");
-          Pointer.free(buffer);
-          name.deallocate();
-
-          return null;  // Success
-        }
+  static void Usage(String msg)
+  {
+    if (msg != null) {
+      System.err.println(msg);
     }
 
-    static class InferRequestComplete extends TRITONSERVER_InferenceRequestReleaseFn_t {
-        @Override public void call (
-            TRITONSERVER_InferenceRequest request, int flags, Pointer userp)
-        {
-          // We reuse the request so we don't delete it here.
+    System.err.println("Usage: java " + SequenceTest.class.getSimpleName() + " [options]");
+    System.err.println("\t-m [model name]");
+    System.err.println("\t-v Enable verbose logging");
+    System.err.println("\t-r [model repository absolute path]");
+
+    System.exit(1);
+  }
+
+  static class ResponseAlloc extends TRITONSERVER_ResponseAllocatorAllocFn_t {
+    @Override
+    public TRITONSERVER_Error call(
+        TRITONSERVER_ResponseAllocator allocator, String tensor_name, long byte_size,
+        int preferred_memory_type, long preferred_memory_type_id, Pointer userp,
+        PointerPointer buffer, PointerPointer buffer_userp, IntPointer actual_memory_type,
+        LongPointer actual_memory_type_id)
+    {
+      // Initially attempt to make the actual memory type and id that we
+      // allocate be the same as preferred memory type
+      actual_memory_type.put(0, preferred_memory_type);
+      actual_memory_type_id.put(0, preferred_memory_type_id);
+
+      // If 'byte_size' is zero just return 'buffer' == nullptr, we don't
+      // need to do any other book-keeping.
+      if (byte_size == 0) {
+        buffer.put(0, null);
+        buffer_userp.put(0, null);
+        System.out.println("allocated " + byte_size + " bytes for result tensor " + tensor_name);
+      } else {
+        Pointer allocated_ptr = new Pointer();
+        actual_memory_type.put(0, requested_memory_type);
+
+        actual_memory_type.put(0, TRITONSERVER_MEMORY_CPU);
+        allocated_ptr = Pointer.malloc(byte_size);
+
+        // Pass the tensor name with buffer_userp so we can show it when
+        // releasing the buffer.
+        if (!allocated_ptr.isNull()) {
+          buffer.put(0, allocated_ptr);
+          buffer_userp.put(0, new BytePointer(tensor_name));
+          System.out.println(
+              "allocated " + byte_size + " bytes in "
+              + TRITONSERVER_MemoryTypeString(actual_memory_type.get()) + " for result tensor "
+              + tensor_name);
         }
-    }
+      }
 
-    static class InferResponseComplete extends TRITONSERVER_InferenceResponseCompleteFn_t {
-        @Override public void call (
-            TRITONSERVER_InferenceResponse response, int flags, Pointer userp)
-        {
-          if (response != null) {
-            // Send 'response' to the future.
-            futures.get(userp).complete(response);
-          }
-        }
+      return null; // Success
     }
+  }
 
-    static ConcurrentHashMap<Pointer, CompletableFuture<TRITONSERVER_InferenceResponse>> futures = new ConcurrentHashMap<>();
-    static ResponseAlloc responseAlloc = new ResponseAlloc();
-    static ResponseRelease responseRelease = new ResponseRelease();
-    static InferRequestComplete inferRequestComplete = new InferRequestComplete();
-    static InferResponseComplete inferResponseComplete = new InferResponseComplete();
-
-    static TRITONSERVER_Error
-    ParseModelMetadata(
-        JsonObject model_metadata,
-        boolean[] is_torch_model)
+  static class ResponseRelease extends TRITONSERVER_ResponseAllocatorReleaseFn_t {
+    @Override
+    public TRITONSERVER_Error call(
+        TRITONSERVER_ResponseAllocator allocator, Pointer buffer, Pointer buffer_userp,
+        long byte_size, int memory_type, long memory_type_id)
     {
-      String seen_data_type = null;
-      for (JsonElement input_element : model_metadata.get("inputs").getAsJsonArray()) {
-        JsonObject input = input_element.getAsJsonObject();
-        if (!input.get("datatype").getAsString().equals("INT32")) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_UNSUPPORTED,
-              "sequence qa example only supports model with data type INT32");
-        }
-        if (seen_data_type == null) {
-          seen_data_type = input.get("datatype").getAsString();
-        } else if (!seen_data_type.equals(input.get("datatype").getAsString())) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_INVALID_ARG,
-              "the inputs and outputs of sequence model must have the data type");
-        }
-      }
-      for (JsonElement output_element : model_metadata.get("outputs").getAsJsonArray()) {
-        JsonObject output = output_element.getAsJsonObject();
-        if (!output.get("datatype").getAsString().equals("INT32")) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_UNSUPPORTED,
-              "sequence qa example only supports model with data type INT32");
-        } else if (!seen_data_type.equals(output.get("datatype").getAsString())) {
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_INVALID_ARG,
-              "the inputs and outputs of sequence' model must have the data type");
-        }
+      BytePointer name = null;
+      if (buffer_userp != null) {
+        name = new BytePointer(buffer_userp);
+      } else {
+        name = new BytePointer("");
       }
 
-      is_torch_model[0] =
-          model_metadata.get("platform").getAsString().equals("pytorch_libtorch");
-      return null;
+      System.out.println(
+          "Releasing buffer " + buffer + " of size " + byte_size + " in "
+          + TRITONSERVER_MemoryTypeString(memory_type) + " for result '" + name.getString() + "'");
+      Pointer.free(buffer);
+      name.deallocate();
+
+      return null; // Success
     }
+  }
 
-    // Custom function to set metadata required for sequence batcher
-    static void
-    SetSequenceMetadata(TRITONSERVER_InferenceRequest irequest, long correlation_id, boolean sequence_start, boolean sequence_end)
+  static class InferRequestComplete extends TRITONSERVER_InferenceRequestReleaseFn_t {
+    @Override public void call(TRITONSERVER_InferenceRequest request, int flags, Pointer userp)
     {
+      // We reuse the request so we don't delete it here.
+    }
+  }
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestSetCorrelationId(
-              irequest, correlation_id), "Unable to set correlation ID");
-      int flags = 0;
-      if(sequence_start) {
-        flags += TRITONSERVER_REQUEST_FLAG_SEQUENCE_START;
+  static class InferResponseComplete extends TRITONSERVER_InferenceResponseCompleteFn_t {
+    @Override public void call(TRITONSERVER_InferenceResponse response, int flags, Pointer userp)
+    {
+      if (response != null) {
+        // Send 'response' to the future.
+        futures.get(userp).complete(response);
       }
-      if(sequence_end) {
-        flags += TRITONSERVER_REQUEST_FLAG_SEQUENCE_END;
+    }
+  }
+
+  static ConcurrentHashMap<Pointer, CompletableFuture<TRITONSERVER_InferenceResponse>> futures =
+      new ConcurrentHashMap<>();
+  static ResponseAlloc responseAlloc = new ResponseAlloc();
+  static ResponseRelease responseRelease = new ResponseRelease();
+  static InferRequestComplete inferRequestComplete = new InferRequestComplete();
+  static InferResponseComplete inferResponseComplete = new InferResponseComplete();
+
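+  // Sanity-check the model metadata returned by the server: every input and
+  // output must use the INT32 datatype, and is_torch_model records whether the
+  // backend is libtorch, which uses the INPUT__0/OUTPUT__0 naming convention.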
+  static TRITONSERVER_Error ParseModelMetadata(JsonObject model_metadata, boolean[] is_torch_model)
+  {
+    String seen_data_type = null;
+    for (JsonElement input_element : model_metadata.get("inputs").getAsJsonArray()) {
+      JsonObject input = input_element.getAsJsonObject();
+      if (!input.get("datatype").getAsString().equals("INT32")) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_UNSUPPORTED,
+            "sequence qa example only supports model with data type INT32");
+      }
+      if (seen_data_type == null) {
+        seen_data_type = input.get("datatype").getAsString();
+      } else if (!seen_data_type.equals(input.get("datatype").getAsString())) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INVALID_ARG,
+            "the inputs and outputs of sequence model must have the data type");
+      }
+    }
+    for (JsonElement output_element : model_metadata.get("outputs").getAsJsonArray()) {
+      JsonObject output = output_element.getAsJsonObject();
+      if (!output.get("datatype").getAsString().equals("INT32")) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_UNSUPPORTED,
+            "sequence qa example only supports model with data type INT32");
+      } else if (!seen_data_type.equals(output.get("datatype").getAsString())) {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INVALID_ARG,
+            "the inputs and outputs of sequence' model must have the data type");
       }
-      FAIL_IF_ERR(
-        TRITONSERVER_InferenceRequestSetFlags(
-            irequest, flags), "Unable to set flags");
-
     }
 
-    // Custom function for adjusting sequence batcher
-    // expected results for backends that do not implement
-    // full accumulator
-    static int
-    GetExpectedResult(String model_name, int expected_result, int value, String flag){
-      if((!model_name.contains("nobatch") && !model_name.contains("custom")) ||
-          model_name.contains("graphdef") || model_name.contains("plan") ||
-          model_name.contains("onnx") || model_name.contains("libtorch")){
-            expected_result = value;
-            if(flag != null && flag.contains("start")){
-              expected_result++;
-            }
-        }
-        return expected_result;
+    is_torch_model[0] = model_metadata.get("platform").getAsString().equals("pytorch_libtorch");
+    return null;
+  }
+
+  // Custom function to set metadata required for sequence batcher
+  static void SetSequenceMetadata(
+      TRITONSERVER_InferenceRequest irequest, long correlation_id, boolean sequence_start,
+      boolean sequence_end)
+  {
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestSetCorrelationId(irequest, correlation_id),
+        "Unable to set correlation ID");
+    int flags = 0;
+    if (sequence_start) {
+      flags += TRITONSERVER_REQUEST_FLAG_SEQUENCE_START;
+    }
+    if (sequence_end) {
+      flags += TRITONSERVER_REQUEST_FLAG_SEQUENCE_END;
+    }
+    FAIL_IF_ERR(TRITONSERVER_InferenceRequestSetFlags(irequest, flags), "Unable to set flags");
+  }
+
+  // Custom function for adjusting sequence batcher
+  // expected results for backends that do not implement
+  // full accumulator
+  static int GetExpectedResult(String model_name, int expected_result, int value, String flag)
+  {
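+    // For backends that do not implement a full accumulator, the expected value
+    // is simply the final input value, plus one if the request carries the
+    // "start" control flag; otherwise the accumulated expected_result is kept.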
+    if ((!model_name.contains("nobatch") && !model_name.contains("custom"))
+        || model_name.contains("graphdef") || model_name.contains("plan")
+        || model_name.contains("onnx") || model_name.contains("libtorch")) {
+      expected_result = value;
+      if (flag != null && flag.contains("start")) {
+        expected_result++;
+      }
+    }
+    return expected_result;
+  }
+
+  // Standard function for checking response parameters,
+  // plus customized check that final sequence result
+  // "out" matches expected result
+  static void Check(
+      String model_name, TRITONSERVER_InferenceResponse response, int input_value, String output0,
+      long expected_byte_size, int expected_datatype, boolean sequence_end, int expected_result)
+  {
+    HashMap<String, Pointer> output_data = new HashMap<>();
+
+    int[] output_count = {0};
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceResponseOutputCount(response, output_count),
+        "getting number of response outputs");
+    if (output_count[0] != 1) {
+      FAIL("expecting 1 response outputs, got " + output_count[0]);
     }
 
-    // Standard function for checking response parameters,
-    // plus customized check that final sequence result
-    // "out" matches expected result
-    static void
-    Check(
-        String model_name,
-        TRITONSERVER_InferenceResponse response,
-        int input_value, String output0,
-        long expected_byte_size, int expected_datatype,
-        boolean sequence_end, int expected_result)
-    {
-      HashMap<String, Pointer> output_data = new HashMap<>();
+    for (int idx = 0; idx < output_count[0]; ++idx) {
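+      // Out-parameters filled in by TRITONSERVER_InferenceResponseOutput below.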
+      BytePointer cname = new BytePointer((Pointer) null);
+      IntPointer datatype = new IntPointer(1);
+      LongPointer shape = new LongPointer((Pointer) null);
+      LongPointer dim_count = new LongPointer(1);
+      Pointer base = new Pointer();
+      SizeTPointer byte_size = new SizeTPointer(1);
+      IntPointer memory_type = new IntPointer(1);
+      LongPointer memory_type_id = new LongPointer(1);
+      Pointer userp = new Pointer();
 
-      int[] output_count = {0};
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceResponseOutputCount(response, output_count),
-          "getting number of response outputs");
-      if (output_count[0] != 1) {
-        FAIL("expecting 1 response outputs, got " + output_count[0]);
+          TRITONSERVER_InferenceResponseOutput(
+              response, idx, cname, datatype, shape, dim_count, base, byte_size, memory_type,
+              memory_type_id, userp),
+          "getting output info");
+
+      if (cname.isNull()) {
+        FAIL("unable to get output name");
       }
 
-      for (int idx = 0; idx < output_count[0]; ++idx) {
-        BytePointer cname = new BytePointer((Pointer)null);
-        IntPointer datatype = new IntPointer(1);
-        LongPointer shape = new LongPointer((Pointer)null);
-        LongPointer dim_count = new LongPointer(1);
-        Pointer base = new Pointer();
-        SizeTPointer byte_size = new SizeTPointer(1);
-        IntPointer memory_type = new IntPointer(1);
-        LongPointer memory_type_id = new LongPointer(1);
-        Pointer userp = new Pointer();
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseOutput(
-                response, idx, cname, datatype, shape, dim_count, base,
-                byte_size, memory_type, memory_type_id, userp),
-            "getting output info");
-
-        if (cname.isNull()) {
-          FAIL("unable to get output name");
-        }
+      String name = cname.getString();
+      if (!name.equals(output0)) {
+        FAIL("unexpected output '" + name + "'");
+      }
 
-        String name = cname.getString();
-        if (!name.equals(output0)) {
-          FAIL("unexpected output '" + name + "'");
-        }
+      if ((dim_count.get() != 1) || (shape.get(0) != 1)) {
+        FAIL("unexpected shape for '" + name + "'");
+      }
 
-        if ((dim_count.get() != 1) || (shape.get(0) != 1)) {
-          FAIL("unexpected shape for '" + name + "'");
-        }
+      if (datatype.get() != expected_datatype) {
+        FAIL(
+            "unexpected datatype '" + TRITONSERVER_DataTypeString(datatype.get()) + "' for '" + name
+            + "'");
+      }
 
-        if (datatype.get() != expected_datatype) {
-          FAIL(
-              "unexpected datatype '" +
-              TRITONSERVER_DataTypeString(datatype.get()) + "' for '" +
-              name + "'");
-        }
+      if (byte_size.get() != expected_byte_size) {
+        FAIL(
+            "unexpected byte-size, expected " + expected_byte_size + ", got " + byte_size.get()
+            + " for " + name);
+      }
 
-        if (byte_size.get() != expected_byte_size) {
-          FAIL(
-              "unexpected byte-size, expected " +
-              expected_byte_size + ", got " +
-              byte_size.get() + " for " + name);
-        }
+      if (memory_type.get() != requested_memory_type) {
+        FAIL(
+            "unexpected memory type, expected to be allocated in "
+            + TRITONSERVER_MemoryTypeString(requested_memory_type) + ", got "
+            + TRITONSERVER_MemoryTypeString(memory_type.get()) + ", id " + memory_type_id.get()
+            + " for " + name);
+      }
 
-        if (memory_type.get() != requested_memory_type) {
-          FAIL(
-              "unexpected memory type, expected to be allocated in " +
-              TRITONSERVER_MemoryTypeString(requested_memory_type) +
-              ", got " + TRITONSERVER_MemoryTypeString(memory_type.get()) +
-              ", id " + memory_type_id.get() + " for " + name);
-        }
+      // We make a copy of the data here... which we could avoid for
+      // performance reasons but ok for this sequence example.
+      BytePointer odata = new BytePointer(byte_size.get());
+      output_data.put(name, odata);
+      System.out.println(name + " is stored in system memory");
+      odata.put(base.limit(byte_size.get()));
+    }
 
-        // We make a copy of the data here... which we could avoid for
-        // performance reasons but ok for this sequence example.
-        BytePointer odata = new BytePointer(byte_size.get());
-        output_data.put(name, odata);
-        System.out.println(name + " is stored in system memory");
-        odata.put(base.limit(byte_size.get()));
+    int out = new IntPointer(output_data.get(output0)).get(0);
+    System.out.println("Value: " + out);
+    if (sequence_end) {
+      expected_result = GetExpectedResult(model_name, expected_result, input_value, "end");
+      if (out != expected_result) {
+        FAIL("Expected result: " + expected_result + ", got " + out);
+      } else {
+        System.out.println(model_name + " test PASSED");
       }
-
-      int out = new IntPointer(output_data.get(output0)).get(0);
-      System.out.println("Value: " + out);
-      if(sequence_end){
-        expected_result = GetExpectedResult(model_name, expected_result,
-            input_value, "end");
-        if(out != expected_result){
-          FAIL("Expected result: " + expected_result + ", got " + out);
-        } else {
-          System.out.println(model_name + " test PASSED");
-        }
+    }
+  }
+
+  // Boilerplate main function to run inference
+  // for provided model, custom setting of
+  // sequence metadata
+  public static void main(String[] args) throws Exception
+  {
+    String model_repository_path = null;
+    String model_name = null;
+    int verbose_level = 0;
+
+    // Parse commandline...
+    for (int i = 0; i < args.length; i++) {
+      switch (args[i]) {
+        case "-m":
+          model_name = args[++i];
+          break;
+        case "-r":
+          model_repository_path = args[++i];
+          break;
+        case "-v":
+          verbose_level = 1;
+          break;
+        case "-?":
+          Usage(null);
+          break;
       }
     }
 
-    // Boilerplate main function to run inference
-    // for provided model, custom setting of
-    // sequence metadata
-    public static void
-    main(String[] args) throws Exception
-    {
-      String model_repository_path = null;
-      String model_name = null;
-      int verbose_level = 0;
-
-      // Parse commandline...
-      for (int i = 0; i < args.length; i++) {
-        switch (args[i]) {
-          case "-m":
-            model_name = args[++i];
-            break;
-          case "-r":
-            model_repository_path = args[++i];
-            break;
-          case "-v":
-            verbose_level = 1;
-            break;
-          case "-?":
-            Usage(null);
-            break;
-        }
-      }
+    if (model_name == null) {
+      Usage("-m must be used to specify model name");
+    }
+    if (model_repository_path == null) {
+      Usage("-r must be used to specify model repository path");
+    }
 
-      if(model_name == null) {
-        Usage("-m must be used to specify model name");
-      }
-      if (model_repository_path == null) {
-        Usage("-r must be used to specify model repository path");
+    // Check API version.
+    int[] api_version_major = {0}, api_version_minor = {0};
+    FAIL_IF_ERR(
+        TRITONSERVER_ApiVersion(api_version_major, api_version_minor),
+        "getting Triton API version");
+    if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major[0])
+        || (TRITONSERVER_API_VERSION_MINOR > api_version_minor[0])) {
+      FAIL("triton server API version mismatch");
+    }
+
+    // Create the server...
+    TRITONSERVER_ServerOptions server_options = new TRITONSERVER_ServerOptions(null);
+    FAIL_IF_ERR(TRITONSERVER_ServerOptionsNew(server_options), "creating server options");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetModelRepositoryPath(server_options, model_repository_path),
+        "setting model repository path");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetLogVerbose(server_options, verbose_level),
+        "setting verbose logging level");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetBackendDirectory(server_options, "/opt/tritonserver/backends"),
+        "setting backend directory");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
+            server_options, "/opt/tritonserver/repoagents"),
+        "setting repository agent directory");
+    FAIL_IF_ERR(
+        TRITONSERVER_ServerOptionsSetStrictModelConfig(server_options, true),
+        "setting strict model configuration");
+
+    TRITONSERVER_Server server_ptr = new TRITONSERVER_Server(null);
+    FAIL_IF_ERR(TRITONSERVER_ServerNew(server_ptr, server_options), "creating server");
+    FAIL_IF_ERR(TRITONSERVER_ServerOptionsDelete(server_options), "deleting server options");
+
+    TRITONSERVER_ServerDeleter server = new TRITONSERVER_ServerDeleter(server_ptr);
+
+    // Wait until the server is both live and ready.
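+    // Poll up to 10 times, sleeping 500 ms between checks, before giving up.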
+    int health_iters = 0;
+    while (true) {
+      boolean[] live = {false}, ready = {false};
+      FAIL_IF_ERR(TRITONSERVER_ServerIsLive(server, live), "unable to get server liveness");
+      FAIL_IF_ERR(TRITONSERVER_ServerIsReady(server, ready), "unable to get server readiness");
+      System.out.println("Server Health: live " + live[0] + ", ready " + ready[0]);
+      if (live[0] && ready[0]) {
+        break;
       }
 
-      // Check API version.
-      int[] api_version_major = {0}, api_version_minor = {0};
-      FAIL_IF_ERR(
-          TRITONSERVER_ApiVersion(api_version_major, api_version_minor),
-          "getting Triton API version");
-      if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major[0]) ||
-          (TRITONSERVER_API_VERSION_MINOR > api_version_minor[0])) {
-        FAIL("triton server API version mismatch");
+      if (++health_iters >= 10) {
+        FAIL("failed to find healthy inference server");
       }
 
-      // Create the server...
-      TRITONSERVER_ServerOptions server_options = new TRITONSERVER_ServerOptions(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsNew(server_options),
-          "creating server options");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetModelRepositoryPath(
-              server_options, model_repository_path),
-          "setting model repository path");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetLogVerbose(server_options, verbose_level),
-          "setting verbose logging level");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetBackendDirectory(
-              server_options, "/opt/tritonserver/backends"),
-          "setting backend directory");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
-              server_options, "/opt/tritonserver/repoagents"),
-          "setting repository agent directory");
-      FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsSetStrictModelConfig(server_options, true),
-          "setting strict model configuration");
+      Thread.sleep(500);
+    }
 
-      TRITONSERVER_Server server_ptr = new TRITONSERVER_Server(null);
+    // Print status of the server.
+    {
+      TRITONSERVER_Message server_metadata_message = new TRITONSERVER_Message(null);
       FAIL_IF_ERR(
-          TRITONSERVER_ServerNew(server_ptr, server_options), "creating server");
+          TRITONSERVER_ServerMetadata(server, server_metadata_message),
+          "unable to get server metadata message");
+      BytePointer buffer = new BytePointer((Pointer) null);
+      SizeTPointer byte_size = new SizeTPointer(1);
       FAIL_IF_ERR(
-          TRITONSERVER_ServerOptionsDelete(server_options),
-          "deleting server options");
-
-      TRITONSERVER_ServerDeleter server = new TRITONSERVER_ServerDeleter(server_ptr);
-
-      // Wait until the server is both live and ready.
-      int health_iters = 0;
-      while (true) {
-        boolean[] live = {false}, ready = {false};
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerIsLive(server, live),
-            "unable to get server liveness");
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerIsReady(server, ready),
-            "unable to get server readiness");
-        System.out.println("Server Health: live " + live[0] + ", ready " + ready[0]);
-        if (live[0] && ready[0]) {
-          break;
-        }
+          TRITONSERVER_MessageSerializeToJson(server_metadata_message, buffer, byte_size),
+          "unable to serialize server metadata message");
+
+      System.out.println("Server Status:");
+      System.out.println(buffer.limit(byte_size.get()).getString());
 
+      FAIL_IF_ERR(TRITONSERVER_MessageDelete(server_metadata_message), "deleting status metadata");
+    }
+
+    // Wait for the model to become available.
+    boolean[] is_torch_model = {false};
+    boolean[] is_ready = {false};
+    health_iters = 0;
+    while (!is_ready[0]) {
+      FAIL_IF_ERR(
+          TRITONSERVER_ServerModelIsReady(server, model_name, 1, is_ready),
+          "unable to get model readiness");
+      if (!is_ready[0]) {
         if (++health_iters >= 10) {
-          FAIL("failed to find healthy inference server");
+          FAIL("model failed to be ready in 10 iterations");
         }
-
         Thread.sleep(500);
+        continue;
       }
 
-      // Print status of the server.
-      {
-        TRITONSERVER_Message server_metadata_message = new TRITONSERVER_Message(null);
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerMetadata(server, server_metadata_message),
-            "unable to get server metadata message");
-        BytePointer buffer = new BytePointer((Pointer)null);
-        SizeTPointer byte_size = new SizeTPointer(1);
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageSerializeToJson(
-                server_metadata_message, buffer, byte_size),
-            "unable to serialize server metadata message");
-
-        System.out.println("Server Status:");
-        System.out.println(buffer.limit(byte_size.get()).getString());
-
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageDelete(server_metadata_message),
-            "deleting status metadata");
-      }
-
-      // Wait for the model to become available.
-      boolean[] is_torch_model = {false};
-      boolean[] is_ready = {false};
-      health_iters = 0;
-      while (!is_ready[0]) {
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerModelIsReady(
-                server, model_name, 1, is_ready),
-            "unable to get model readiness");
-        if (!is_ready[0]) {
-          if (++health_iters >= 10) {
-            FAIL("model failed to be ready in 10 iterations");
-          }
-          Thread.sleep(500);
-          continue;
-        }
+      TRITONSERVER_Message model_metadata_message = new TRITONSERVER_Message(null);
+      FAIL_IF_ERR(
+          TRITONSERVER_ServerModelMetadata(server, model_name, 1, model_metadata_message),
+          "unable to get model metadata message");
+      BytePointer buffer = new BytePointer((Pointer) null);
+      SizeTPointer byte_size = new SizeTPointer(1);
+      FAIL_IF_ERR(
+          TRITONSERVER_MessageSerializeToJson(model_metadata_message, buffer, byte_size),
+          "unable to serialize model status protobuf");
 
-        TRITONSERVER_Message model_metadata_message = new TRITONSERVER_Message(null);
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerModelMetadata(
-                server, model_name, 1, model_metadata_message),
-            "unable to get model metadata message");
-        BytePointer buffer = new BytePointer((Pointer)null);
-        SizeTPointer byte_size = new SizeTPointer(1);
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageSerializeToJson(
-                model_metadata_message, buffer, byte_size),
-            "unable to serialize model status protobuf");
-
-        JsonParser parser = new JsonParser();
-        JsonObject model_metadata = null;
-        try {
-          model_metadata = parser.parse(buffer.limit(byte_size.get()).getString()).getAsJsonObject();
-        } catch (Exception e) {
-          FAIL("error: failed to parse model metadata from JSON: " + e);
-        }
+      JsonParser parser = new JsonParser();
+      JsonObject model_metadata = null;
+      try {
+        model_metadata = parser.parse(buffer.limit(byte_size.get()).getString()).getAsJsonObject();
+      }
+      catch (Exception e) {
+        FAIL("error: failed to parse model metadata from JSON: " + e);
+      }
 
-        FAIL_IF_ERR(
-            TRITONSERVER_MessageDelete(model_metadata_message),
-            "deleting status protobuf");
+      FAIL_IF_ERR(TRITONSERVER_MessageDelete(model_metadata_message), "deleting status protobuf");
 
-        if (!model_metadata.get("name").getAsString().equals(model_name)) {
-          FAIL("unable to find metadata for model");
-        }
+      if (!model_metadata.get("name").getAsString().equals(model_name)) {
+        FAIL("unable to find metadata for model");
+      }
 
-        boolean found_version = false;
-        if (model_metadata.has("versions")) {
-          for (JsonElement version : model_metadata.get("versions").getAsJsonArray()) {
-            if (version.getAsString().equals("1")) {
-              found_version = true;
-              break;
-            }
+      boolean found_version = false;
+      if (model_metadata.has("versions")) {
+        for (JsonElement version : model_metadata.get("versions").getAsJsonArray()) {
+          if (version.getAsString().equals("1")) {
+            found_version = true;
+            break;
           }
         }
-        if (!found_version) {
-          FAIL("unable to find version 1 status for model");
-        }
-
-        FAIL_IF_ERR(
-            ParseModelMetadata(model_metadata, is_torch_model),
-            "parsing model metadata");
+      }
+      if (!found_version) {
+        FAIL("unable to find version 1 status for model");
       }
 
-      // Create the allocator that will be used to allocate buffers for
-      // the result tensors.
-      TRITONSERVER_ResponseAllocator allocator = new TRITONSERVER_ResponseAllocator(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_ResponseAllocatorNew(
-              allocator, responseAlloc, responseRelease, null /* start_fn */),
-          "creating response allocator");
+      FAIL_IF_ERR(ParseModelMetadata(model_metadata, is_torch_model), "parsing model metadata");
+    }
 
-      // Inference
-      TRITONSERVER_InferenceRequest irequest = new TRITONSERVER_InferenceRequest(null);
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestNew(
-              irequest, server, model_name, -1 /* model_version */),
-          "creating inference request");
+    // Create the allocator that will be used to allocate buffers for
+    // the result tensors.
+    TRITONSERVER_ResponseAllocator allocator = new TRITONSERVER_ResponseAllocator(null);
+    FAIL_IF_ERR(
+        TRITONSERVER_ResponseAllocatorNew(
+            allocator, responseAlloc, responseRelease, null /* start_fn */),
+        "creating response allocator");
+
+    // Inference
+    TRITONSERVER_InferenceRequest irequest = new TRITONSERVER_InferenceRequest(null);
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestNew(irequest, server, model_name, -1 /* model_version */),
+        "creating inference request");
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestSetId(irequest, "my_request_id"),
+        "setting ID for the request");
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestSetReleaseCallback(
+            irequest, inferRequestComplete, null /* request_release_userp */),
+        "setting request release callback");
+
+    // Inputs
+    String input0 = is_torch_model[0] ? "INPUT__0" : "INPUT";
+
+    long[] input0_shape = {1};
+
+    int datatype = TRITONSERVER_TYPE_INT32;
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAddInput(
+            irequest, input0, datatype, input0_shape, input0_shape.length),
+        "setting input 0 meta-data for the request");
+
+    String output0 = is_torch_model[0] ? "OUTPUT__0" : "OUTPUT";
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output0),
+        "requesting output 0 for the request");
+
+    // Non-zero ID for the sequence requests
+    long correlation_id = 5;
+    // Number of requests in the sequence
+    int num_requests = 9;
+    // expected_result is 1 + 2 + 3 + ... + num_requests
+    int expected_result = num_requests * (1 + num_requests) / 2;
+    boolean sequence_start = true;
+    boolean sequence_end = false;
+
+    // Create the initial data for the input tensor.
+    IntPointer[] p0 = {new IntPointer(1)};
+    BytePointer input0_data = p0[0].getPointer(BytePointer.class);
+    long input0_size = input0_data.limit();
+
+    FAIL_IF_ERR(
+        TRITONSERVER_InferenceRequestAppendInputData(
+            irequest, input0, input0_data, input0_size, requested_memory_type,
+            0 /* memory_type_id */),
+        "assigning INPUT0 data");
+
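+    // The same request object and input buffer are reused for every step of the
+    // sequence: each iteration overwrites the single INT32 value in place,
+    // updates the start/end flags, and waits on a future for that step's response.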
+    for (int i = 0; i < num_requests; i++) {
+      // Update input value
+      int input = i + 1;
+      p0[0].put(0, input);
+
+      // Set sequence metadata
+      if (i == 1) {
+        sequence_start = false;
+      }
+      if (i == num_requests - 1) {
+        sequence_end = true;
+      }
+      SetSequenceMetadata(irequest, correlation_id, sequence_start, sequence_end);
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestSetId(irequest, "my_request_id"),
-          "setting ID for the request");
+      // Perform inference...
+      CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
+      futures.put(irequest, completed);
 
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestSetReleaseCallback(
-              irequest, inferRequestComplete, null /* request_release_userp */),
-          "setting request release callback");
-
-      // Inputs
-      String input0 = is_torch_model[0] ? "INPUT__0" : "INPUT";
-
-      long[] input0_shape = {1};
-
-      int datatype = TRITONSERVER_TYPE_INT32;
+          TRITONSERVER_InferenceRequestSetResponseCallback(
+              irequest, allocator, null /* response_allocator_userp */, inferResponseComplete,
+              irequest),
+          "setting response callback");
 
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAddInput(
-              irequest, input0, datatype, input0_shape, input0_shape.length),
-          "setting input 0 meta-data for the request");
+          TRITONSERVER_ServerInferAsync(server, irequest, null /* trace */), "running inference");
 
-      String output0 = is_torch_model[0] ? "OUTPUT__0" : "OUTPUT";
+      // Wait for the inference to complete.
+      TRITONSERVER_InferenceResponse completed_response = completed.get();
+      futures.remove(irequest);
 
-      FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output0),
-          "requesting output 0 for the request");
-
-      // Non-zero ID for the sequence requests
-      long correlation_id = 5;
-      // Number of requests in the sequence
-      int num_requests = 9;
-      // Expected_result is  1+2+3+...+num_requests
-      int expected_result = num_requests * (1 + num_requests) / 2;
-      boolean sequence_start = true;
-      boolean sequence_end = false;
-
-      // Create the initial data for the input tensor.
-      IntPointer[] p0 = {new IntPointer(1)};
-      BytePointer input0_data = p0[0].getPointer(BytePointer.class);
-      long input0_size = input0_data.limit();
+      FAIL_IF_ERR(TRITONSERVER_InferenceResponseError(completed_response), "response status");
 
-      FAIL_IF_ERR(
-            TRITONSERVER_InferenceRequestAppendInputData(
-                irequest, input0, input0_data, input0_size, requested_memory_type,
-                0 /* memory_type_id */),
-            "assigning INPUT0 data");
-
-      for(int i = 0; i < num_requests; i++) {
-        // Update input value
-        int input = i + 1;
-        p0[0].put(0, input);
-
-        // Set sequence metadata
-        if(i == 1) {
-          sequence_start = false;
-        }
-        if(i == num_requests - 1) {
-          sequence_end = true;
-        }
-        SetSequenceMetadata(irequest, correlation_id, sequence_start, sequence_end);
-        
-        // Perform inference...
-        CompletableFuture<TRITONSERVER_InferenceResponse> completed = new CompletableFuture<>();
-        futures.put(irequest, completed);
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceRequestSetResponseCallback(
-                irequest, allocator, null /* response_allocator_userp */,
-                inferResponseComplete, irequest),
-            "setting response callback");
-
-        FAIL_IF_ERR(
-            TRITONSERVER_ServerInferAsync(
-                server, irequest, null /* trace */),
-            "running inference");
-
-        // Wait for the inference to complete.
-        TRITONSERVER_InferenceResponse completed_response = completed.get();
-        futures.remove(irequest);
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseError(completed_response),
-            "response status");
-
-        Check(
-            model_name, completed_response, input, output0, input0_size,
-            datatype, sequence_end, expected_result);
-
-        FAIL_IF_ERR(
-            TRITONSERVER_InferenceResponseDelete(completed_response),
-            "deleting inference response");
-      }
+      Check(
+          model_name, completed_response, input, output0, input0_size, datatype, sequence_end,
+          expected_result);
 
       FAIL_IF_ERR(
-          TRITONSERVER_InferenceRequestDelete(irequest),
-          "deleting inference request");
+          TRITONSERVER_InferenceResponseDelete(completed_response), "deleting inference response");
+    }
 
-      FAIL_IF_ERR(
-          TRITONSERVER_ResponseAllocatorDelete(allocator),
-          "deleting response allocator");
+    FAIL_IF_ERR(TRITONSERVER_InferenceRequestDelete(irequest), "deleting inference request");
 
-      System.exit(0);
-    }
+    FAIL_IF_ERR(TRITONSERVER_ResponseAllocatorDelete(allocator), "deleting response allocator");
+
+    System.exit(0);
+  }
 }
diff --git a/qa/L0_json/test.sh b/qa/L0_json/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_large_payload/large_payload_test.py b/qa/L0_large_payload/large_payload_test.py
old mode 100644
new mode 100755
index 051fa4790b..fff57290ef
--- a/qa/L0_large_payload/large_payload_test.py
+++ b/qa/L0_large_payload/large_payload_test.py
@@ -1,4 +1,6 @@
-# Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -30,15 +32,15 @@
 
 import math
 import unittest
+
 import numpy as np
 import test_util as tu
 import tritongrpcclient as grpcclient
 import tritonhttpclient as httpclient
-from tritonclientutils import np_to_triton_dtype, InferenceServerException
+from tritonclientutils import InferenceServerException, np_to_triton_dtype
 
 
 class LargePayLoadTest(tu.TestResultCollector):
-
     def setUp(self):
         self._data_type = np.float32
 
@@ -46,36 +48,40 @@ def setUp(self):
         # hard limit on 2GBs for the size of input tensors. All backends except
         # plan backend should be able to handle payloads larger than 2GBs using
         # HTTP.
-        very_large_tensor_shape = (math.trunc(
-            3 * (1024 * 1024 * 1024) / np.dtype(self._data_type).itemsize),)
+        very_large_tensor_shape = (
+            math.trunc(3 * (1024 * 1024 * 1024) / np.dtype(self._data_type).itemsize),
+        )
         self._very_large_in0 = np.random.random(very_large_tensor_shape).astype(
-            self._data_type)
+            self._data_type
+        )
 
         # 1.9 GBs allows us to test gRPC with moderate sizes too.
-        large_tensor_shape = (math.trunc(1.9 * (1024 * 1024 * 1024) //
-                                         np.dtype(self._data_type).itemsize),)
-        self._large_in0 = np.random.random(large_tensor_shape).astype(
-            self._data_type)
+        large_tensor_shape = (
+            math.trunc(
+                1.9 * (1024 * 1024 * 1024) // np.dtype(self._data_type).itemsize
+            ),
+        )
+        self._large_in0 = np.random.random(large_tensor_shape).astype(self._data_type)
 
         small_tensor_shape = (1,)
-        self._small_in0 = np.random.random(small_tensor_shape).astype(
-            self._data_type)
-
-        self._clients = ((httpclient,
-                          httpclient.InferenceServerClient('localhost:8000')),
-                         (grpcclient,
-                          grpcclient.InferenceServerClient('localhost:8001')))
-
-    def _test_helper(self,
-                     client,
-                     model_name,
-                     input_name='INPUT0',
-                     output_name='OUTPUT0'):
-        # plan does not supoort large batch sizes.
-        if not model_name.startswith('plan'):
+        self._small_in0 = np.random.random(small_tensor_shape).astype(self._data_type)
+
+        self._clients = (
+            (httpclient, httpclient.InferenceServerClient("localhost:8000")),
+            (grpcclient, grpcclient.InferenceServerClient("localhost:8001")),
+        )
+
+    def _test_helper(
+        self, client, model_name, input_name="INPUT0", output_name="OUTPUT0"
+    ):
+        # plan does not support large batch sizes.
+        if not model_name.startswith("plan"):
             inputs = [
-                client[0].InferInput(input_name, self._large_in0.shape,
-                                     np_to_triton_dtype(self._data_type))
+                client[0].InferInput(
+                    input_name,
+                    self._large_in0.shape,
+                    np_to_triton_dtype(self._data_type),
+                )
             ]
             inputs[0].set_data_from_numpy(self._large_in0)
             results = client[1].infer(model_name, inputs)
@@ -84,13 +90,17 @@ def _test_helper(self,
             # the framework and protocol do support large payload
             self.assertTrue(
                 np.array_equal(self._large_in0, results.as_numpy(output_name)),
-                "output is different from input")
+                "output is different from input",
+            )
 
         if client[0] == httpclient:
             # FIXME HTTPServer cannot support large payloads. See DLIS-1776.
             inputs = [
-                client[0].InferInput(input_name, self._very_large_in0.shape,
-                                     np_to_triton_dtype(self._data_type))
+                client[0].InferInput(
+                    input_name,
+                    self._very_large_in0.shape,
+                    np_to_triton_dtype(self._data_type),
+                )
             ]
             inputs[0].set_data_from_numpy(self._very_large_in0)
             with self.assertRaises(InferenceServerException):
@@ -113,56 +123,54 @@ def _test_helper(self,
 
         # Send a small payload to verify if the server is still functional
         inputs = [
-            client[0].InferInput(input_name, self._small_in0.shape,
-                                 np_to_triton_dtype(self._data_type))
+            client[0].InferInput(
+                input_name, self._small_in0.shape, np_to_triton_dtype(self._data_type)
+            )
         ]
         inputs[0].set_data_from_numpy(self._small_in0)
         results = client[1].infer(model_name, inputs)
         self.assertTrue(
             np.array_equal(self._small_in0, results.as_numpy(output_name)),
-            "output is different from input")
+            "output is different from input",
+        )
 
     def test_graphdef(self):
         # graphdef_nobatch_zero_1_float32 is identity model with input shape [-1]
         for client in self._clients:
-            model_name = tu.get_zero_model_name("graphdef_nobatch", 1,
-                                                self._data_type)
+            model_name = tu.get_zero_model_name("graphdef_nobatch", 1, self._data_type)
             self._test_helper(client, model_name)
 
     def test_savedmodel(self):
         # savedmodel_nobatch_zero_1_float32 is identity model with input shape [-1]
         for client in self._clients:
-            model_name = tu.get_zero_model_name("savedmodel_nobatch", 1,
-                                                self._data_type)
+            model_name = tu.get_zero_model_name(
+                "savedmodel_nobatch", 1, self._data_type
+            )
             self._test_helper(client, model_name)
 
     def test_onnx(self):
         # onnx_nobatch_zero_1_float32 is identity model with input shape [-1]
         for client in self._clients:
-            model_name = tu.get_zero_model_name("onnx_nobatch", 1,
-                                                self._data_type)
+            model_name = tu.get_zero_model_name("onnx_nobatch", 1, self._data_type)
             self._test_helper(client, model_name)
 
     def test_python(self):
         # python_nobatch_zero_1_float32 is identity model with input shape [-1]
         for client in self._clients:
-            model_name = tu.get_zero_model_name("python_nobatch", 1,
-                                                self._data_type)
+            model_name = tu.get_zero_model_name("python_nobatch", 1, self._data_type)
             self._test_helper(client, model_name)
 
     def test_plan(self):
         # plan_nobatch_zero_1_float32 is identity model with input shape [-1]
         for client in self._clients:
-            model_name = tu.get_zero_model_name("plan_nobatch", 1,
-                                                self._data_type)
+            model_name = tu.get_zero_model_name("plan_nobatch", 1, self._data_type)
             self._test_helper(client, model_name)
 
     def test_libtorch(self):
         # libtorch_nobatch_zero_1_float32 is identity model with input shape [-1]
         for client in self._clients:
-            model_name = tu.get_zero_model_name("libtorch_nobatch", 1,
-                                                self._data_type)
-            self._test_helper(client, model_name, 'INPUT__0', 'OUTPUT__0')
+            model_name = tu.get_zero_model_name("libtorch_nobatch", 1, self._data_type)
+            self._test_helper(client, model_name, "INPUT__0", "OUTPUT__0")
 
     def test_custom(self):
         # custom_zero_1_float32 is identity model with input shape [-1]
@@ -171,5 +179,5 @@ def test_custom(self):
             self._test_helper(client, model_name)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_large_payload/test.sh b/qa/L0_large_payload/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_libtorch_inference_mode/test.sh b/qa/L0_libtorch_inference_mode/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_libtorch_instance_group_kind_model/client.py b/qa/L0_libtorch_instance_group_kind_model/client.py
old mode 100644
new mode 100755
index c3c8289f8a..92bead3464
--- a/qa/L0_libtorch_instance_group_kind_model/client.py
+++ b/qa/L0_libtorch_instance_group_kind_model/client.py
@@ -31,32 +31,32 @@
 sys.path.append("../common")
 
 import unittest
+
 import numpy as np
 import test_util as tu
-
 import tritonclient.http as httpclient
 
 # By default, find tritonserver on "localhost", but can be overridden
 # with TRITONSERVER_IPADDR envvar
-_tritonserver_ipaddr = os.environ.get('TRITONSERVER_IPADDR', 'localhost')
+_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")
 
 
 class InferTest(tu.TestResultCollector):
-
     def test_infer(self):
         try:
             triton_client = httpclient.InferenceServerClient(
-                url=f"{_tritonserver_ipaddr}:8000")
+                url=f"{_tritonserver_ipaddr}:8000"
+            )
         except Exception as e:
             print("channel creation failed: " + str(e))
             sys.exit(1)
 
-        model_name = os.environ['MODEL_NAME']
+        model_name = os.environ["MODEL_NAME"]
 
         inputs = []
         outputs = []
-        inputs.append(httpclient.InferInput('INPUT0', [1, 16], "FP32"))
-        inputs.append(httpclient.InferInput('INPUT1', [1, 16], "FP32"))
+        inputs.append(httpclient.InferInput("INPUT0", [1, 16], "FP32"))
+        inputs.append(httpclient.InferInput("INPUT1", [1, 16], "FP32"))
 
         # Create the data for the two input tensors.
         input0_data = np.arange(start=0, stop=16, dtype=np.float32)
@@ -68,15 +68,13 @@ def test_infer(self):
         inputs[0].set_data_from_numpy(input0_data, binary_data=True)
         inputs[1].set_data_from_numpy(input1_data, binary_data=True)
 
-        outputs.append(
-            httpclient.InferRequestedOutput('OUTPUT__0', binary_data=True))
-        outputs.append(
-            httpclient.InferRequestedOutput('OUTPUT__1', binary_data=True))
+        outputs.append(httpclient.InferRequestedOutput("OUTPUT__0", binary_data=True))
+        outputs.append(httpclient.InferRequestedOutput("OUTPUT__1", binary_data=True))
 
         results = triton_client.infer(model_name, inputs, outputs=outputs)
 
-        output0_data = results.as_numpy('OUTPUT__0')
-        output1_data = results.as_numpy('OUTPUT__1')
+        output0_data = results.as_numpy("OUTPUT__0")
+        output1_data = results.as_numpy("OUTPUT__1")
 
         expected_output_0 = input0_data + input1_data
         expected_output_1 = input0_data - input1_data
@@ -88,5 +86,5 @@ def test_infer(self):
         self.assertTrue(np.all(expected_output_1 == output1_data))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_libtorch_instance_group_kind_model/gen_models.py b/qa/L0_libtorch_instance_group_kind_model/gen_models.py
index af8023e352..e61980f491 100755
--- a/qa/L0_libtorch_instance_group_kind_model/gen_models.py
+++ b/qa/L0_libtorch_instance_group_kind_model/gen_models.py
@@ -30,7 +30,6 @@
 
 
 class SumModule(nn.Module):
-
     def __init__(self, device):
         super(SumModule, self).__init__()
         self.device = device
@@ -38,13 +37,15 @@ def __init__(self, device):
     def forward(self, INPUT0, INPUT1):
         INPUT0 = INPUT0.to(self.device)
         INPUT1 = INPUT1.to(self.device)
-        print('SumModule - INPUT0 device: {}, INPUT1 device: {}\n'.format(
-            INPUT0.device, INPUT1.device))
+        print(
+            "SumModule - INPUT0 device: {}, INPUT1 device: {}\n".format(
+                INPUT0.device, INPUT1.device
+            )
+        )
         return INPUT0 + INPUT1
 
 
 class DiffModule(nn.Module):
-
     def __init__(self, device):
         super(DiffModule, self).__init__()
         self.device = device
@@ -52,13 +53,15 @@ def __init__(self, device):
     def forward(self, INPUT0, INPUT1):
         INPUT0 = INPUT0.to(self.device)
         INPUT1 = INPUT1.to(self.device)
-        print('DiffModule - INPUT0 device: {}, INPUT1 device: {}\n'.format(
-            INPUT0.device, INPUT1.device))
+        print(
+            "DiffModule - INPUT0 device: {}, INPUT1 device: {}\n".format(
+                INPUT0.device, INPUT1.device
+            )
+        )
         return INPUT0 - INPUT1
 
 
 class TestModel(nn.Module):
-
     def __init__(self, device0, device1):
         super(TestModel, self).__init__()
         self.device0 = device0
@@ -72,6 +75,7 @@ def forward(self, INPUT0, INPUT1):
         op1 = self.layer2(INPUT0, INPUT1)
         return op0, op1
 
+
 if torch.cuda.device_count() < 4:
     print("Need at least 4 GPUs to run this test")
     exit(1)
diff --git a/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_device/config.pbtxt b/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_device/config.pbtxt
old mode 100755
new mode 100644
diff --git a/qa/L0_libtorch_instance_group_kind_model/test.sh b/qa/L0_libtorch_instance_group_kind_model/test.sh
index 7dcb96d5d1..04d76bd036 100755
--- a/qa/L0_libtorch_instance_group_kind_model/test.sh
+++ b/qa/L0_libtorch_instance_group_kind_model/test.sh
@@ -63,9 +63,9 @@ cp models/libtorch_multi_device/config.pbtxt models/libtorch_multi_gpu/.
 (cd models/libtorch_multi_gpu && \
     sed -i "s/name: \"libtorch_multi_device\"/name: \"libtorch_multi_gpu\"/" config.pbtxt)
 
-# Generate the models which are partioned across multiple devices
+# Generate the models which are partitioned across multiple devices
 set +e
-python3 gen_models.py >> $CLIENT_LOG 2>&1 
+python3 gen_models.py >> $CLIENT_LOG 2>&1
 if [ $? -ne 0 ]; then
     echo -e "\n***\n*** Error when generating models. \n***"
     cat $CLIENT_LOG
@@ -83,7 +83,7 @@ fi
 set +e
 
 export MODEL_NAME='libtorch_multi_device'
-python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 
+python3 $CLIENT_PY >> $CLIENT_LOG 2>&1
 if [ $? -ne 0 ]; then
     echo -e "\n***\n*** Model $MODEL_NAME FAILED. \n***"
     cat $CLIENT_LOG
@@ -109,7 +109,7 @@ for MESSAGE in "${MESSAGES[@]}"; do
 done
 
 export MODEL_NAME='libtorch_multi_gpu'
-python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 
+python3 $CLIENT_PY >> $CLIENT_LOG 2>&1
 if [ $? -ne 0 ]; then
     echo -e "\n***\n*** Model $MODEL_NAME FAILED. \n***"
     cat $CLIENT_LOG
diff --git a/qa/L0_libtorch_io_names/io_names_client.py b/qa/L0_libtorch_io_names/io_names_client.py
old mode 100644
new mode 100755
index 15971356d9..b74e520de2
--- a/qa/L0_libtorch_io_names/io_names_client.py
+++ b/qa/L0_libtorch_io_names/io_names_client.py
@@ -29,19 +29,19 @@
 
 sys.path.append("../common")
 
-from builtins import range
 import unittest
-import test_util as tu
-import numpy as np
+from builtins import range
 
+import numpy as np
+import test_util as tu
 import tritonclient.http as httpclient
 
 
 class IONamingConvention(tu.TestResultCollector):
-
     def _infer_helper(self, model_name, io_names, reversed_order=False):
-        triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                         verbose=False)
+        triton_client = httpclient.InferenceServerClient(
+            "localhost:8000", verbose=False
+        )
 
         # Create the data for the two inputs. Initialize the first to unique
         # integers and the second to all ones.
@@ -53,30 +53,34 @@ def _infer_helper(self, model_name, io_names, reversed_order=False):
         output_req = []
         inputs.append(
             httpclient.InferInput(
-                io_names[0] if not reversed_order else io_names[1], [1, 16],
-                "FP32"))
+                io_names[0] if not reversed_order else io_names[1], [1, 16], "FP32"
+            )
+        )
         inputs[-1].set_data_from_numpy(input0_data)
         inputs.append(
             httpclient.InferInput(
-                io_names[1] if not reversed_order else io_names[0], [1, 16],
-                "FP32"))
+                io_names[1] if not reversed_order else io_names[0], [1, 16], "FP32"
+            )
+        )
         inputs[-1].set_data_from_numpy(input1_data)
         output_req.append(
-            httpclient.InferRequestedOutput(io_names[2], binary_data=True))
+            httpclient.InferRequestedOutput(io_names[2], binary_data=True)
+        )
         output_req.append(
-            httpclient.InferRequestedOutput(io_names[3], binary_data=True))
+            httpclient.InferRequestedOutput(io_names[3], binary_data=True)
+        )
 
         results = triton_client.infer(model_name, inputs, outputs=output_req)
 
         output0_data = results.as_numpy(
-            io_names[2] if not reversed_order else io_names[3])
+            io_names[2] if not reversed_order else io_names[3]
+        )
         output1_data = results.as_numpy(
-            io_names[3] if not reversed_order else io_names[2])
+            io_names[3] if not reversed_order else io_names[2]
+        )
         for i in range(16):
-            self.assertEqual(input0_data[0][i] - input1_data[0][i],
-                             output0_data[0][i])
-            self.assertEqual(input0_data[0][i] + input1_data[0][i],
-                             output1_data[0][i])
+            self.assertEqual(input0_data[0][i] - input1_data[0][i], output0_data[0][i])
+            self.assertEqual(input0_data[0][i] + input1_data[0][i], output1_data[0][i])
 
     def test_io_index(self):
         io_names = ["INPUT__0", "INPUT__1", "OUTPUT__0", "OUTPUT__1"]
@@ -108,10 +112,8 @@ def test_mix_arguments_index(self):
 
     def test_unordered_index(self):
         io_names = ["INPUT1", "INPUT0", "OUT__1", "OUT__0"]
-        self._infer_helper("libtorch_unordered_index",
-                           io_names,
-                           reversed_order=True)
+        self._infer_helper("libtorch_unordered_index", io_names, reversed_order=True)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_libtorch_io_names/test.sh b/qa/L0_libtorch_io_names/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_libtorch_nvfuser/test.sh b/qa/L0_libtorch_nvfuser/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_libtorch_optimized_execution/test.sh b/qa/L0_libtorch_optimized_execution/test.sh
old mode 100644
new mode 100755
diff --git a/qa/L0_libtorch_shared_weights/libtorch_shared_weights_test.py b/qa/L0_libtorch_shared_weights/libtorch_shared_weights_test.py
old mode 100644
new mode 100755
index eeb5651afa..7c2fdb5a71
--- a/qa/L0_libtorch_shared_weights/libtorch_shared_weights_test.py
+++ b/qa/L0_libtorch_shared_weights/libtorch_shared_weights_test.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -28,31 +30,29 @@
 
 sys.path.append("../common")
 
-import numpy as np
 import unittest
 from builtins import range
-import tritonhttpclient as httpclient
+
+import numpy as np
 import test_util as tu
+import tritonhttpclient as httpclient
 
 FLAGS = None
 
 
 class SharedWeightsTest(tu.TestResultCollector):
-
     def _full_exact(self, model_name, request_concurrency, shape):
-
         # Run async requests to make sure backend handles concurrent requests
         # correctly.
         client = httpclient.InferenceServerClient(
-            "localhost:8000", concurrency=request_concurrency)
+            "localhost:8000", concurrency=request_concurrency
+        )
         input_datas = []
         requests = []
         for i in range(request_concurrency):
             input_data = (16384 * np.random.randn(*shape)).astype(np.float32)
             input_datas.append(input_data)
-            inputs = [
-                httpclient.InferInput("INPUT__0", input_data.shape, "FP32")
-            ]
+            inputs = [httpclient.InferInput("INPUT__0", input_data.shape, "FP32")]
             inputs[0].set_data_from_numpy(input_data)
             requests.append(client.async_infer(model_name, inputs))
 
@@ -62,8 +62,7 @@ def _full_exact(self, model_name, request_concurrency, shape):
             results = requests[i].get_result()
 
             output_data = results.as_numpy("OUTPUT__0")
-            self.assertIsNotNone(output_data,
-                                 "error: expected 'OUTPUT__0' to be found")
+            self.assertIsNotNone(output_data, "error: expected 'OUTPUT__0' to be found")
             np.testing.assert_allclose(output_data, input_datas[i])
 
     def test_pytorch_identity_model(self):
@@ -71,5 +70,5 @@ def test_pytorch_identity_model(self):
         self._full_exact(model_name, 128, [8])
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_libtorch_shared_weights/test.sh b/qa/L0_libtorch_shared_weights/test.sh
old mode 100644
new mode 100755
index e6f23b7a45..6ca251ce32
--- a/qa/L0_libtorch_shared_weights/test.sh
+++ b/qa/L0_libtorch_shared_weights/test.sh
@@ -1,4 +1,5 @@
-# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#!/bin/bash
+# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
diff --git a/qa/L0_lifecycle/lifecycle_test.py b/qa/L0_lifecycle/lifecycle_test.py
old mode 100644
new mode 100755
index 5ce079a77a..1caffb8f56
--- a/qa/L0_lifecycle/lifecycle_test.py
+++ b/qa/L0_lifecycle/lifecycle_test.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -28,93 +30,99 @@
 
 sys.path.append("../common")
 
-from builtins import range
-from functools import partial
+import concurrent.futures
 import os
 import shutil
 import signal
+import threading
 import time
 import unittest
-import numpy as np
+from builtins import range
+from functools import partial
+
 import infer_util as iu
+import numpy as np
 import test_util as tu
-import threading
-import concurrent.futures
-
 import tritonclient.grpc as grpcclient
 import tritonclient.http as httpclient
 from tritonclient.utils import InferenceServerException
 
 
 class LifeCycleTest(tu.TestResultCollector):
-
-    def _infer_success_models(self,
-                              model_base_names,
-                              versions,
-                              tensor_shape,
-                              swap=False):
+    def _infer_success_models(
+        self, model_base_names, versions, tensor_shape, swap=False
+    ):
         for base_name in model_base_names:
             try:
-                model_name = tu.get_model_name(base_name, np.float32,
-                                               np.float32, np.float32)
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                model_name = tu.get_model_name(
+                    base_name, np.float32, np.float32, np.float32
+                )
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     # FIXME is_server_ready should be true here DLIS-1296
                     # self.assertTrue(triton_client.is_server_ready())
                     for v in versions:
                         self.assertTrue(
-                            triton_client.is_model_ready(model_name, str(v)))
+                            triton_client.is_model_ready(model_name, str(v))
+                        )
 
                 for v in versions:
-                    iu.infer_exact(self,
-                                   base_name,
-                                   tensor_shape,
-                                   1,
-                                   np.float32,
-                                   np.float32,
-                                   np.float32,
-                                   model_version=v,
-                                   swap=(swap or (v != 1)))
+                    iu.infer_exact(
+                        self,
+                        base_name,
+                        tensor_shape,
+                        1,
+                        np.float32,
+                        np.float32,
+                        np.float32,
+                        model_version=v,
+                        swap=(swap or (v != 1)),
+                    )
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
-    def _infer_success_identity(self, model_base, versions, tensor_dtype,
-                                tensor_shape):
+    def _infer_success_identity(self, model_base, versions, tensor_dtype, tensor_shape):
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             self.assertTrue(triton_client.is_server_live())
             self.assertTrue(triton_client.is_server_ready())
             for v in versions:
                 self.assertTrue(
                     triton_client.is_model_ready(
-                        tu.get_zero_model_name(model_base, 1, tensor_dtype),
-                        str(v)))
+                        tu.get_zero_model_name(model_base, 1, tensor_dtype), str(v)
+                    )
+                )
 
             for v in versions:
-                iu.infer_zero(self,
-                              model_base,
-                              1,
-                              tensor_dtype,
-                              tensor_shape,
-                              tensor_shape,
-                              use_http=False,
-                              use_grpc=True,
-                              use_http_json_tensors=False,
-                              use_streaming=False)
+                iu.infer_zero(
+                    self,
+                    model_base,
+                    1,
+                    tensor_dtype,
+                    tensor_shape,
+                    tensor_shape,
+                    use_http=False,
+                    use_grpc=True,
+                    use_http_json_tensors=False,
+                    use_streaming=False,
+                )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
     def _get_client(self, use_grpc=False):
         if use_grpc:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
         else:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
         return triton_client
 
     def _async_load(self, model_name, use_grpc):
@@ -130,8 +138,9 @@ def test_parse_error_noexit(self):
         # SERVER_FAILED_TO_INITIALIZE status.
         # Server is not live and not ready regardless of --strict-readiness
         try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             self.assertFalse(triton_client.is_server_live())
             self.assertFalse(triton_client.is_server_ready())
             md = triton_client.get_server_metadata()
@@ -141,13 +150,14 @@ def test_parse_error_noexit(self):
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             self.assertFalse(triton_client.is_server_live())
             self.assertFalse(triton_client.is_server_ready())
             md = triton_client.get_server_metadata()
-            self.assertEqual(os.environ["TRITON_SERVER_VERSION"], md['version'])
-            self.assertEqual("triton", md['name'])
+            self.assertEqual(os.environ["TRITON_SERVER_VERSION"], md["version"])
+            self.assertEqual("triton", md["name"])
         except InferenceServerException as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -157,17 +167,20 @@ def test_parse_error_modelfail(self):
 
         # Server was started but with a model that fails to load
         try:
-            model_name = tu.get_model_name('graphdef', np.float32, np.float32,
-                                           np.float32)
+            model_name = tu.get_model_name(
+                "graphdef", np.float32, np.float32, np.float32
+            )
 
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             self.assertTrue(triton_client.is_server_live())
             self.assertFalse(triton_client.is_server_ready())
             self.assertFalse(triton_client.is_model_ready(model_name, "1"))
 
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             self.assertTrue(triton_client.is_server_live())
             self.assertFalse(triton_client.is_server_ready())
             self.assertFalse(triton_client.is_model_ready(model_name, "1"))
@@ -176,35 +189,38 @@ def test_parse_error_modelfail(self):
 
         # Inferencing with the missing model should fail.
         try:
-            iu.infer_exact(self, 'graphdef', tensor_shape, 1, np.float32,
-                           np.float32, np.float32)
-            self.assertTrue(
-                False, "expected error for unavailable model " + model_name)
+            iu.infer_exact(
+                self, "graphdef", tensor_shape, 1, np.float32, np.float32, np.float32
+            )
+            self.assertTrue(False, "expected error for unavailable model " + model_name)
         except Exception as ex:
             self.assertIn(
                 "Request for unknown model: 'graphdef_float32_float32_float32' has no available versions",
-                ex.message())
+                ex.message(),
+            )
 
         # And other models should be loaded successfully
         try:
-            for base_name in ['savedmodel', 'onnx']:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
-                    model_name = tu.get_model_name(base_name, np.float32,
-                                                   np.float32, np.float32)
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "1"))
-
-                iu.infer_exact(self,
-                               base_name,
-                               tensor_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               model_version=1)
+            for base_name in ["savedmodel", "onnx"]:
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
+                    model_name = tu.get_model_name(
+                        base_name, np.float32, np.float32, np.float32
+                    )
+                    self.assertTrue(triton_client.is_model_ready(model_name, "1"))
+
+                iu.infer_exact(
+                    self,
+                    base_name,
+                    tensor_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    model_version=1,
+                )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -214,17 +230,20 @@ def test_parse_error_modelfail_nostrict(self):
 
         # Server was started but with a model that fails to load
         try:
-            model_name = tu.get_model_name('graphdef', np.float32, np.float32,
-                                           np.float32)
+            model_name = tu.get_model_name(
+                "graphdef", np.float32, np.float32, np.float32
+            )
 
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             self.assertTrue(triton_client.is_server_live())
             self.assertTrue(triton_client.is_server_ready())
             self.assertFalse(triton_client.is_model_ready(model_name, "1"))
 
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             self.assertTrue(triton_client.is_server_live())
             self.assertTrue(triton_client.is_server_ready())
             self.assertFalse(triton_client.is_model_ready(model_name, "1"))
@@ -233,35 +252,38 @@ def test_parse_error_modelfail_nostrict(self):
 
         # Inferencing with the missing model should fail.
         try:
-            iu.infer_exact(self, 'graphdef', tensor_shape, 1, np.float32,
-                           np.float32, np.float32)
-            self.assertTrue(
-                False, "expected error for unavailable model " + model_name)
+            iu.infer_exact(
+                self, "graphdef", tensor_shape, 1, np.float32, np.float32, np.float32
+            )
+            self.assertTrue(False, "expected error for unavailable model " + model_name)
         except Exception as ex:
             self.assertIn(
                 "Request for unknown model: 'graphdef_float32_float32_float32' has no available versions",
-                ex.message())
+                ex.message(),
+            )
 
         # And other models should be loaded successfully
         try:
-            for base_name in ['savedmodel', 'onnx']:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
-                    model_name = tu.get_model_name(base_name, np.float32,
-                                                   np.float32, np.float32)
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "1"))
-
-                iu.infer_exact(self,
-                               base_name,
-                               tensor_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               model_version=1)
+            for base_name in ["savedmodel", "onnx"]:
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
+                    model_name = tu.get_model_name(
+                        base_name, np.float32, np.float32, np.float32
+                    )
+                    self.assertTrue(triton_client.is_model_ready(model_name, "1"))
+
+                iu.infer_exact(
+                    self,
+                    base_name,
+                    tensor_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    model_version=1,
+                )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -269,13 +291,14 @@ def test_parse_error_no_model_config(self):
         tensor_shape = (1, 16)
 
         # Server was started but with a model that fails to be polled
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
-                model_name = tu.get_model_name('graphdef', np.float32,
-                                               np.float32, np.float32)
+                model_name = tu.get_model_name(
+                    "graphdef", np.float32, np.float32, np.float32
+                )
 
                 # expecting ready because not strict readiness
                 self.assertTrue(triton_client.is_server_live())
@@ -283,29 +306,36 @@ def test_parse_error_no_model_config(self):
 
                 md = triton_client.get_model_metadata(model_name, "1")
                 self.assertTrue(
-                    False, "expected model '" + model_name +
-                    "' to be ignored due to polling failure")
+                    False,
+                    "expected model '"
+                    + model_name
+                    + "' to be ignored due to polling failure",
+                )
 
             except Exception as ex:
                 self.assertIn(
                     "Request for unknown model: 'graphdef_float32_float32_float32' is not found",
-                    ex.message())
+                    ex.message(),
+                )
 
         # And other models should be loaded successfully
         try:
-            for base_name in ['savedmodel', 'onnx']:
-                model_name = tu.get_model_name(base_name, np.float32,
-                                               np.float32, np.float32)
+            for base_name in ["savedmodel", "onnx"]:
+                model_name = tu.get_model_name(
+                    base_name, np.float32, np.float32, np.float32
+                )
                 self.assertTrue(triton_client.is_model_ready(model_name, "1"))
 
-                iu.infer_exact(self,
-                               base_name,
-                               tensor_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               model_version=1)
+                iu.infer_exact(
+                    self,
+                    base_name,
+                    tensor_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    model_version=1,
+                )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -313,10 +343,10 @@ def test_init_error_modelfail(self):
         # --strict-readiness=true so server is live but not ready
 
         # Server was started but with models that fail to load
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 self.assertTrue(triton_client.is_server_live())
                 self.assertFalse(triton_client.is_server_ready())
@@ -331,24 +361,27 @@ def test_init_error_modelfail(self):
 
             # And other models should be loaded successfully
             try:
-                for base_name in ['graphdef', 'savedmodel', 'onnx']:
-                    model_name = tu.get_model_name(base_name, np.float32,
-                                                   np.float32, np.float32)
+                for base_name in ["graphdef", "savedmodel", "onnx"]:
+                    model_name = tu.get_model_name(
+                        base_name, np.float32, np.float32, np.float32
+                    )
                     self.assertTrue(triton_client.is_model_ready(model_name))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         try:
             tensor_shape = (1, 16)
-            for base_name in ['graphdef', 'savedmodel', 'onnx']:
-                iu.infer_exact(self,
-                               base_name,
-                               tensor_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               model_version=1)
+            for base_name in ["graphdef", "savedmodel", "onnx"]:
+                iu.infer_exact(
+                    self,
+                    base_name,
+                    tensor_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    model_version=1,
+                )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -357,95 +390,105 @@ def test_parse_error_model_no_version(self):
         tensor_shape = (1, 16)
 
         # Server was started but with a model that fails to load
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 self.assertTrue(triton_client.is_server_live())
                 self.assertFalse(triton_client.is_server_ready())
 
-                model_name = tu.get_model_name('graphdef', np.float32,
-                                               np.float32, np.float32)
+                model_name = tu.get_model_name(
+                    "graphdef", np.float32, np.float32, np.float32
+                )
                 self.assertFalse(triton_client.is_model_ready(model_name))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
             # Sanity check that other models are loaded properly
             try:
-                for base_name in ['savedmodel', 'onnx']:
-                    model_name = tu.get_model_name(base_name, np.float32,
-                                                   np.float32, np.float32)
+                for base_name in ["savedmodel", "onnx"]:
+                    model_name = tu.get_model_name(
+                        base_name, np.float32, np.float32, np.float32
+                    )
                     self.assertTrue(triton_client.is_model_ready(model_name))
                 for version in ["1", "3"]:
-                    model_name = tu.get_model_name("plan", np.float32,
-                                                   np.float32, np.float32)
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, version))
+                    model_name = tu.get_model_name(
+                        "plan", np.float32, np.float32, np.float32
+                    )
+                    self.assertTrue(triton_client.is_model_ready(model_name, version))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         try:
-            for base_name in ['savedmodel', 'onnx']:
-                iu.infer_exact(self,
-                               base_name,
-                               tensor_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               swap=True)
+            for base_name in ["savedmodel", "onnx"]:
+                iu.infer_exact(
+                    self,
+                    base_name,
+                    tensor_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    swap=True,
+                )
             for version in [1, 3]:
-                iu.infer_exact(self,
-                               'plan',
-                               tensor_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               swap=(version == 3),
-                               model_version=version)
+                iu.infer_exact(
+                    self,
+                    "plan",
+                    tensor_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    swap=(version == 3),
+                    model_version=version,
+                )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         try:
-            iu.infer_exact(self, 'graphdef', tensor_shape, 1, np.float32,
-                           np.float32, np.float32)
-            self.assertTrue(
-                False, "expected error for unavailable model " + model_name)
+            iu.infer_exact(
+                self, "graphdef", tensor_shape, 1, np.float32, np.float32, np.float32
+            )
+            self.assertTrue(False, "expected error for unavailable model " + model_name)
         except Exception as ex:
             self.assertIn(
                 "Request for unknown model: 'graphdef_float32_float32_float32' has no available versions",
-                ex.message())
+                ex.message(),
+            )
 
     def test_parse_ignore_zero_prefixed_version(self):
         tensor_shape = (1, 16)
 
         # Server was started but only version 1 is loaded
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
 
-                model_name = tu.get_model_name('savedmodel', np.float32,
-                                               np.float32, np.float32)
+                model_name = tu.get_model_name(
+                    "savedmodel", np.float32, np.float32, np.float32
+                )
                 self.assertTrue(triton_client.is_model_ready(model_name, "1"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         try:
             # swap=False for version 1
-            iu.infer_exact(self,
-                           'savedmodel',
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           swap=False)
+            iu.infer_exact(
+                self,
+                "savedmodel",
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                swap=False,
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -453,53 +496,54 @@ def test_parse_ignore_non_intergral_version(self):
         tensor_shape = (1, 16)
 
         # Server was started but only version 1 is loaded
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
 
-                model_name = tu.get_model_name('savedmodel', np.float32,
-                                               np.float32, np.float32)
+                model_name = tu.get_model_name(
+                    "savedmodel", np.float32, np.float32, np.float32
+                )
                 self.assertTrue(triton_client.is_model_ready(model_name, "1"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         try:
             # swap=False for version 1
-            iu.infer_exact(self,
-                           'savedmodel',
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           swap=False)
+            iu.infer_exact(
+                self,
+                "savedmodel",
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                swap=False,
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
     def test_dynamic_model_load_unload(self):
         tensor_shape = (1, 16)
-        savedmodel_name = tu.get_model_name('savedmodel', np.float32,
-                                            np.float32, np.float32)
-        onnx_name = tu.get_model_name('onnx', np.float32, np.float32,
-                                      np.float32)
+        savedmodel_name = tu.get_model_name(
+            "savedmodel", np.float32, np.float32, np.float32
+        )
+        onnx_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32)
 
         # Make sure savedmodel model is not in the status (because
         # initially it is not in the model repository)
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "3"))
             except Exception as ex:
@@ -510,16 +554,14 @@ def test_dynamic_model_load_unload(self):
         try:
             shutil.copytree(savedmodel_name, "models/" + savedmodel_name)
             time.sleep(5)  # wait for model to load
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertTrue(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertTrue(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertTrue(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(savedmodel_name, "3"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "3"))
         except Exception as ex:
@@ -527,47 +569,58 @@ def test_dynamic_model_load_unload(self):
 
         # Run inference on the just loaded model
         try:
-            iu.infer_exact(self,
-                           'savedmodel',
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           swap=True)
+            iu.infer_exact(
+                self,
+                "savedmodel",
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                swap=True,
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Make sure savedmodel has execution stats
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             stats = triton_client.get_inference_statistics(savedmodel_name)
             self.assertEqual(len(stats["model_stats"]), 2)
             for idx in range(len(stats["model_stats"])):
-                self.assertEqual(stats["model_stats"][idx]["name"],
-                                 savedmodel_name)
+                self.assertEqual(stats["model_stats"][idx]["name"], savedmodel_name)
                 if stats["model_stats"][idx]["version"] == "1":
                     self.assertEqual(
-                        stats["model_stats"][idx]["inference_stats"]["success"]
-                        ["count"], 0)
+                        stats["model_stats"][idx]["inference_stats"]["success"][
+                            "count"
+                        ],
+                        0,
+                    )
                 else:
                     self.assertNotEqual(
-                        stats["model_stats"][idx]["inference_stats"]["success"]
-                        ["count"], 0)
-
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+                        stats["model_stats"][idx]["inference_stats"]["success"][
+                            "count"
+                        ],
+                        0,
+                    )
+
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             stats = triton_client.get_inference_statistics(savedmodel_name)
             self.assertEqual(len(stats.model_stats), 2)
             for idx in range(len(stats.model_stats)):
                 self.assertEqual(stats.model_stats[idx].name, savedmodel_name)
                 if stats.model_stats[idx].version == "1":
                     self.assertEqual(
-                        stats.model_stats[idx].inference_stats.success.count, 0)
+                        stats.model_stats[idx].inference_stats.success.count, 0
+                    )
                 else:
                     self.assertNotEqual(
-                        stats.model_stats[idx].inference_stats.success.count, 0)
+                        stats.model_stats[idx].inference_stats.success.count, 0
+                    )
 
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
@@ -577,16 +630,14 @@ def test_dynamic_model_load_unload(self):
         try:
             shutil.rmtree("models/" + savedmodel_name)
             time.sleep(5)  # wait for model to unload
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "3"))
         except Exception as ex:
@@ -594,62 +645,65 @@ def test_dynamic_model_load_unload(self):
 
         # Model is removed so inference should fail
         try:
-            iu.infer_exact(self,
-                           'savedmodel',
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           swap=True)
+            iu.infer_exact(
+                self,
+                "savedmodel",
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                swap=True,
+            )
             self.assertTrue(
-                False,
-                "expected error for unavailable model " + savedmodel_name)
+                False, "expected error for unavailable model " + savedmodel_name
+            )
         except Exception as ex:
             self.assertIn(
-                "Request for unknown model: '{}' has no available versions".
-                format(savedmodel_name), ex.message())
+                "Request for unknown model: '{}' has no available versions".format(
+                    savedmodel_name
+                ),
+                ex.message(),
+            )
 
         # Add back the same model. The status/stats should be reset.
         try:
             shutil.copytree(savedmodel_name, "models/" + savedmodel_name)
             time.sleep(5)  # wait for model to load
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertTrue(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertTrue(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertTrue(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(savedmodel_name, "3"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "3"))
 
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             stats = triton_client.get_inference_statistics(savedmodel_name)
             self.assertEqual(len(stats["model_stats"]), 2)
             self.assertEqual(stats["model_stats"][0]["name"], savedmodel_name)
             self.assertEqual(stats["model_stats"][1]["name"], savedmodel_name)
             self.assertEqual(
-                stats["model_stats"][0]["inference_stats"]["success"]["count"],
-                0)
+                stats["model_stats"][0]["inference_stats"]["success"]["count"], 0
+            )
             self.assertEqual(
-                stats["model_stats"][1]["inference_stats"]["success"]["count"],
-                0)
+                stats["model_stats"][1]["inference_stats"]["success"]["count"], 0
+            )
 
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             stats = triton_client.get_inference_statistics(savedmodel_name)
             self.assertEqual(len(stats.model_stats), 2)
             self.assertEqual(stats.model_stats[0].name, savedmodel_name)
             self.assertEqual(stats.model_stats[1].name, savedmodel_name)
-            self.assertEqual(stats.model_stats[0].inference_stats.success.count,
-                             0)
-            self.assertEqual(stats.model_stats[1].inference_stats.success.count,
-                             0)
+            self.assertEqual(stats.model_stats[0].inference_stats.success.count, 0)
+            self.assertEqual(stats.model_stats[1].inference_stats.success.count, 0)
 
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
@@ -659,16 +713,14 @@ def test_dynamic_model_load_unload(self):
         try:
             shutil.rmtree("models/" + onnx_name)
             time.sleep(5)  # wait for model to unload
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertTrue(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertTrue(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertTrue(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(savedmodel_name, "3"))
                 self.assertFalse(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertFalse(triton_client.is_model_ready(onnx_name, "3"))
         except Exception as ex:
@@ -676,41 +728,41 @@ def test_dynamic_model_load_unload(self):
 
         # Model is removed so inference should fail
         try:
-            iu.infer_exact(self,
-                           'onnx',
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           swap=True)
-            self.assertTrue(False,
-                            "expected error for unavailable model " + onnx_name)
+            iu.infer_exact(
+                self,
+                "onnx",
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                swap=True,
+            )
+            self.assertTrue(False, "expected error for unavailable model " + onnx_name)
         except Exception as ex:
             self.assertIn(
                 "Request for unknown model: 'onnx_float32_float32_float32' has no available versions",
-                ex.message())
+                ex.message(),
+            )
 
     def test_dynamic_model_load_unload_disabled(self):
         tensor_shape = (1, 16)
-        savedmodel_name = tu.get_model_name('savedmodel', np.float32,
-                                            np.float32, np.float32)
-        onnx_name = tu.get_model_name('onnx', np.float32, np.float32,
-                                      np.float32)
+        savedmodel_name = tu.get_model_name(
+            "savedmodel", np.float32, np.float32, np.float32
+        )
+        onnx_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32)
 
         # Make sure savedmodel model is not in the status (because
         # initially it is not in the model repository)
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "3"))
             except Exception as ex:
@@ -721,16 +773,14 @@ def test_dynamic_model_load_unload_disabled(self):
         try:
             shutil.copytree(savedmodel_name, "models/" + savedmodel_name)
             time.sleep(5)  # wait for model to load
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "3"))
         except Exception as ex:
@@ -738,37 +788,38 @@ def test_dynamic_model_load_unload_disabled(self):
 
         # Run inference which should fail because the model isn't there
         try:
-            iu.infer_exact(self,
-                           'savedmodel',
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           swap=True)
+            iu.infer_exact(
+                self,
+                "savedmodel",
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                swap=True,
+            )
             self.assertTrue(
-                False,
-                "expected error for unavailable model " + savedmodel_name)
+                False, "expected error for unavailable model " + savedmodel_name
+            )
         except Exception as ex:
             self.assertIn(
                 "Request for unknown model: 'savedmodel_float32_float32_float32' is not found",
-                ex.message())
+                ex.message(),
+            )
 
         # Remove one of the original models from the model repository.
         # Unloading is disabled so it should remain available in the status.
         try:
             shutil.rmtree("models/" + onnx_name)
             time.sleep(5)  # wait for model to unload (but it shouldn't)
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "3"))
         except Exception as ex:
@@ -777,84 +828,93 @@ def test_dynamic_model_load_unload_disabled(self):
         # Run inference to make sure model still being served even
         # though deleted from model repository
         try:
-            iu.infer_exact(self,
-                           'onnx',
-                           tensor_shape,
-                           1,
-                           np.float32,
-                           np.float32,
-                           np.float32,
-                           swap=True)
+            iu.infer_exact(
+                self,
+                "onnx",
+                tensor_shape,
+                1,
+                np.float32,
+                np.float32,
+                np.float32,
+                swap=True,
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
     def test_dynamic_version_load_unload(self):
         tensor_shape = (1, 16)
-        graphdef_name = tu.get_model_name('graphdef', np.int32, np.int32,
-                                          np.int32)
+        graphdef_name = tu.get_model_name("graphdef", np.int32, np.int32, np.int32)
 
         # There are 3 versions. Make sure that all have status and are
         # ready.
         try:
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "1"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "2"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "3"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "2"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "3"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Run inference on version 1 to make sure it is available
         try:
-            iu.infer_exact(self,
-                           'graphdef',
-                           tensor_shape,
-                           1,
-                           np.int32,
-                           np.int32,
-                           np.int32,
-                           swap=False,
-                           model_version=1)
+            iu.infer_exact(
+                self,
+                "graphdef",
+                tensor_shape,
+                1,
+                np.int32,
+                np.int32,
+                np.int32,
+                swap=False,
+                model_version=1,
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Make sure only version 1 has execution stats in the status.
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             stats = triton_client.get_inference_statistics(graphdef_name)
             self.assertEqual(len(stats["model_stats"]), 3)
             for idx in range(len(stats["model_stats"])):
-                self.assertEqual(stats["model_stats"][idx]["name"],
-                                 graphdef_name)
+                self.assertEqual(stats["model_stats"][idx]["name"], graphdef_name)
                 if stats["model_stats"][idx]["version"] == "1":
                     self.assertNotEqual(
-                        stats["model_stats"][idx]["inference_stats"]["success"]
-                        ["count"], 0)
+                        stats["model_stats"][idx]["inference_stats"]["success"][
+                            "count"
+                        ],
+                        0,
+                    )
                 else:
                     self.assertEqual(
-                        stats["model_stats"][idx]["inference_stats"]["success"]
-                        ["count"], 0)
-
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+                        stats["model_stats"][idx]["inference_stats"]["success"][
+                            "count"
+                        ],
+                        0,
+                    )
+
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             stats = triton_client.get_inference_statistics(graphdef_name)
             self.assertEqual(len(stats.model_stats), 3)
             for idx in range(len(stats.model_stats)):
                 self.assertEqual(stats.model_stats[idx].name, graphdef_name)
                 if stats.model_stats[idx].version == "1":
                     self.assertNotEqual(
-                        stats.model_stats[idx].inference_stats.success.count, 0)
+                        stats.model_stats[idx].inference_stats.success.count, 0
+                    )
                 else:
                     self.assertEqual(
-                        stats.model_stats[idx].inference_stats.success.count, 0)
+                        stats.model_stats[idx].inference_stats.success.count, 0
+                    )
 
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
@@ -864,87 +924,81 @@ def test_dynamic_version_load_unload(self):
         try:
             shutil.rmtree("models/" + graphdef_name + "/1")
             time.sleep(5)  # wait for version to unload
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
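+                # Readiness is checked over both the HTTP and GRPC endpoints;
+                # only version 1 should have dropped out after its directory
+                # was removed above.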
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(graphdef_name, "1"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "2"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(graphdef_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "2"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "3"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Version is removed so inference should fail
         try:
-            iu.infer_exact(self,
-                           'graphdef',
-                           tensor_shape,
-                           1,
-                           np.int32,
-                           np.int32,
-                           np.int32,
-                           swap=False,
-                           model_version=1)
+            iu.infer_exact(
+                self,
+                "graphdef",
+                tensor_shape,
+                1,
+                np.int32,
+                np.int32,
+                np.int32,
+                swap=False,
+                model_version=1,
+            )
             self.assertTrue(
-                False, "expected error for unavailable model " + graphdef_name)
+                False, "expected error for unavailable model " + graphdef_name
+            )
         except Exception as ex:
             self.assertIn(
                 "Request for unknown model: 'graphdef_int32_int32_int32' version 1 is not at ready state",
-                ex.message())
+                ex.message(),
+            )
 
         # Add another version to the model repository.
         try:
-            shutil.copytree("models/" + graphdef_name + "/2",
-                            "models/" + graphdef_name + "/7")
+            shutil.copytree(
+                "models/" + graphdef_name + "/2", "models/" + graphdef_name + "/7"
+            )
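+            # Version 7 is a copy of version 2, so once the repository is
+            # polled it should become ready alongside versions 2 and 3.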
             time.sleep(5)  # wait for version to load
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(graphdef_name, "1"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "2"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "3"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "7"))
+                self.assertFalse(triton_client.is_model_ready(graphdef_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "2"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "3"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "7"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
     def test_dynamic_version_load_unload_disabled(self):
         tensor_shape = (1, 16)
-        graphdef_name = tu.get_model_name('graphdef', np.int32, np.int32,
-                                          np.int32)
+        graphdef_name = tu.get_model_name("graphdef", np.int32, np.int32, np.int32)
 
         # Add a new version to the model repository and give it time to
         # load. But it shouldn't load because dynamic loading is
         # disabled.
         try:
-            shutil.copytree("models/" + graphdef_name + "/2",
-                            "models/" + graphdef_name + "/7")
+            shutil.copytree(
+                "models/" + graphdef_name + "/2", "models/" + graphdef_name + "/7"
+            )
             time.sleep(5)  # wait for model to load
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "1"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "2"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "3"))
-                self.assertFalse(
-                    triton_client.is_model_ready(graphdef_name, "7"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "2"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(graphdef_name, "7"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -954,59 +1008,54 @@ def test_dynamic_version_load_unload_disabled(self):
         try:
             shutil.rmtree("models/" + graphdef_name + "/1")
             time.sleep(5)  # wait for version to unload (but it shouldn't)
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "1"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "2"))
-                self.assertTrue(triton_client.is_model_ready(
-                    graphdef_name, "3"))
-                self.assertFalse(
-                    triton_client.is_model_ready(graphdef_name, "7"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "2"))
+                self.assertTrue(triton_client.is_model_ready(graphdef_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(graphdef_name, "7"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Run inference to make sure the model is still being served even
         # though the version was deleted from the model repository
         try:
-            iu.infer_exact(self,
-                           'graphdef',
-                           tensor_shape,
-                           1,
-                           np.int32,
-                           np.int32,
-                           np.int32,
-                           swap=False,
-                           model_version=1)
+            iu.infer_exact(
+                self,
+                "graphdef",
+                tensor_shape,
+                1,
+                np.int32,
+                np.int32,
+                np.int32,
+                swap=False,
+                model_version=1,
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
     def test_dynamic_model_modify(self):
-        models_base = ('savedmodel', 'plan')
+        models_base = ("savedmodel", "plan")
         models_shape = ((1, 16), (1, 16))
         models = list()
         for m in models_base:
-            models.append(
-                tu.get_model_name(m, np.float32, np.float32, np.float32))
+            models.append(tu.get_model_name(m, np.float32, np.float32, np.float32))
 
         # Make sure savedmodel and plan are in the status
         for model_name in models:
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertTrue(triton_client.is_model_ready(model_name, "1"))
+                    self.assertTrue(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -1014,63 +1063,67 @@ def test_dynamic_model_modify(self):
         for version in (1, 3):
             for model_name, model_shape in zip(models_base, models_shape):
                 try:
-                    iu.infer_exact(self,
-                                   model_name,
-                                   model_shape,
-                                   1,
-                                   np.float32,
-                                   np.float32,
-                                   np.float32,
-                                   swap=(version == 3),
-                                   model_version=version)
+                    iu.infer_exact(
+                        self,
+                        model_name,
+                        model_shape,
+                        1,
+                        np.float32,
+                        np.float32,
+                        np.float32,
+                        swap=(version == 3),
+                        model_version=version,
+                    )
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Change the model configuration to use wrong label file
         for base_name, model_name in zip(models_base, models):
-            shutil.copyfile("config.pbtxt.wrong." + base_name,
-                            "models/" + model_name + "/config.pbtxt")
+            shutil.copyfile(
+                "config.pbtxt.wrong." + base_name,
+                "models/" + model_name + "/config.pbtxt",
+            )
 
         time.sleep(5)  # wait for models to reload
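+        # With the wrong label file in place, requesting classification output
+        # (output0_raw=False) should surface a label mismatch for each model.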
         for model_name in models:
             for model_name, model_shape in zip(models_base, models_shape):
                 try:
-                    iu.infer_exact(self,
-                                   model_name,
-                                   model_shape,
-                                   1,
-                                   np.float32,
-                                   np.float32,
-                                   np.float32,
-                                   swap=(version == 3),
-                                   model_version=version,
-                                   output0_raw=False)
+                    iu.infer_exact(
+                        self,
+                        model_name,
+                        model_shape,
+                        1,
+                        np.float32,
+                        np.float32,
+                        np.float32,
+                        swap=(version == 3),
+                        model_version=version,
+                        output0_raw=False,
+                    )
                     self.assertTrue(
-                        False,
-                        "expected error for wrong label for " + model_name)
+                        False, "expected error for wrong label for " + model_name
+                    )
                 except AssertionError as ex:
-                    self.assertTrue("'label9" in str(ex) and "!=" in str(ex),
-                                    str(ex))
+                    self.assertTrue("'label9" in str(ex) and "!=" in str(ex), str(ex))
 
         # Change the model configuration to use the correct label file and to
         # have the default version policy (so that only version 3 is available).
         for base_name, model_name in zip(models_base, models):
-            shutil.copyfile("config.pbtxt." + base_name,
-                            "models/" + model_name + "/config.pbtxt")
+            shutil.copyfile(
+                "config.pbtxt." + base_name, "models/" + model_name + "/config.pbtxt"
+            )
 
         time.sleep(5)  # wait for models to reload
         for model_name in models:
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertTrue(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -1078,56 +1131,58 @@ def test_dynamic_model_modify(self):
         # change in model policy makes that no longer available.
         for model_name, model_shape in zip(models_base, models_shape):
             try:
-                iu.infer_exact(self,
-                               model_name,
-                               model_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               swap=False,
-                               model_version=1)
+                iu.infer_exact(
+                    self,
+                    model_name,
+                    model_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    swap=False,
+                    model_version=1,
+                )
                 self.assertTrue(
-                    False, "expected error for unavailable model " + model_name)
+                    False, "expected error for unavailable model " + model_name
+                )
             except Exception as ex:
                 self.assertIn("Request for unknown model", ex.message())
 
         # Version 3 should continue to work...
         for model_name, model_shape in zip(models_base, models_shape):
             try:
-                iu.infer_exact(self,
-                               model_name,
-                               model_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               swap=True,
-                               model_version=3)
+                iu.infer_exact(
+                    self,
+                    model_name,
+                    model_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    swap=True,
+                    model_version=3,
+                )
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
     def test_dynamic_file_delete(self):
-        models_base = ('savedmodel', 'plan')
+        models_base = ("savedmodel", "plan")
         models_shape = ((1, 16), (1, 16))
         models = list()
         for m in models_base:
-            models.append(
-                tu.get_model_name(m, np.float32, np.float32, np.float32))
+            models.append(tu.get_model_name(m, np.float32, np.float32, np.float32))
 
         # Make sure savedmodel and plan are in the status
         for model_name in models:
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertTrue(triton_client.is_model_ready(model_name, "1"))
+                    self.assertTrue(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -1135,15 +1190,17 @@ def test_dynamic_file_delete(self):
         for version in (1, 3):
             for model_name, model_shape in zip(models_base, models_shape):
                 try:
-                    iu.infer_exact(self,
-                                   model_name,
-                                   model_shape,
-                                   1,
-                                   np.float32,
-                                   np.float32,
-                                   np.float32,
-                                   swap=(version == 3),
-                                   model_version=version)
+                    iu.infer_exact(
+                        self,
+                        model_name,
+                        model_shape,
+                        1,
+                        np.float32,
+                        np.float32,
+                        np.float32,
+                        swap=(version == 3),
+                        model_version=version,
+                    )
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -1157,81 +1214,86 @@ def test_dynamic_file_delete(self):
         time.sleep(5)  # wait for models to reload
         for model_name in models:
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertTrue(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Only version 3 (latest) should work...
         for model_name, model_shape in zip(models_base, models_shape):
             try:
-                iu.infer_exact(self,
-                               model_name,
-                               model_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               swap=True,
-                               model_version=3)
+                iu.infer_exact(
+                    self,
+                    model_name,
+                    model_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    swap=True,
+                    model_version=3,
+                )
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
             try:
-                iu.infer_exact(self,
-                               model_name,
-                               model_shape,
-                               1,
-                               np.float32,
-                               np.float32,
-                               np.float32,
-                               swap=False,
-                               model_version=1)
+                iu.infer_exact(
+                    self,
+                    model_name,
+                    model_shape,
+                    1,
+                    np.float32,
+                    np.float32,
+                    np.float32,
+                    swap=False,
+                    model_version=1,
+                )
                 self.assertTrue(
-                    False,
-                    "expected error for unavailable model " + graphdef_name)
+                    False, "expected error for unavailable model " + model_name
+                )
             except Exception as ex:
                 self.assertIn("Request for unknown model", ex.message())
 
     def test_multiple_model_repository_polling(self):
         model_shape = (1, 16)
-        savedmodel_name = tu.get_model_name('savedmodel', np.float32,
-                                            np.float32, np.float32)
+        savedmodel_name = tu.get_model_name(
+            "savedmodel", np.float32, np.float32, np.float32
+        )
 
         # Models should be loaded successfully and infer
         # successfully. Initially savedmodel only has version 1.
-        self._infer_success_models([
-            'savedmodel',
-        ], (1,), model_shape)
-        self._infer_success_models(['graphdef', 'onnx'], (1, 3), model_shape)
+        self._infer_success_models(
+            [
+                "savedmodel",
+            ],
+            (1,),
+            model_shape,
+        )
+        self._infer_success_models(["graphdef", "onnx"], (1, 3), model_shape)
 
         # Add the savedmodel to the second model repository, which should
         # cause it to be unloaded due to duplication
         shutil.copytree(savedmodel_name, "models_0/" + savedmodel_name)
         time.sleep(5)  # wait for models to reload
         try:
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
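+                # In polling mode the duplicate causes the savedmodel to be
+                # unloaded entirely, so neither version should be ready.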
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models(['graphdef', 'onnx'], (1, 3), model_shape)
+        self._infer_success_models(["graphdef", "onnx"], (1, 3), model_shape)
 
         # Remove the savedmodel from the first model repository, the
         # model from the second model repository should be loaded
@@ -1239,91 +1301,96 @@ def test_multiple_model_repository_polling(self):
         # have versions 1 and 3.
         shutil.rmtree("models/" + savedmodel_name)
         time.sleep(5)  # wait for model to unload
-        self._infer_success_models(['savedmodel', 'graphdef', 'onnx'], (1, 3),
-                                   model_shape)
+        self._infer_success_models(
+            ["savedmodel", "graphdef", "onnx"], (1, 3), model_shape
+        )
 
     def test_multiple_model_repository_control(self):
         # similar to test_multiple_model_repository_polling, but the
         # model load/unload is controlled by the API
         model_shape = (1, 16)
-        savedmodel_name = tu.get_model_name('savedmodel', np.float32,
-                                            np.float32, np.float32)
-        model_bases = ['savedmodel', 'graphdef', 'onnx']
+        savedmodel_name = tu.get_model_name(
+            "savedmodel", np.float32, np.float32, np.float32
+        )
+        model_bases = ["savedmodel", "graphdef", "onnx"]
 
         # Initially models are not loaded
         for base in model_bases:
             try:
-                model_name = tu.get_model_name(base, np.float32, np.float32,
-                                               np.float32)
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                model_name = tu.get_model_name(base, np.float32, np.float32, np.float32)
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Load all models, here we use GRPC
         for base in model_bases:
             try:
-                model_name = tu.get_model_name(base, np.float32, np.float32,
-                                               np.float32)
+                model_name = tu.get_model_name(base, np.float32, np.float32, np.float32)
                 triton_client = grpcclient.InferenceServerClient(
-                    "localhost:8001", verbose=True)
+                    "localhost:8001", verbose=True
+                )
                 triton_client.load_model(model_name)
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Models should be loaded successfully and infer
         # successfully. Initially savedmodel only has version 1.
-        self._infer_success_models([
-            'savedmodel',
-        ], (1,), model_shape)
-        self._infer_success_models(['graphdef', 'onnx'], (1, 3), model_shape)
+        self._infer_success_models(
+            [
+                "savedmodel",
+            ],
+            (1,),
+            model_shape,
+        )
+        self._infer_success_models(["graphdef", "onnx"], (1, 3), model_shape)
 
         # Add the savedmodel to the second model repository. Because polling
         # is not enabled, this doesn't change any model state; all models
         # are still loaded and available.
         shutil.copytree(savedmodel_name, "models_0/" + savedmodel_name)
-        self._infer_success_models([
-            'savedmodel',
-        ], (1,), model_shape)
-        self._infer_success_models(['graphdef', 'onnx'], (1, 3), model_shape)
+        self._infer_success_models(
+            [
+                "savedmodel",
+            ],
+            (1,),
+            model_shape,
+        )
+        self._infer_success_models(["graphdef", "onnx"], (1, 3), model_shape)
 
         # Load savedmodel again which should fail because it is now duplicated
         # in 2 model repositories. Use HTTP here.
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.load_model(savedmodel_name)
         except Exception as ex:
-            self.assertIn("failed to load '{}'".format(savedmodel_name),
-                          ex.message())
+            self.assertIn("failed to load '{}'".format(savedmodel_name), ex.message())
 
         try:
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
                 # Unlike polling mode, the failed load on the duplicate model
                 # should NOT unload the existing versions in model control mode.
-                self.assertTrue(
-                    triton_client.is_model_ready(savedmodel_name, "1"))
+                self.assertTrue(triton_client.is_model_ready(savedmodel_name, "1"))
                 # Version 3 did not exist in the first model repository, so
                 # it should still not be loaded.
-                self.assertFalse(
-                    triton_client.is_model_ready(savedmodel_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(savedmodel_name, "3"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models(['graphdef', 'onnx'], (1, 3), model_shape)
+        self._infer_success_models(["graphdef", "onnx"], (1, 3), model_shape)
 
         # Remove the savedmodel from the first model repository and
         # explicitly load savedmodel. The savedmodel from the second
@@ -1331,23 +1398,23 @@ def test_multiple_model_repository_control(self):
         # model repository savedmodel should have versions 1 and 3.
         shutil.rmtree("models/" + savedmodel_name)
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             # Unload existing in-memory model from first model repository
             triton_client.unload_model(savedmodel_name)
             # Load model from second model repository since original was deleted
             triton_client.load_model(savedmodel_name)
         except Exception as ex:
-            self.assertIn("failed to load '{}'".format(savedmodel_name),
-                          ex.message())
+            self.assertIn("failed to load '{}'".format(savedmodel_name), ex.message())
 
-        self._infer_success_models(['savedmodel', 'graphdef', 'onnx'], (1, 3),
-                                   model_shape)
+        self._infer_success_models(
+            ["savedmodel", "graphdef", "onnx"], (1, 3), model_shape
+        )
 
     def test_model_control(self):
         model_shape = (1, 16)
-        onnx_name = tu.get_model_name('onnx', np.float32, np.float32,
-                                      np.float32)
+        onnx_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32)
 
         ensemble_prefix = "simple_"
         ensemble_name = ensemble_prefix + onnx_name
@@ -1355,48 +1422,55 @@ def test_model_control(self):
         # Make sure no models are loaded
         for model_name in (onnx_name, ensemble_name):
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Load non-existent model
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 triton_client.load_model("unknown_model")
                 self.assertTrue(False, "expected unknown model failure")
             except Exception as ex:
                 self.assertIn(
                     "failed to load 'unknown_model', failed to poll from model repository",
-                    ex.message())
+                    ex.message(),
+                )
 
         # Load ensemble model, the dependent model should be polled and loaded
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.load_model(ensemble_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
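+        # Loading the ensemble implicitly pulls in its composing onnx model,
+        # so both the ensemble and the plain onnx model should now serve.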
 
-        self._infer_success_models([
-            "onnx",
-        ], (1, 3), model_shape)
-        self._infer_success_models([
-            "simple_onnx",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (1, 3),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_onnx",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
 
         # Delete model configuration for onnx, which will cause
         # the autofiller to use the latest version policy so that only
@@ -1404,51 +1478,65 @@ def test_model_control(self):
         for model_name in (onnx_name,):
             os.remove("models/" + model_name + "/config.pbtxt")
 
-        self._infer_success_models([
-            "onnx",
-        ], (1, 3), model_shape)
-        self._infer_success_models([
-            "simple_onnx",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (1, 3),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_onnx",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
 
         # Reload models, only version 3 should be available for onnx
         for model_name in (onnx_name, ensemble_name):
             try:
                 triton_client = grpcclient.InferenceServerClient(
-                    "localhost:8001", verbose=True)
+                    "localhost:8001", verbose=True
+                )
                 triton_client.load_model(model_name)
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models([
-            "onnx",
-        ], (3,), model_shape)
-        self._infer_success_models([
-            "simple_onnx",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (3,),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_onnx",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
 
         for model_name in (onnx_name,):
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Unload a non-existent model; nothing should happen
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 triton_client.unload_model("unknown_model")
             except Exception as ex:
@@ -1457,24 +1545,23 @@ def test_model_control(self):
         # Unload the model that the ensemble depends on; as a side effect,
         # the ensemble model will be forced to be unloaded
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.unload_model(onnx_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         for model_name in (onnx_name, ensemble_name):
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -1482,41 +1569,43 @@ def test_model_control(self):
         # model. The ensemble model should not be reloaded because it
         # was explicitly unloaded.
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.unload_model(ensemble_name)
             triton_client.load_model(onnx_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models([
-            "onnx",
-        ], (3,), model_shape)
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (3,),
+            model_shape,
+        )
 
         try:
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(ensemble_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(ensemble_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(ensemble_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(ensemble_name, "3"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
     def test_model_control_fail(self):
-        model_name = tu.get_model_name('onnx', np.float32, np.float32,
-                                       np.float32)
+        model_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32)
 
         # Make sure no models are loaded
         try:
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
                 self.assertFalse(triton_client.is_model_ready(model_name, "1"))
@@ -1526,28 +1615,27 @@ def test_model_control_fail(self):
 
         # Request to load the model and expect the load to fail
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.load_model(model_name)
             self.assertTrue(False, "expecting load failure")
         except InferenceServerException as ex:
-            self.assertIn("load failed for model '{}'".format(model_name),
-                          ex.message())
+            self.assertIn("load failed for model '{}'".format(model_name), ex.message())
 
         # Another attempt should fail as well
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.load_model(model_name)
             self.assertTrue(False, "expecting load failure")
         except InferenceServerException as ex:
-            self.assertIn("load failed for model '{}'".format(model_name),
-                          ex.message())
+            self.assertIn("load failed for model '{}'".format(model_name), ex.message())
 
     def test_model_control_ensemble(self):
         model_shape = (1, 16)
-        onnx_name = tu.get_model_name('onnx', np.float32, np.float32,
-                                      np.float32)
+        onnx_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32)
 
         ensemble_prefix = "simple_"
         ensemble_name = ensemble_prefix + onnx_name
@@ -1555,83 +1643,91 @@ def test_model_control_ensemble(self):
         # Make sure no models are loaded
         for model_name in (onnx_name, ensemble_name):
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Load ensemble model, the dependent model should be polled and loaded
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.load_model(ensemble_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models([
-            "onnx",
-        ], (1, 3), model_shape)
-        self._infer_success_models([
-            "simple_onnx",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (1, 3),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_onnx",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
 
         # Unload the ensemble with the unload_dependents flag. All models should be unloaded
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.unload_model(ensemble_name, unload_dependents=True)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
         for model_name in (onnx_name, ensemble_name):
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Load ensemble model, and unload it without unload_dependents flag (default).
         # The dependent model should still be available
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.load_model(ensemble_name)
             triton_client.unload_model(ensemble_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models([
-            "onnx",
-        ], (1, 3), model_shape)
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (1, 3),
+            model_shape,
+        )
 
         try:
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(ensemble_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(ensemble_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(ensemble_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(ensemble_name, "3"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "1"))
                 self.assertTrue(triton_client.is_model_ready(onnx_name, "3"))
         except Exception as ex:
@@ -1639,8 +1735,7 @@ def test_model_control_ensemble(self):
 
     def test_load_same_model_different_platform(self):
         model_shape = (1, 16)
-        model_name = tu.get_model_name('simple', np.float32, np.float32,
-                                       np.float32)
+        model_name = tu.get_model_name("simple", np.float32, np.float32, np.float32)
 
         # Check whether or not to use grpc protocol
         use_grpc = "TRITONSERVER_USE_GRPC" in os.environ
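+        # The protocol toggle is read from the environment; self._get_client
+        # presumably maps it to a client, roughly (sketch, not the actual helper):
+        #   client = (grpcclient.InferenceServerClient("localhost:8001", verbose=True)
+        #             if use_grpc else
+        #             httpclient.InferenceServerClient("localhost:8000", verbose=True))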
@@ -1654,19 +1749,22 @@ def test_load_same_model_different_platform(self):
             self.assertTrue(triton_client.is_model_ready(model_name, "1"))
             self.assertTrue(triton_client.is_model_ready(model_name, "3"))
             if use_grpc:
-                metadata = triton_client.get_model_metadata(model_name,
-                                                            as_json=True)
+                metadata = triton_client.get_model_metadata(model_name, as_json=True)
             else:
                 metadata = triton_client.get_model_metadata(model_name)
             self.assertEqual(metadata["platform"], "tensorrt_plan")
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
-        self._infer_success_models([
-            "simple",
-        ], (
-            1,
-            3,
-        ), model_shape)
+        self._infer_success_models(
+            [
+                "simple",
+            ],
+            (
+                1,
+                3,
+            ),
+            model_shape,
+        )
 
         # Copy the same model built for a different platform into the model repository
         shutil.rmtree("models/" + model_name)
@@ -1688,19 +1786,22 @@ def test_load_same_model_different_platform(self):
             self.assertTrue(triton_client.is_model_ready(model_name, "1"))
             self.assertTrue(triton_client.is_model_ready(model_name, "3"))
             if use_grpc:
-                metadata = triton_client.get_model_metadata(model_name,
-                                                            as_json=True)
+                metadata = triton_client.get_model_metadata(model_name, as_json=True)
             else:
                 metadata = triton_client.get_model_metadata(model_name)
             self.assertEqual(metadata["platform"], "pytorch_libtorch")
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
-        self._infer_success_models([
-            "simple",
-        ], (
-            1,
-            3,
-        ), model_shape)
+        self._infer_success_models(
+            [
+                "simple",
+            ],
+            (
+                1,
+                3,
+            ),
+            model_shape,
+        )
 
     def test_model_availability_on_reload(self):
         model_name = "identity_zero_1_int32"
@@ -1725,9 +1826,8 @@ def test_model_availability_on_reload(self):
 
         # Reload models, v1 should still be available until v2 is loaded
         # The load is requested in another thread as it is a blocking API,
-        # and the v1 availibility should be tested during the reload
-        thread = threading.Thread(target=self._async_load,
-                                  args=(model_name, use_grpc))
+        # and the v1 availability should be tested during the reload
+        thread = threading.Thread(target=self._async_load, args=(model_name, use_grpc))
         thread.start()
         # wait for time < model creation delay to ensure load request is sent
         time.sleep(3)
@@ -1738,9 +1838,12 @@ def test_model_availability_on_reload(self):
             triton_client = self._get_client(use_grpc)
             self.assertTrue(triton_client.is_server_live())
             load_end = time.time()
-            self.assertTrue((load_end - load_start) < 5,
-                            "server was waiting unexpectly, waited {}".format(
-                                (load_end - load_start)))
+            self.assertTrue(
+                (load_end - load_start) < 5,
+                "server was waiting unexpectedly, waited {}".format(
+                    (load_end - load_start)
+                ),
+            )
             self.assertTrue(triton_client.is_server_ready())
             self.assertTrue(triton_client.is_model_ready(model_name, "1"))
         except Exception as ex:
@@ -1778,14 +1881,12 @@ def test_model_availability_on_reload_2(self):
         self._infer_success_identity(model_base, (1,), np.int32, model_shape)
 
         # Overwrite config.pbtxt to load v2 only
-        shutil.copyfile("config.pbtxt.v2",
-                        "models/" + model_name + "/config.pbtxt")
+        shutil.copyfile("config.pbtxt.v2", "models/" + model_name + "/config.pbtxt")
 
         # Reload models, v1 should still be available until v2 is loaded
         # The load is requested in another thread as it is a blocking API,
-        # and the v1 availibility should be tested during the reload
-        thread = threading.Thread(target=self._async_load,
-                                  args=(model_name, use_grpc))
+        # and the v1 availability should be tested during the reload
+        thread = threading.Thread(target=self._async_load, args=(model_name, use_grpc))
         thread.start()
         # wait for time < model creation delay to ensure load request is sent
         time.sleep(3)
@@ -1796,9 +1897,12 @@ def test_model_availability_on_reload_2(self):
             triton_client = self._get_client(use_grpc)
             self.assertTrue(triton_client.is_server_live())
             load_end = time.time()
-            self.assertTrue((load_end - load_start) < 5,
-                            "server was waiting unexpectly, waited {}".format(
-                                (load_end - load_start)))
+            self.assertTrue(
+                (load_end - load_start) < 5,
+                "server was waiting unexpectedly, waited {}".format(
+                    (load_end - load_start)
+                ),
+            )
             self.assertTrue(triton_client.is_server_ready())
             self.assertTrue(triton_client.is_model_ready(model_name, "1"))
         except Exception as ex:
@@ -1836,13 +1940,11 @@ def test_model_availability_on_reload_3(self):
         self._infer_success_identity(model_base, (1,), np.int32, model_shape)
 
         # Overwrite config.pbtxt to load v2 only
-        shutil.copyfile("config.pbtxt.new",
-                        "models/" + model_name + "/config.pbtxt")
+        shutil.copyfile("config.pbtxt.new", "models/" + model_name + "/config.pbtxt")
 
         # Reload models, v1 will be reloaded but it should be available
         # during the whole reload
-        thread = threading.Thread(target=self._async_load,
-                                  args=(model_name, use_grpc))
+        thread = threading.Thread(target=self._async_load, args=(model_name, use_grpc))
         thread.start()
         # wait for time < model creation delay to ensure load request is sent
         time.sleep(3)
@@ -1853,9 +1955,12 @@ def test_model_availability_on_reload_3(self):
             triton_client = self._get_client(use_grpc)
             self.assertTrue(triton_client.is_server_live())
             load_end = time.time()
-            self.assertTrue((load_end - load_start) < 5,
-                            "server was waiting unexpectly, waited {}".format(
-                                (load_end - load_start)))
+            self.assertTrue(
+                (load_end - load_start) < 5,
+                "server was waiting unexpectedly, waited {}".format(
+                    (load_end - load_start)
+                ),
+            )
             self.assertTrue(triton_client.is_server_ready())
             self.assertTrue(triton_client.is_model_ready(model_name, "1"))
         except Exception as ex:
@@ -1880,8 +1985,9 @@ def test_model_reload_fail(self):
 
         # Make sure version 1 of the model is loaded
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             self.assertTrue(triton_client.is_server_live())
             self.assertTrue(triton_client.is_server_ready())
             self.assertTrue(triton_client.is_model_ready(model_name, "1"))
@@ -1890,24 +1996,26 @@ def test_model_reload_fail(self):
         self._infer_success_identity(model_base, (1,), np.int32, model_shape)
 
         # Overwrite config.pbtxt to load v2 only on GPU, which will fail
-        shutil.copyfile("config.pbtxt.v2.gpu",
-                        "models/" + model_name + "/config.pbtxt")
+        shutil.copyfile("config.pbtxt.v2.gpu", "models/" + model_name + "/config.pbtxt")
 
         # Reload models, v1 should still be available even if v2 fails to load
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.load_model(model_name)
             self.assertTrue(False, "expecting load failure")
         except Exception as ex:
             self.assertIn(
                 "version 2 is at UNAVAILABLE state: Internal: GPU instances not supported",
-                ex.message())
+                ex.message(),
+            )
 
         # Make sure version 1 of the model is available, and version 2 is not
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             self.assertTrue(triton_client.is_server_live())
             self.assertTrue(triton_client.is_server_ready())
             self.assertTrue(triton_client.is_model_ready(model_name, "1"))
@@ -1918,113 +2026,143 @@ def test_model_reload_fail(self):
 
     def test_multiple_model_repository_control_startup_models(self):
         model_shape = (1, 16)
-        onnx_name = tu.get_model_name('onnx', np.float32, np.float32,
-                                      np.float32)
-        plan_name = tu.get_model_name('plan', np.float32, np.float32,
-                                      np.float32)
+        onnx_name = tu.get_model_name("onnx", np.float32, np.float32, np.float32)
+        plan_name = tu.get_model_name("plan", np.float32, np.float32, np.float32)
 
         ensemble_prefix = "simple_"
         onnx_ensemble_name = ensemble_prefix + onnx_name
         plan_ensemble_name = ensemble_prefix + plan_name
 
         # Make sure unloaded models are not in the status
-        for base in ('savedmodel',):
-            model_name = tu.get_model_name(base, np.float32, np.float32,
-                                           np.float32)
+        for base in ("savedmodel",):
+            model_name = tu.get_model_name(base, np.float32, np.float32, np.float32)
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
         # And loaded models work properly
-        self._infer_success_models([
-            "onnx",
-        ], (1, 3), model_shape)
-        self._infer_success_models([
-            "simple_onnx",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
-        self._infer_success_models([
-            "plan",
-        ], (1, 3), model_shape)
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (1, 3),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_onnx",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
+        self._infer_success_models(
+            [
+                "plan",
+            ],
+            (1, 3),
+            model_shape,
+        )
 
         # Load non-existing model
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 triton_client.load_model("unknown_model")
                 self.assertTrue(False, "expected unknown model failure")
             except Exception as ex:
                 self.assertIn(
                     "failed to load 'unknown_model', failed to poll from model repository",
-                    ex.message())
+                    ex.message(),
+                )
 
         # Load plan ensemble model, the dependent model is already
         # loaded via command-line
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.load_model(plan_ensemble_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models([
-            "plan",
-        ], (1, 3), model_shape)
-        self._infer_success_models([
-            "simple_plan",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
+        self._infer_success_models(
+            [
+                "plan",
+            ],
+            (1, 3),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_plan",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
 
         # Delete model configuration, which will cause the autofiller
         # to use the latest version policy so that only version 3 will
         # be available if the models are re-loaded
         os.remove("models/" + onnx_name + "/config.pbtxt")
 
-        self._infer_success_models([
-            "plan",
-        ], (1, 3), model_shape)
-        self._infer_success_models([
-            "simple_plan",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
+        self._infer_success_models(
+            [
+                "plan",
+            ],
+            (1, 3),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_plan",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
 
         # Reload onnx, only version 3 should be available
         try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             triton_client.load_model(onnx_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models([
-            "onnx",
-        ], (3,), model_shape)
-        self._infer_success_models([
-            "simple_onnx",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
-
-        try:
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (3,),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_onnx",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
+
+        try:
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
                 self.assertFalse(triton_client.is_model_ready(onnx_name, "1"))
@@ -2032,10 +2170,10 @@ def test_multiple_model_repository_control_startup_models(self):
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Unload non-existing model, nothing should happen
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
             try:
                 triton_client.unload_model("unknown_model")
             except Exception as ex:
@@ -2044,24 +2182,23 @@ def test_multiple_model_repository_control_startup_models(self):
         # Unload the onnx, as a side effect, the ensemble model
         # will be forced to be unloaded
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.unload_model(onnx_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         for model_name in [onnx_name, onnx_ensemble_name]:
             try:
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
             except Exception as ex:
                 self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -2069,36 +2206,46 @@ def test_multiple_model_repository_control_startup_models(self):
         # depending model. The ensemble model should not be reloaded
         # because it was explicitly unloaded.
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             triton_client.unload_model(onnx_ensemble_name)
             triton_client.load_model(onnx_name)
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
-        self._infer_success_models([
-            "onnx",
-        ], (3,), model_shape)
-        self._infer_success_models([
-            "plan",
-        ], (1, 3), model_shape)
-        self._infer_success_models([
-            "simple_plan",
-        ], (1, 3),
-                                   model_shape,
-                                   swap=True)
-
-        try:
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+        self._infer_success_models(
+            [
+                "onnx",
+            ],
+            (3,),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "plan",
+            ],
+            (1, 3),
+            model_shape,
+        )
+        self._infer_success_models(
+            [
+                "simple_plan",
+            ],
+            (1, 3),
+            model_shape,
+            swap=True,
+        )
+
+        try:
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 self.assertTrue(triton_client.is_server_live())
                 self.assertTrue(triton_client.is_server_ready())
-                self.assertFalse(
-                    triton_client.is_model_ready(onnx_ensemble_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(onnx_ensemble_name, "3"))
+                self.assertFalse(triton_client.is_model_ready(onnx_ensemble_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(onnx_ensemble_name, "3"))
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -2106,7 +2253,7 @@ def test_model_repository_index(self):
         # use model control EXPLICIT and --load-model to load a subset of models
         # in model repository
         tensor_shape = (1, 16)
-        model_bases = ['graphdef', 'savedmodel', "simple_savedmodel"]
+        model_bases = ["graphdef", "savedmodel", "simple_savedmodel"]
 
         # Sanity check on loaded models
         # 3 models should be loaded:
@@ -2115,12 +2262,13 @@ def test_model_repository_index(self):
         #     graphdef_float32_float32_float32
         for model_base in model_bases:
             try:
-                model_name = tu.get_model_name(model_base, np.float32,
-                                               np.float32, np.float32)
-                for triton_client in (httpclient.InferenceServerClient(
-                        "localhost:8000", verbose=True),
-                                      grpcclient.InferenceServerClient(
-                                          "localhost:8001", verbose=True)):
+                model_name = tu.get_model_name(
+                    model_base, np.float32, np.float32, np.float32
+                )
+                for triton_client in (
+                    httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                    grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+                ):
                     self.assertTrue(triton_client.is_server_live())
                     self.assertTrue(triton_client.is_server_ready())
                     self.assertTrue(triton_client.is_model_ready(model_name))
@@ -2132,8 +2280,9 @@ def test_model_repository_index(self):
         # which appears in two repositories.
         model_bases.append("simple_graphdef")
         try:
-            triton_client = httpclient.InferenceServerClient("localhost:8000",
-                                                             verbose=True)
+            triton_client = httpclient.InferenceServerClient(
+                "localhost:8000", verbose=True
+            )
             index = triton_client.get_model_repository_index()
             indexed = list()
             self.assertEqual(len(index), 8)
@@ -2142,15 +2291,17 @@ def test_model_repository_index(self):
                 if i["name"] == "onnx_float32_float32_float32":
                     self.assertEqual(i["state"], "UNAVAILABLE")
                     self.assertEqual(
-                        i["reason"],
-                        "model appears in two or more repositories")
+                        i["reason"], "model appears in two or more repositories"
+                    )
             for model_base in model_bases:
-                model_name = tu.get_model_name(model_base, np.float32,
-                                               np.float32, np.float32)
+                model_name = tu.get_model_name(
+                    model_base, np.float32, np.float32, np.float32
+                )
                 self.assertTrue(model_name in indexed)
 
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             index = triton_client.get_model_repository_index()
             indexed = list()
             self.assertEqual(len(index.models), 8)
@@ -2159,10 +2310,12 @@ def test_model_repository_index(self):
                 if i.name == "onnx_float32_float32_float32":
                     self.assertEqual(i.state, "UNAVAILABLE")
                     self.assertEqual(
-                        i.reason, "model appears in two or more repositories")
+                        i.reason, "model appears in two or more repositories"
+                    )
             for model_base in model_bases:
-                model_name = tu.get_model_name(model_base, np.float32,
-                                               np.float32, np.float32)
+                model_name = tu.get_model_name(
+                    model_base, np.float32, np.float32, np.float32
+                )
                 self.assertTrue(model_name in indexed)
 
         except Exception as ex:
@@ -2171,21 +2324,19 @@ def test_model_repository_index(self):
     def test_config_override(self):
         model_shape = (1, 16)
 
-        for triton_client in (httpclient.InferenceServerClient("localhost:8000",
-                                                               verbose=True),
-                              grpcclient.InferenceServerClient("localhost:8001",
-                                                               verbose=True)):
-            for base in (('onnx', 'onnxruntime'),):
-                model_name = tu.get_model_name(base[0], np.float32, np.float32,
-                                               np.float32)
+        for triton_client in (
+            httpclient.InferenceServerClient("localhost:8000", verbose=True),
+            grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+        ):
+            for base in (("onnx", "onnxruntime"),):
+                model_name = tu.get_model_name(
+                    base[0], np.float32, np.float32, np.float32
+                )
                 try:
                     self.assertTrue(triton_client.is_server_live())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "2"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "2"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -2194,18 +2345,23 @@ def test_config_override(self):
                 try:
                     triton_client.load_model(model_name)
                     self.assertTrue(
-                        False, "expected fail to load '{}'".format(model_name))
+                        False, "expected fail to load '{}'".format(model_name)
+                    )
                 except Exception as ex:
                     self.assertIn(
-                        "load failed for model '{}'".format(model_name),
-                        ex.message())
+                        "load failed for model '{}'".format(model_name), ex.message()
+                    )
 
                 # Request to load the model with provided "correct" config
                 try:
-                    triton_client.load_model(model_name,
-                                             config="""
+                    triton_client.load_model(
+                        model_name,
+                        config="""
 {{"backend":"{backend}","version_policy":{{"specific" : {{ "versions": [2] }} }} }}
-""".format(backend=base[1]))
+""".format(
+                            backend=base[1]
+                        ),
+                    )
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
                 self.assertFalse(triton_client.is_model_ready(model_name, "1"))
@@ -2213,67 +2369,61 @@ def test_config_override(self):
                 self.assertFalse(triton_client.is_model_ready(model_name, "3"))
 
                 # And loaded models work properly
-                self._infer_success_models([
-                    base[0],
-                ], (2,), model_shape)
+                self._infer_success_models(
+                    [
+                        base[0],
+                    ],
+                    (2,),
+                    model_shape,
+                )
 
                 # A request without additional config will load with the default
                 # config and is expected to fail; version 2 will not be unloaded.
                 try:
                     triton_client.load_model(model_name)
                     self.assertTrue(
-                        False, "expected fail to load '{}'".format(model_name))
+                        False, "expected fail to load '{}'".format(model_name)
+                    )
                 except Exception as ex:
                     self.assertIn(
-                        "load failed for model '{}'".format(model_name),
-                        ex.message())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "2"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                        "load failed for model '{}'".format(model_name), ex.message()
+                    )
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertTrue(triton_client.is_model_ready(model_name, "2"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
 
                 # Unload model for the next client iteration
                 try:
                     triton_client.unload_model(model_name)
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "2"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "2"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "3"))
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
     def test_file_override(self):
-
         model_shape = (1, 16)
         override_base = "override_model"
 
-        for base in (('onnx', 'onnxruntime'),):
-            model_name = tu.get_model_name(base[0], np.float32, np.float32,
-                                           np.float32)
-            override_model_name = tu.get_model_name(override_base, np.float32,
-                                                    np.float32, np.float32)
+        for base in (("onnx", "onnxruntime"),):
+            model_name = tu.get_model_name(base[0], np.float32, np.float32, np.float32)
+            override_model_name = tu.get_model_name(
+                override_base, np.float32, np.float32, np.float32
+            )
 
             # Prepare override file
-            with open("models/{}/3/model.{}".format(model_name, base[0]),
-                      'rb') as f:
+            with open("models/{}/3/model.{}".format(model_name, base[0]), "rb") as f:
                 file_content = f.read()
 
-            for triton_client in (httpclient.InferenceServerClient(
-                    "localhost:8000", verbose=True),
-                                  grpcclient.InferenceServerClient(
-                                      "localhost:8001", verbose=True)):
+            for triton_client in (
+                httpclient.InferenceServerClient("localhost:8000", verbose=True),
+                grpcclient.InferenceServerClient("localhost:8001", verbose=True),
+            ):
                 try:
                     self.assertTrue(triton_client.is_server_live())
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "1"))
-                    self.assertFalse(
-                        triton_client.is_model_ready(model_name, "2"))
-                    self.assertTrue(
-                        triton_client.is_model_ready(model_name, "3"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "1"))
+                    self.assertFalse(triton_client.is_model_ready(model_name, "2"))
+                    self.assertTrue(triton_client.is_model_ready(model_name, "3"))
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -2283,14 +2433,17 @@ def test_file_override(self):
                 # not be used.
                 try:
                     triton_client.load_model(
-                        model_name, files={"file:1/model.onnx": file_content})
-                    self.assertTrue(
-                        False, "expected error on missing override config")
+                        model_name, files={"file:1/model.onnx": file_content}
+                    )
+                    self.assertTrue(False, "expected error on missing override config")
                 except InferenceServerException as ex:
                     # [FIXME] Improve error reporting to mention missing config
                     self.assertIn(
-                        "failed to load '{}', failed to poll from model repository"
-                        .format(model_name), ex.message())
+                        "failed to load '{}', failed to poll from model repository".format(
+                            model_name
+                        ),
+                        ex.message(),
+                    )
 
                 # Sanity check that the previously loaded version is still available
                 # after the failed attempt to load the model with a different version
@@ -2298,18 +2451,22 @@ def test_file_override(self):
                 self.assertFalse(triton_client.is_model_ready(model_name, "2"))
                 self.assertTrue(triton_client.is_model_ready(model_name, "3"))
 
-                self._infer_success_models([
-                    base[0],
-                ], (3,), model_shape)
+                self._infer_success_models(
+                    [
+                        base[0],
+                    ],
+                    (3,),
+                    model_shape,
+                )
 
                 # Request to load the model with override file and config in
                 # a different name
                 try:
                     triton_client.load_model(
                         override_model_name,
-                        config="""{{"backend":"{backend}" }}""".format(
-                            backend=base[1]),
-                        files={"file:1/model.onnx": file_content})
+                        config="""{{"backend":"{backend}" }}""".format(backend=base[1]),
+                        files={"file:1/model.onnx": file_content},
+                    )
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -2318,31 +2475,35 @@ def test_file_override(self):
                 self.assertFalse(triton_client.is_model_ready(model_name, "1"))
                 self.assertFalse(triton_client.is_model_ready(model_name, "2"))
                 self.assertTrue(triton_client.is_model_ready(model_name, "3"))
-                self._infer_success_models([
-                    base[0],
-                ], (3,), model_shape)
+                self._infer_success_models(
+                    [
+                        base[0],
+                    ],
+                    (3,),
+                    model_shape,
+                )
 
                 # New override model should also be available
-                self.assertTrue(
-                    triton_client.is_model_ready(override_model_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(override_model_name, "2"))
-                self.assertFalse(
-                    triton_client.is_model_ready(override_model_name, "3"))
-                self._infer_success_models([
-                    override_base,
-                ], (1,),
-                                           model_shape,
-                                           swap=True)
+                self.assertTrue(triton_client.is_model_ready(override_model_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(override_model_name, "2"))
+                self.assertFalse(triton_client.is_model_ready(override_model_name, "3"))
+                self._infer_success_models(
+                    [
+                        override_base,
+                    ],
+                    (1,),
+                    model_shape,
+                    swap=True,
+                )
 
                 # Request to load the model with override file and config in
                 # original name
                 try:
                     triton_client.load_model(
                         model_name,
-                        config="""{{"backend":"{backend}" }}""".format(
-                            backend=base[1]),
-                        files={"file:1/model.onnx": file_content})
+                        config="""{{"backend":"{backend}" }}""".format(backend=base[1]),
+                        files={"file:1/model.onnx": file_content},
+                    )
                 except Exception as ex:
                     self.assertTrue(False, "unexpected error {}".format(ex))
 
@@ -2351,24 +2512,27 @@ def test_file_override(self):
                 self.assertTrue(triton_client.is_model_ready(model_name, "1"))
                 self.assertFalse(triton_client.is_model_ready(model_name, "2"))
                 self.assertFalse(triton_client.is_model_ready(model_name, "3"))
-                self._infer_success_models([
-                    base[0],
-                ], (1,),
-                                           model_shape,
-                                           swap=True)
+                self._infer_success_models(
+                    [
+                        base[0],
+                    ],
+                    (1,),
+                    model_shape,
+                    swap=True,
+                )
 
                 # The model with different name should be available
-                self.assertTrue(
-                    triton_client.is_model_ready(override_model_name, "1"))
-                self.assertFalse(
-                    triton_client.is_model_ready(override_model_name, "2"))
-                self.assertFalse(
-                    triton_client.is_model_ready(override_model_name, "3"))
-                self._infer_success_models([
-                    override_base,
-                ], (1,),
-                                           model_shape,
-                                           swap=True)
+                self.assertTrue(triton_client.is_model_ready(override_model_name, "1"))
+                self.assertFalse(triton_client.is_model_ready(override_model_name, "2"))
+                self.assertFalse(triton_client.is_model_ready(override_model_name, "3"))
+                self._infer_success_models(
+                    [
+                        override_base,
+                    ],
+                    (1,),
+                    model_shape,
+                    swap=True,
+                )
 
                 # Reset model for the next client iteration
                 try:
@@ -2381,19 +2545,22 @@ def test_file_override(self):
                 self.assertFalse(triton_client.is_model_ready(model_name, "1"))
                 self.assertFalse(triton_client.is_model_ready(model_name, "2"))
                 self.assertTrue(triton_client.is_model_ready(model_name, "3"))
-                self._infer_success_models([
-                    base[0],
-                ], (3,), model_shape)
+                self._infer_success_models(
+                    [
+                        base[0],
+                    ],
+                    (3,),
+                    model_shape,
+                )
 
     def test_shutdown_dynamic(self):
         model_shape = (1, 1)
         input_data = np.ones(shape=(1, 1), dtype=np.float32)
 
-        inputs = [grpcclient.InferInput('INPUT0', model_shape, "FP32")]
+        inputs = [grpcclient.InferInput("INPUT0", model_shape, "FP32")]
         inputs[0].set_data_from_numpy(input_data)
 
-        triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                         verbose=True)
+        triton_client = grpcclient.InferenceServerClient("localhost:8001", verbose=True)
         model_name = "custom_zero_1_float32"
 
         # Send two requests as only requests held in scheduler are counted
@@ -2411,26 +2578,27 @@ def callback(user_data, result, error):
         request_count = 6
         async_results = []
         for _ in range(request_count):
-            triton_client.async_infer(model_name, inputs,
-                                      partial(callback, async_results))
+            triton_client.async_infer(
+                model_name, inputs, partial(callback, async_results)
+            )
         time.sleep(1)
 
         # Send signal to shutdown the server
-        os.kill(int(os.environ['SERVER_PID']), signal.SIGINT)
+        os.kill(int(os.environ["SERVER_PID"]), signal.SIGINT)
 
         # Send more requests and should be rejected
         try:
             triton_client.infer(model_name, inputs)
-            self.assertTrue(False,
-                            "expected error for new inference during shutdown")
+            self.assertTrue(False, "expected error for new inference during shutdown")
         except InferenceServerException as ex:
             self.assertIn(
                 "Server is stopping, scheduler for model has stopped accepting new inference requests",
-                ex.message())
+                ex.message(),
+            )
 
         # Wait until the results are available in user_data
         time_out = 30
-        while ((len(async_results) < request_count) and time_out > 0):
+        while (len(async_results) < request_count) and time_out > 0:
             time_out = time_out - 1
             time.sleep(1)
 
@@ -2438,21 +2606,19 @@ def callback(user_data, result, error):
         for result in async_results:
             if type(result) == InferenceServerException:
                 raise result
-            output_data = result.as_numpy('OUTPUT0')
+            output_data = result.as_numpy("OUTPUT0")
             np.testing.assert_allclose(
-                output_data,
-                input_data,
-                err_msg='Inference result is not correct')
+                output_data, input_data, err_msg="Inference result is not correct"
+            )
 
     def test_shutdown_sequence(self):
         model_shape = (1, 1)
         input_data = np.ones(shape=(1, 1), dtype=np.int32)
 
-        inputs = [grpcclient.InferInput('INPUT', model_shape, "INT32")]
+        inputs = [grpcclient.InferInput("INPUT", model_shape, "INT32")]
         inputs[0].set_data_from_numpy(input_data)
 
-        triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                         verbose=True)
+        triton_client = grpcclient.InferenceServerClient("localhost:8001", verbose=True)
         model_name = "custom_sequence_int32"
 
         # Send two requests as only requests held in scheduler are counted
@@ -2467,59 +2633,57 @@ def callback(user_data, result, error):
         request_count = 2
         async_results = []
         for i in range(request_count):
-            triton_client.async_infer(model_name,
-                                      inputs,
-                                      partial(callback, async_results),
-                                      sequence_id=(i + 1),
-                                      sequence_start=True)
+            triton_client.async_infer(
+                model_name,
+                inputs,
+                partial(callback, async_results),
+                sequence_id=(i + 1),
+                sequence_start=True,
+            )
         time.sleep(1)
 
         # Send signal to shutdown the server
-        os.kill(int(os.environ['SERVER_PID']), signal.SIGINT)
+        os.kill(int(os.environ["SERVER_PID"]), signal.SIGINT)
 
         # Send requests with different characteristics
-        # 1: New sequence with new seqeuence ID
-        try:
-            triton_client.infer(model_name,
-                                inputs,
-                                sequence_id=request_count,
-                                sequence_start=True)
-            self.assertTrue(False,
-                            "expected error for new inference during shutdown")
+        # 1: New sequence with new sequence ID
+        try:
+            triton_client.infer(
+                model_name, inputs, sequence_id=request_count, sequence_start=True
+            )
+            self.assertTrue(False, "expected error for new inference during shutdown")
         except InferenceServerException as ex:
             self.assertIn(
                 "Server is stopping, scheduler for model has stopped accepting new inference requests",
-                ex.message())
-        # 2: New sequence with existing seqeuence ID
-        try:
-            triton_client.infer(model_name,
-                                inputs,
-                                sequence_id=1,
-                                sequence_start=True)
-            self.assertTrue(False,
-                            "expected error for new inference during shutdown")
+                ex.message(),
+            )
+        # 2: New sequence with existing sequence ID
+        try:
+            triton_client.infer(model_name, inputs, sequence_id=1, sequence_start=True)
+            self.assertTrue(False, "expected error for new inference during shutdown")
         except InferenceServerException as ex:
             self.assertIn(
                 "Server is stopping, scheduler for model has stopped accepting new inference requests",
-                ex.message())
+                ex.message(),
+            )
         # 3: Continuing sequence
         try:
-            res = triton_client.infer(model_name,
-                                      inputs,
-                                      sequence_id=2,
-                                      sequence_end=True)
-            output_data = res.as_numpy('OUTPUT')
+            res = triton_client.infer(
+                model_name, inputs, sequence_id=2, sequence_end=True
+            )
+            output_data = res.as_numpy("OUTPUT")
             # Results are accumulated
             np.testing.assert_allclose(
                 output_data,
                 input_data + input_data,
-                err_msg='Inference result is not correct')
+                err_msg="Inference result is not correct",
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
 
         # Wait until the results are available in user_data
         time_out = 30
-        while ((len(async_results) < request_count) and time_out > 0):
+        while (len(async_results) < request_count) and time_out > 0:
             time_out = time_out - 1
             time.sleep(1)
 
@@ -2527,11 +2691,10 @@ def callback(user_data, result, error):
         for result in async_results:
             if type(result) == InferenceServerException:
                 raise result
-            output_data = result.as_numpy('OUTPUT')
+            output_data = result.as_numpy("OUTPUT")
             np.testing.assert_allclose(
-                output_data,
-                input_data,
-                err_msg='Inference result is not correct')
+                output_data, input_data, err_msg="Inference result is not correct"
+            )
 
         # Sleep 5 seconds for the scheduler timeout to take effect, which should
         # reduce the in-flight count
@@ -2541,11 +2704,10 @@ def test_shutdown_ensemble(self):
         model_shape = (1, 1)
         input_data = np.ones(shape=(1, 1), dtype=np.float32)
 
-        inputs = [grpcclient.InferInput('INPUT0', model_shape, "FP32")]
+        inputs = [grpcclient.InferInput("INPUT0", model_shape, "FP32")]
         inputs[0].set_data_from_numpy(input_data)
 
-        triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                         verbose=True)
+        triton_client = grpcclient.InferenceServerClient("localhost:8001", verbose=True)
         model_name = "ensemble_zero_1_float32"
 
         # Send two requests as only requests held in scheduler are counted
@@ -2562,27 +2724,28 @@ def callback(user_data, result, error):
         request_count = 1
         async_results = []
         for _ in range(request_count):
-            triton_client.async_infer(model_name, inputs,
-                                      partial(callback, async_results))
+            triton_client.async_infer(
+                model_name, inputs, partial(callback, async_results)
+            )
         time.sleep(1)
 
         # Send signal to shutdown the server
-        os.kill(int(os.environ['SERVER_PID']), signal.SIGINT)
+        os.kill(int(os.environ["SERVER_PID"]), signal.SIGINT)
 
         # Send more requests and should be rejected
         try:
             triton_client.infer(model_name, inputs)
-            self.assertTrue(False,
-                            "expected error for new inference during shutdown")
+            self.assertTrue(False, "expected error for new inference during shutdown")
         except InferenceServerException as ex:
             self.assertIn("in ensemble 'ensemble_zero_1_float32'", ex.message())
             self.assertIn(
                 "Server is stopping, scheduler for model has stopped accepting new inference requests",
-                ex.message())
+                ex.message(),
+            )
 
         # Wait until the results are available in user_data
         time_out = 10
-        while ((len(async_results) < request_count) and time_out > 0):
+        while (len(async_results) < request_count) and time_out > 0:
             time_out = time_out - 1
             time.sleep(1)
 
@@ -2590,17 +2753,17 @@ def callback(user_data, result, error):
         for result in async_results:
             if type(result) == InferenceServerException:
                 raise result
-            output_data = result.as_numpy('OUTPUT0')
+            output_data = result.as_numpy("OUTPUT0")
             np.testing.assert_allclose(
-                output_data,
-                input_data,
-                err_msg='Inference result is not correct')
+                output_data, input_data, err_msg="Inference result is not correct"
+            )
 
     def test_load_gpu_limit(self):
         model_name = "cuda_memory_consumer"
         try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             triton_client.load_model(model_name + "_1")
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
@@ -2608,18 +2771,19 @@ def test_load_gpu_limit(self):
         # After the first load, the memory consumption should have exceeded
         # the specified limit, so the load will fail
         try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             triton_client.load_model(model_name + "_2")
             self.assertTrue(False, "expected error for loading model")
         except Exception as ex:
-            self.assertIn("memory limit set for GPU 0 has exceeded",
-                          ex.message())
+            self.assertIn("memory limit set for GPU 0 has exceeded", ex.message())
 
         # Load should work after explicitly unload model to free memory
         try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
             triton_client.unload_model(model_name + "_1")
             triton_client.load_model(model_name + "_2")
         except Exception as ex:
@@ -2628,21 +2792,26 @@ def test_load_gpu_limit(self):
     def test_concurrent_load_speedup(self):
         # Initialize client
         try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
         # Each model should have a loading delay of 10 seconds
-        model_pairs = [["identity_zero_1_int32_1", "identity_zero_1_int32_2"],
-                       ["python_identity_fp32_1", "python_identity_fp32_2"]]
+        model_pairs = [
+            ["identity_zero_1_int32_1", "identity_zero_1_int32_2"],
+            ["python_identity_fp32_1", "python_identity_fp32_2"],
+        ]
         # Test each model pair for speed up
         for model_pair in model_pairs:
             # Load both models concurrently
             threads = []
             for model_name in model_pair:
                 threads.append(
-                    threading.Thread(target=triton_client.load_model,
-                                     args=(model_name,)))
+                    threading.Thread(
+                        target=triton_client.load_model, args=(model_name,)
+                    )
+                )
             start_time = time.time()
             for thread in threads:
                 thread.start()
@@ -2653,11 +2822,13 @@ def test_concurrent_load_speedup(self):
             # Each of the two models has a minimum loading delay of 10 seconds
             # Speedup is observed when the concurrent loading time < 20 seconds
             # but use a tighter bound of 15 seconds
-            self.assertLess(loading_time, 15.0,
-                            "Concurrent loading speedup not observed")
+            self.assertLess(
+                loading_time, 15.0, "Concurrent loading speedup not observed"
+            )
             # Concurrent loading time cannot be < 10 seconds
-            self.assertGreaterEqual(loading_time, 10.0,
-                                    "Invalid concurrent loading time")
+            self.assertGreaterEqual(
+                loading_time, 10.0, "Invalid concurrent loading time"
+            )
             # Make sure the models are loaded
             self.assertTrue(triton_client.is_server_live())
             self.assertTrue(triton_client.is_server_ready())
@@ -2667,8 +2838,9 @@ def test_concurrent_load_speedup(self):
     def test_concurrent_load(self):
         # Initialize client
         try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
         # Load same named model concurrently
@@ -2695,18 +2867,19 @@ def test_concurrent_load(self):
     def test_concurrent_load_unload(self):
         # Initialize client
         try:
-            triton_client = grpcclient.InferenceServerClient("localhost:8001",
-                                                             verbose=True)
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
         except Exception as ex:
             self.assertTrue(False, "unexpected error {}".format(ex))
         # Load identity_zero_1_int32 and unload it while loading
         # The unload operation should wait until the load is completed
         with concurrent.futures.ThreadPoolExecutor() as pool:
-            load_thread = pool.submit(triton_client.load_model,
-                                      "identity_zero_1_int32")
+            load_thread = pool.submit(triton_client.load_model, "identity_zero_1_int32")
             time.sleep(2)  # wait between load and unload
-            unload_thread = pool.submit(triton_client.unload_model,
-                                        "identity_zero_1_int32")
+            unload_thread = pool.submit(
+                triton_client.unload_model, "identity_zero_1_int32"
+            )
             load_thread.result()
             unload_thread.result()
         self.assertTrue(triton_client.is_server_live())
@@ -2715,22 +2888,25 @@ def test_concurrent_load_unload(self):
         # Load ensemble_zero_1_float32 and unload its dependency while loading
         # The unload operation should wait until the load is completed
         with concurrent.futures.ThreadPoolExecutor() as pool:
-            load_thread = pool.submit(triton_client.load_model,
-                                      "ensemble_zero_1_float32")
+            load_thread = pool.submit(
+                triton_client.load_model, "ensemble_zero_1_float32"
+            )
             time.sleep(2)  # wait between load and unload
-            unload_thread = pool.submit(triton_client.unload_model,
-                                        "custom_zero_1_float32")
+            unload_thread = pool.submit(
+                triton_client.unload_model, "custom_zero_1_float32"
+            )
             load_thread.result()
             unload_thread.result()
         self.assertTrue(triton_client.is_server_live())
         self.assertTrue(triton_client.is_server_ready())
-        self.assertFalse(
-            triton_client.is_model_ready("ensemble_zero_1_float32"))
+        self.assertFalse(triton_client.is_model_ready("ensemble_zero_1_float32"))
         self.assertFalse(triton_client.is_model_ready("custom_zero_1_float32"))
         # Load both models and unload them concurrently
         model_names = ["identity_zero_1_int32", "ensemble_zero_1_float32"]
         for is_load in [True, False]:
-            action_fn = triton_client.load_model if is_load else triton_client.unload_model
+            action_fn = (
+                triton_client.load_model if is_load else triton_client.unload_model
+            )
             with concurrent.futures.ThreadPoolExecutor() as pool:
                 threads = []
                 for model_name in model_names:
@@ -2738,9 +2914,8 @@ def test_concurrent_load_unload(self):
                 for thread in concurrent.futures.as_completed(threads):
                     thread.result()
             for model_name in model_names:
-                self.assertEqual(is_load,
-                                 triton_client.is_model_ready(model_name))
+                self.assertEqual(is_load, triton_client.is_model_ready(model_name))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_lifecycle/test.sh b/qa/L0_lifecycle/test.sh
index ebbc409c85..ab12c1c7b8 100755
--- a/qa/L0_lifecycle/test.sh
+++ b/qa/L0_lifecycle/test.sh
@@ -1010,8 +1010,8 @@ LOG_IDX=$((LOG_IDX+1))
 
 # Test loading all models on startup in EXPLICIT model control mode AND
 # an additional --load-model argument, it should fail
-rm -fr models 
-mkdir models 
+rm -fr models
+mkdir models
 for i in onnx ; do
     cp -r $DATADIR/qa_model_repository/${i}_float32_float32_float32 models/.
     sed -i "s/max_batch_size:.*/max_batch_size: 1/" models/${i}_float32_float32_float32/config.pbtxt
@@ -1542,7 +1542,7 @@ mkdir models
 cp -r ../custom_models/custom_zero_1_float32 models/. && \
     mkdir -p models/custom_zero_1_float32/1 && \
     (cd models/custom_zero_1_float32 && \
-        echo "dynamic_batching {}" >> config.pbtxt 
+        echo "dynamic_batching {}" >> config.pbtxt
         echo "parameters [" >> config.pbtxt && \
         echo "{ key: \"execute_delay_ms\"; value: { string_value: \"5000\" }}" >> config.pbtxt && \
         echo "]" >> config.pbtxt)
@@ -1621,7 +1621,7 @@ cp -r ensemble_zero_1_float32 models/. && \
 cp -r ../custom_models/custom_zero_1_float32 models/. && \
     mkdir -p models/custom_zero_1_float32/1 && \
     (cd models/custom_zero_1_float32 && \
-        echo "dynamic_batching {}" >> config.pbtxt 
+        echo "dynamic_batching {}" >> config.pbtxt
         echo "parameters [" >> config.pbtxt && \
         echo "{ key: \"execute_delay_ms\"; value: { string_value: \"5000\" }}" >> config.pbtxt && \
         echo "]" >> config.pbtxt)
diff --git a/qa/L0_logging/logging_endpoint_test.py b/qa/L0_logging/logging_endpoint_test.py
old mode 100644
new mode 100755
index 2058d941c2..26f98de3da
--- a/qa/L0_logging/logging_endpoint_test.py
+++ b/qa/L0_logging/logging_endpoint_test.py
@@ -27,21 +27,21 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import sys
+
 sys.path.append("../common")
 
+import json
 import sys
 import unittest
-import tritonclient.http as httpclient
-import tritonclient.grpc as grpcclient
-import json
-from google.protobuf import json_format
 
 import test_util as tu
+import tritonclient.grpc as grpcclient
+import tritonclient.http as httpclient
+from google.protobuf import json_format
 
 
 # Similar set up as dynamic batcher tests
 class LogEndpointTest(tu.TestResultCollector):
-
     def tearDown(self):
         # Clear all log settings to initial state.
         # Note that the tearDown function uses HTTP client so the pass/fail
@@ -54,7 +54,7 @@ def tearDown(self):
             "log_warning": True,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         triton_client = httpclient.InferenceServerClient("localhost:8000")
         triton_client.update_log_settings(settings=clear_settings)
@@ -71,7 +71,7 @@ def check_server_initial_state(self):
             "log_warning": True,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         triton_client = httpclient.InferenceServerClient("localhost:8000")
         self.assertEqual(initial_settings, triton_client.get_log_settings())
@@ -85,42 +85,40 @@ def test_http_get_settings(self):
             "log_warning": True,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         triton_client = httpclient.InferenceServerClient("localhost:8000")
-        self.assertEqual(initial_settings, triton_client.get_log_settings(),
-                         "Unexpected initial log settings")
+        self.assertEqual(
+            initial_settings,
+            triton_client.get_log_settings(),
+            "Unexpected initial log settings",
+        )
 
     def test_grpc_get_settings(self):
         # Log settings will be the same as default settings since
         # no update has been made.
         initial_settings = grpcclient.service_pb2.LogSettingsResponse()
         json_format.Parse(
-            json.dumps({
-                "settings": {
-                    "log_file": {
-                        "stringParam": ""
-                    },
-                    "log_info": {
-                        "boolParam": True
-                    },
-                    "log_warning": {
-                        "boolParam": True
-                    },
-                    "log_error": {
-                        "boolParam": True
-                    },
-                    "log_verbose_level": {
-                        "uint32Param": 0
-                    },
-                    "log_format": {
-                        "stringParam": "default"
-                    },
+            json.dumps(
+                {
+                    "settings": {
+                        "log_file": {"stringParam": ""},
+                        "log_info": {"boolParam": True},
+                        "log_warning": {"boolParam": True},
+                        "log_error": {"boolParam": True},
+                        "log_verbose_level": {"uint32Param": 0},
+                        "log_format": {"stringParam": "default"},
+                    }
                 }
-            }), initial_settings)
+            ),
+            initial_settings,
+        )
         triton_client = grpcclient.InferenceServerClient("localhost:8001")
-        self.assertEqual(initial_settings, triton_client.get_log_settings(),
-                         "Unexpected initial log settings")
+        self.assertEqual(
+            initial_settings,
+            triton_client.get_log_settings(),
+            "Unexpected initial log settings",
+        )
 
     def test_http_update_settings(self):
         # Update each possible log configuration
@@ -134,7 +132,7 @@ def test_http_update_settings(self):
             "log_warning": True,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_2 = {
             "log_file": "log_file.log",
@@ -142,7 +140,7 @@ def test_http_update_settings(self):
             "log_warning": True,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_3 = {
             "log_file": "log_file.log",
@@ -150,7 +148,7 @@ def test_http_update_settings(self):
             "log_warning": False,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_4 = {
             "log_file": "log_file.log",
@@ -158,7 +156,7 @@ def test_http_update_settings(self):
             "log_warning": False,
             "log_error": False,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_5 = {
             "log_file": "log_file.log",
@@ -166,7 +164,7 @@ def test_http_update_settings(self):
             "log_warning": False,
             "log_error": False,
             "log_verbose_level": 1,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_6 = {
             "log_file": "log_file.log",
@@ -174,34 +172,40 @@ def test_http_update_settings(self):
             "log_warning": False,
             "log_error": False,
             "log_verbose_level": 1,
-            "log_format": "ISO8601"
+            "log_format": "ISO8601",
         }
 
         triton_client = httpclient.InferenceServerClient("localhost:8000")
         self.assertEqual(
             expected_log_settings_1,
             triton_client.update_log_settings(settings=expected_log_settings_1),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
         self.assertEqual(
             expected_log_settings_2,
             triton_client.update_log_settings(settings=expected_log_settings_2),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
         self.assertEqual(
             expected_log_settings_3,
             triton_client.update_log_settings(settings=expected_log_settings_3),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
         self.assertEqual(
             expected_log_settings_4,
             triton_client.update_log_settings(settings=expected_log_settings_4),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
         self.assertEqual(
             expected_log_settings_5,
             triton_client.update_log_settings(settings=expected_log_settings_5),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
         self.assertEqual(
             expected_log_settings_6,
             triton_client.update_log_settings(settings=expected_log_settings_6),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
 
     def test_grpc_update_settings(self):
         # Update each possible log configuration
@@ -216,37 +220,30 @@ def test_grpc_update_settings(self):
             "log_warning": True,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_1 = grpcclient.service_pb2.LogSettingsResponse()
         json_format.Parse(
-            json.dumps({
-                "settings": {
-                    "log_file": {
-                        "stringParam": "log_file.log"
-                    },
-                    "log_info": {
-                        "boolParam": True
-                    },
-                    "log_warning": {
-                        "boolParam": True
-                    },
-                    "log_error": {
-                        "boolParam": True
-                    },
-                    "log_verbose_level": {
-                        "uint32Param": 0
-                    },
-                    "log_format": {
-                        "stringParam": "default"
-                    },
+            json.dumps(
+                {
+                    "settings": {
+                        "log_file": {"stringParam": "log_file.log"},
+                        "log_info": {"boolParam": True},
+                        "log_warning": {"boolParam": True},
+                        "log_error": {"boolParam": True},
+                        "log_verbose_level": {"uint32Param": 0},
+                        "log_format": {"stringParam": "default"},
+                    }
                 }
-            }), expected_log_settings_1)
+            ),
+            expected_log_settings_1,
+        )
 
         self.assertEqual(
             expected_log_settings_1,
             triton_client.update_log_settings(settings=log_settings_1),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
 
         log_settings_2 = {
             "log_file": "log_file.log",
@@ -254,37 +251,30 @@ def test_grpc_update_settings(self):
             "log_warning": True,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_2 = grpcclient.service_pb2.LogSettingsResponse()
         json_format.Parse(
-            json.dumps({
-                "settings": {
-                    "log_file": {
-                        "stringParam": "log_file.log"
-                    },
-                    "log_info": {
-                        "boolParam": False
-                    },
-                    "log_warning": {
-                        "boolParam": True
-                    },
-                    "log_error": {
-                        "boolParam": True
-                    },
-                    "log_verbose_level": {
-                        "uint32Param": 0
-                    },
-                    "log_format": {
-                        "stringParam": "default"
-                    },
+            json.dumps(
+                {
+                    "settings": {
+                        "log_file": {"stringParam": "log_file.log"},
+                        "log_info": {"boolParam": False},
+                        "log_warning": {"boolParam": True},
+                        "log_error": {"boolParam": True},
+                        "log_verbose_level": {"uint32Param": 0},
+                        "log_format": {"stringParam": "default"},
+                    }
                 }
-            }), expected_log_settings_2)
+            ),
+            expected_log_settings_2,
+        )
 
         self.assertEqual(
             expected_log_settings_2,
             triton_client.update_log_settings(settings=log_settings_2),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
 
         log_settings_3 = {
             "log_file": "log_file.log",
@@ -292,37 +282,30 @@ def test_grpc_update_settings(self):
             "log_warning": False,
             "log_error": True,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_3 = grpcclient.service_pb2.LogSettingsResponse()
         json_format.Parse(
-            json.dumps({
-                "settings": {
-                    "log_file": {
-                        "stringParam": "log_file.log"
-                    },
-                    "log_info": {
-                        "boolParam": False
-                    },
-                    "log_warning": {
-                        "boolParam": False
-                    },
-                    "log_error": {
-                        "boolParam": True
-                    },
-                    "log_verbose_level": {
-                        "uint32Param": 0
-                    },
-                    "log_format": {
-                        "stringParam": "default"
-                    },
+            json.dumps(
+                {
+                    "settings": {
+                        "log_file": {"stringParam": "log_file.log"},
+                        "log_info": {"boolParam": False},
+                        "log_warning": {"boolParam": False},
+                        "log_error": {"boolParam": True},
+                        "log_verbose_level": {"uint32Param": 0},
+                        "log_format": {"stringParam": "default"},
+                    }
                 }
-            }), expected_log_settings_3)
+            ),
+            expected_log_settings_3,
+        )
 
         self.assertEqual(
             expected_log_settings_3,
             triton_client.update_log_settings(settings=log_settings_3),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
 
         log_settings_4 = {
             "log_file": "log_file.log",
@@ -330,37 +313,30 @@ def test_grpc_update_settings(self):
             "log_warning": False,
             "log_error": False,
             "log_verbose_level": 0,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_4 = grpcclient.service_pb2.LogSettingsResponse()
         json_format.Parse(
-            json.dumps({
-                "settings": {
-                    "log_file": {
-                        "stringParam": "log_file.log"
-                    },
-                    "log_info": {
-                        "boolParam": False
-                    },
-                    "log_warning": {
-                        "boolParam": False
-                    },
-                    "log_error": {
-                        "boolParam": False
-                    },
-                    "log_verbose_level": {
-                        "uint32Param": 0
-                    },
-                    "log_format": {
-                        "stringParam": "default"
-                    },
+            json.dumps(
+                {
+                    "settings": {
+                        "log_file": {"stringParam": "log_file.log"},
+                        "log_info": {"boolParam": False},
+                        "log_warning": {"boolParam": False},
+                        "log_error": {"boolParam": False},
+                        "log_verbose_level": {"uint32Param": 0},
+                        "log_format": {"stringParam": "default"},
+                    }
                 }
-            }), expected_log_settings_4)
+            ),
+            expected_log_settings_4,
+        )
 
         self.assertEqual(
             expected_log_settings_4,
             triton_client.update_log_settings(settings=log_settings_4),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
 
         log_settings_5 = {
             "log_file": "log_file.log",
@@ -368,37 +344,30 @@ def test_grpc_update_settings(self):
             "log_warning": False,
             "log_error": False,
             "log_verbose_level": 1,
-            "log_format": "default"
+            "log_format": "default",
         }
         expected_log_settings_5 = grpcclient.service_pb2.LogSettingsResponse()
         json_format.Parse(
-            json.dumps({
-                "settings": {
-                    "log_file": {
-                        "stringParam": "log_file.log"
-                    },
-                    "log_info": {
-                        "boolParam": False
-                    },
-                    "log_warning": {
-                        "boolParam": False
-                    },
-                    "log_error": {
-                        "boolParam": False
-                    },
-                    "log_verbose_level": {
-                        "uint32Param": 1
-                    },
-                    "log_format": {
-                        "stringParam": "default"
-                    },
+            json.dumps(
+                {
+                    "settings": {
+                        "log_file": {"stringParam": "log_file.log"},
+                        "log_info": {"boolParam": False},
+                        "log_warning": {"boolParam": False},
+                        "log_error": {"boolParam": False},
+                        "log_verbose_level": {"uint32Param": 1},
+                        "log_format": {"stringParam": "default"},
+                    }
                 }
-            }), expected_log_settings_5)
+            ),
+            expected_log_settings_5,
+        )
 
         self.assertEqual(
             expected_log_settings_5,
             triton_client.update_log_settings(settings=log_settings_5),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
 
         log_settings_6 = {
             "log_file": "log_file.log",
@@ -406,38 +375,31 @@ def test_grpc_update_settings(self):
             "log_warning": False,
             "log_error": False,
             "log_verbose_level": 1,
-            "log_format": "ISO8601"
+            "log_format": "ISO8601",
         }
         expected_log_settings_6 = grpcclient.service_pb2.LogSettingsResponse()
         json_format.Parse(
-            json.dumps({
-                "settings": {
-                    "log_file": {
-                        "stringParam": "log_file.log"
-                    },
-                    "log_info": {
-                        "boolParam": False
-                    },
-                    "log_warning": {
-                        "boolParam": False
-                    },
-                    "log_error": {
-                        "boolParam": False
-                    },
-                    "log_verbose_level": {
-                        "uint32Param": 1
-                    },
-                    "log_format": {
-                        "stringParam": "ISO8601"
-                    },
+            json.dumps(
+                {
+                    "settings": {
+                        "log_file": {"stringParam": "log_file.log"},
+                        "log_info": {"boolParam": False},
+                        "log_warning": {"boolParam": False},
+                        "log_error": {"boolParam": False},
+                        "log_verbose_level": {"uint32Param": 1},
+                        "log_format": {"stringParam": "ISO8601"},
+                    }
                 }
-            }), expected_log_settings_6)
+            ),
+            expected_log_settings_6,
+        )
 
         self.assertEqual(
             expected_log_settings_6,
             triton_client.update_log_settings(settings=log_settings_6),
-            "Unexpected updated log settings")
+            "Unexpected updated log settings",
+        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_logging/test.sh b/qa/L0_logging/test.sh
index 47c1e081cd..d83e0b76a4 100755
--- a/qa/L0_logging/test.sh
+++ b/qa/L0_logging/test.sh
@@ -70,7 +70,7 @@ RET=0
 
 function verify_correct_settings () {
   log_file_expected=$1
-  log_info_expected=$2 
+  log_info_expected=$2
   log_warn_expected=$3
   log_error_expected=$4
   log_verbose_expected=$5
@@ -142,7 +142,7 @@ set -e
 kill $SERVER_PID
 wait $SERVER_PID
 
-# Test Log File (Arguement)
+# Test Log File (Argument)
 SERVER_ARGS="--log-file=log_file.log --model-repository=$MODELSDIR"
 SERVER_LOG="./inference_server_log_file.log"
 run_server
@@ -214,7 +214,7 @@ if [ $? -ne 0 ]; then
     RET=1
 fi
 
-# Check redirection worked properly (server log has tolerance of 40 due to 
+# Check redirection worked properly (server log has a tolerance of 40 due to
 # unavoidable onnx framework logging)
 expected_log_count=75
 actual_log_count=$(grep -c ^[IWEV][0-9][0-9][0-9][0-9].* ./log_file.log)
@@ -245,7 +245,7 @@ set -e
 kill $SERVER_PID
 wait $SERVER_PID
 
-# Test Log Info (Arguement)
+# Test Log Info (Argument)
 rm -f log_file.log
 SERVER_ARGS="--log-file=log_file.log --log-info=false --log-verbose=1 --model-repository=$MODELSDIR"
 SERVER_LOG="./inference_server_log_file.log"
@@ -375,7 +375,7 @@ set -e
 kill $SERVER_PID
 wait $SERVER_PID
 
-# Test Log Verbose Level (Arguement)
+# Test Log Verbose Level (Argument)
 rm -f log_file.log
 SERVER_ARGS="--log-file=log_file.log --log-verbose=1 --model-repository=$MODELSDIR"
 SERVER_LOG="./inference_server_log_file.log"
@@ -423,7 +423,7 @@ set -e
 kill $SERVER_PID
 wait $SERVER_PID
 
-# Test Log Format (Arguement)
+# Test Log Format (Argument)
 rm -f log_file.log
 SERVER_ARGS="--log-file=log_file.log --log-verbose=1 --log-format=ISO8601 --model-repository=$MODELSDIR"
 SERVER_LOG="./inference_server_log_file.log"
@@ -453,7 +453,7 @@ line=$(head -n 1 log_file.log)
 date=$(date '+%m%d')
 final_date="I${date}"
 format_date=$(echo $line | head -n1 | awk '{print $1;}')
-if [[ $final_date == $format_date ]]; then 
+if [[ $final_date == $format_date ]]; then
     echo -e "\n***\n*** Test Failed: Unexpected Log Format $LINENO\n***"
     RET=1
 fi
diff --git a/qa/L0_long_running_stress/crashing_client.py b/qa/L0_long_running_stress/crashing_client.py
old mode 100644
new mode 100755
index bb9faab45a..d9c727a3d3
--- a/qa/L0_long_running_stress/crashing_client.py
+++ b/qa/L0_long_running_stress/crashing_client.py
@@ -1,4 +1,6 @@
-# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -28,27 +30,24 @@
 
 sys.path.append("../common")
 
-import numpy as np
-from multiprocessing import Process, shared_memory
+import argparse
 import time
+from multiprocessing import Process, shared_memory
+
+import numpy as np
 import test_util as tu
-import argparse
 import tritonclient.grpc as grpcclient
 from tritonclient.utils import np_to_triton_dtype
 
 
-def crashing_client(model_name,
-                    dtype,
-                    tensor_shape,
-                    shm_name,
-                    triton_client,
-                    input_name="INPUT0"):
+def crashing_client(
+    model_name, dtype, tensor_shape, shm_name, triton_client, input_name="INPUT0"
+):
     in0 = np.random.random(tensor_shape).astype(dtype)
     if "libtorch" in model_name:
         input_name = "INPUT__0"
     inputs = [
-        grpcclient.InferInput(input_name, tensor_shape,
-                              np_to_triton_dtype(dtype)),
+        grpcclient.InferInput(input_name, tensor_shape, np_to_triton_dtype(dtype)),
     ]
     inputs[0].set_data_from_numpy(in0)
 
@@ -62,13 +61,15 @@ def crashing_client(model_name,
         results = triton_client.infer(model_name, inputs)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-t',
-                        '--trial',
-                        type=str,
-                        required=True,
-                        help='Set trial for the crashing client')
+    parser.add_argument(
+        "-t",
+        "--trial",
+        type=str,
+        required=True,
+        help="Set trial for the crashing client",
+    )
     FLAGS = parser.parse_args()
     trial = FLAGS.trial
 
@@ -76,22 +77,23 @@ def crashing_client(model_name,
     model_name = tu.get_zero_model_name(trial, 1, dtype)
     tensor_shape = (1,) if "nobatch" in trial else (1, 1)
 
-    triton_client = grpcclient.InferenceServerClient(url="localhost:8001",
-                                                     verbose=True)
+    triton_client = grpcclient.InferenceServerClient(url="localhost:8001", verbose=True)
 
     shm = shared_memory.SharedMemory(create=True, size=8)
     count = np.ndarray((1,), dtype=np.int32, buffer=shm.buf)
     count[0] = 0
 
-    p = Process(target=crashing_client,
-                name="crashing_client",
-                args=(
-                    model_name,
-                    dtype,
-                    tensor_shape,
-                    shm.name,
-                    triton_client,
-                ))
+    p = Process(
+        target=crashing_client,
+        name="crashing_client",
+        args=(
+            model_name,
+            dtype,
+            tensor_shape,
+            shm.name,
+            triton_client,
+        ),
+    )
 
     p.start()
 
diff --git a/qa/L0_long_running_stress/scenarios.py b/qa/L0_long_running_stress/scenarios.py
old mode 100644
new mode 100755
index 7e91968ccb..abb0004e90
--- a/qa/L0_long_running_stress/scenarios.py
+++ b/qa/L0_long_running_stress/scenarios.py
@@ -1,4 +1,6 @@
-# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -29,26 +31,28 @@
 
 sys.path.append("../common")
 
-import numpy as np
-import time
-import test_util as tu
-import tritonclient.grpc as grpcclient
-from tritonclient.utils import np_to_triton_dtype
 import math
-from PIL import Image
 import os
 import subprocess
 import threading
+import time
+
+import numpy as np
+import test_util as tu
+import tritonclient.grpc as grpcclient
+from PIL import Image
+from tritonclient.utils import np_to_triton_dtype
+
 if sys.version_info >= (3, 0):
     import queue
 else:
     import Queue as queue
-from functools import partial
 
 import abc
 import csv
 import json
 import re
+from functools import partial
 
 DEFAULT_TIMEOUT_MS = 25000
 SEQUENCE_LENGTH_MEAN = 16
@@ -66,7 +70,6 @@ def completion_callback(user_data, result, error):
 
 
 class Scenario(metaclass=abc.ABCMeta):
-
     def __init__(self, name, trials, verbose=False, out_stream=sys.stdout):
         self.name_ = name
         self.trials_ = trials
@@ -109,13 +112,15 @@ class ModelOption:
         # 'queue_latency_range_us' specifies the range where queue latency
         # reported should be, otherwise, model concurrency will be adjusted
         # within 'concurrency_range' to influence the queue latency.
-        def __init__(self,
-                     model_name,
-                     batch_size,
-                     concurrency_range,
-                     queue_latency_range_us,
-                     input_shapes=[],
-                     input_file=None):
+        def __init__(
+            self,
+            model_name,
+            batch_size,
+            concurrency_range,
+            queue_latency_range_us,
+            input_shapes=[],
+            input_file=None,
+        ):
             self.model_name_ = model_name
             self.concurrency_range_ = list(concurrency_range)
             self.batch_size_ = batch_size
@@ -125,8 +130,11 @@ def __init__(self,
 
         def run(self, name, sequence_id_range, out_stream):
             csv_file = os.path.join(
-                "csv_dir", "{}_{}_{}.csv".format(name, self.model_name_,
-                                                 self.concurrency_range_[2]))
+                "csv_dir",
+                "{}_{}_{}.csv".format(
+                    name, self.model_name_, self.concurrency_range_[2]
+                ),
+            )
 
             arg_list = [PerfAnalyzerScenario.command_]
             # Always use GRPC streaming feature to ensure requests are handled
@@ -136,8 +144,9 @@ def run(self, name, sequence_id_range, out_stream):
             arg_list += ["-b", "{}".format(self.batch_size_)]
             arg_list += [
                 "--concurrency-range",
-                "{}:{}:1".format(self.concurrency_range_[2],
-                                 self.concurrency_range_[2])
+                "{}:{}:1".format(
+                    self.concurrency_range_[2], self.concurrency_range_[2]
+                ),
             ]
             arg_list += ["-f", csv_file]
             for name, shape in self.input_shapes_:
@@ -147,43 +156,44 @@ def run(self, name, sequence_id_range, out_stream):
             if sequence_id_range is not None:
                 arg_list += [
                     "--sequence-id-range",
-                    "{}:{}".format(sequence_id_range[0], sequence_id_range[1])
+                    "{}:{}".format(sequence_id_range[0], sequence_id_range[1]),
                 ]
 
-            completed_process = subprocess.run(arg_list,
-                                               text=True,
-                                               stdout=subprocess.PIPE,
-                                               stderr=subprocess.STDOUT)
+            completed_process = subprocess.run(
+                arg_list, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
+            )
             # Write output to file before checking return code
             print(completed_process.stdout, file=out_stream)
             completed_process.check_returncode()
 
             # Read queue time and adjust concurrency
-            with open(csv_file, newline='') as csvfile:
+            with open(csv_file, newline="") as csvfile:
                 reader = csv.DictReader(csvfile)
                 for row in reader:
-                    current_queue_us = int(row['Server Queue'])
+                    current_queue_us = int(row["Server Queue"])
                     if current_queue_us < self.queue_latency_range_us_[0]:
                         self.concurrency_range_[2] = min(
-                            self.concurrency_range_[2] + 1,
-                            self.concurrency_range_[1])
+                            self.concurrency_range_[2] + 1, self.concurrency_range_[1]
+                        )
                     elif current_queue_us > self.queue_latency_range_us_[0]:
                         self.concurrency_range_[2] = max(
-                            self.concurrency_range_[2] - 1,
-                            self.concurrency_range_[0])
+                            self.concurrency_range_[2] - 1, self.concurrency_range_[0]
+                        )
                     break
-            m = re.search(r'Request count: ([0-9]+)', completed_process.stdout)
+            m = re.search(r"Request count: ([0-9]+)", completed_process.stdout)
             return int(m.group(1))
 
-    def __init__(self,
-                 name,
-                 rng,
-                 sequence_trials,
-                 identity_trials,
-                 queue_latency_range_us=(10000, 100000),
-                 sequence_id_range=None,
-                 verbose=False,
-                 out_stream=sys.stdout):
+    def __init__(
+        self,
+        name,
+        rng,
+        sequence_trials,
+        identity_trials,
+        queue_latency_range_us=(10000, 100000),
+        sequence_id_range=None,
+        verbose=False,
+        out_stream=sys.stdout,
+    ):
         super().__init__(name, [], verbose, out_stream)
         self.rng_ = rng
         self.sequence_id_range_ = sequence_id_range
@@ -194,8 +204,10 @@ def __init__(self,
 
         # Add no validation models
         self.options_.append(
-            PerfAnalyzerScenario.ModelOption("resnet_v1_50_graphdef_def", 32,
-                                             (1, 4, 1), queue_latency_range_us))
+            PerfAnalyzerScenario.ModelOption(
+                "resnet_v1_50_graphdef_def", 32, (1, 4, 1), queue_latency_range_us
+            )
+        )
         for trial in sequence_trials:
             dtype = self.get_datatype(trial)
             # Skip string sequence model for now, it is hard for PA to generate
@@ -204,8 +216,10 @@ def __init__(self,
                 continue
             model_name = tu.get_sequence_model_name(trial, dtype)
             self.options_.append(
-                PerfAnalyzerScenario.ModelOption(model_name, 1, (1, 4, 1),
-                                                 queue_latency_range_us))
+                PerfAnalyzerScenario.ModelOption(
+                    model_name, 1, (1, 4, 1), queue_latency_range_us
+                )
+            )
         for trial in identity_trials:
             dtype = np.float32
             model_name = tu.get_zero_model_name(trial, 1, dtype)
@@ -214,9 +228,10 @@ def __init__(self,
             else:
                 input_shapes = [("INPUT0", "16")]
             self.options_.append(
-                PerfAnalyzerScenario.ModelOption(model_name, 1, (1, 4, 1),
-                                                 queue_latency_range_us,
-                                                 input_shapes))
+                PerfAnalyzerScenario.ModelOption(
+                    model_name, 1, (1, 4, 1), queue_latency_range_us, input_shapes
+                )
+            )
 
         # Add output validation version of the models
         # Skip resnet as the output data has variation which makes exact
@@ -224,25 +239,31 @@ def __init__(self,
         for trial in sequence_trials:
             dtype = self.get_datatype(trial)
             model_name = tu.get_sequence_model_name(trial, dtype)
-            data_file = os.path.join("validation_data",
-                                     "{}.json".format(model_name))
+            data_file = os.path.join("validation_data", "{}.json".format(model_name))
             self.generate_sequence_data(trial, dtype, data_file)
             self.options_.append(
-                PerfAnalyzerScenario.ModelOption(model_name,
-                                                 1, (1, 4, 1),
-                                                 queue_latency_range_us,
-                                                 input_file=data_file))
+                PerfAnalyzerScenario.ModelOption(
+                    model_name,
+                    1,
+                    (1, 4, 1),
+                    queue_latency_range_us,
+                    input_file=data_file,
+                )
+            )
         for trial in identity_trials:
             dtype = np.float32
             model_name = tu.get_zero_model_name(trial, 1, dtype)
-            data_file = os.path.join("validation_data",
-                                     "{}.json".format(model_name))
+            data_file = os.path.join("validation_data", "{}.json".format(model_name))
             self.generate_identity_data(trial, dtype, data_file)
             self.options_.append(
-                PerfAnalyzerScenario.ModelOption(model_name,
-                                                 1, (1, 4, 1),
-                                                 queue_latency_range_us,
-                                                 input_file=data_file))
+                PerfAnalyzerScenario.ModelOption(
+                    model_name,
+                    1,
+                    (1, 4, 1),
+                    queue_latency_range_us,
+                    input_file=data_file,
+                )
+            )
 
     def generate_sequence_data(self, trial, dtype, data_filename):
         input0 = "INPUT" if "libtorch" not in trial else "INPUT__0"
@@ -255,8 +276,7 @@ def generate_sequence_data(self, trial, dtype, data_filename):
             elif dtype == np.dtype(object):
                 res = str(i)
             else:
-                raise Exception(
-                    "unexpected sequence data type {}".format(dtype))
+                raise Exception("unexpected sequence data type {}".format(dtype))
             input_data.append({input0: [res]})
         output0 = "OUTPUT" if "libtorch" not in trial else "OUTPUT__0"
         output_data = []
@@ -272,8 +292,7 @@ def generate_sequence_data(self, trial, dtype, data_filename):
                 elif dtype == np.dtype(object):
                     res = str(sum)
                 else:
-                    raise Exception(
-                        "unexpected sequence data type {}".format(dtype))
+                    raise Exception("unexpected sequence data type {}".format(dtype))
                 output_data.append({output0: [res]})
         else:
             for i in range(3):
@@ -285,17 +304,17 @@ def generate_sequence_data(self, trial, dtype, data_filename):
                 elif dtype == np.dtype(object):
                     res = str(res)
                 else:
-                    raise Exception(
-                        "unexpected sequence data type {}".format(dtype))
+                    raise Exception("unexpected sequence data type {}".format(dtype))
                 output_data.append(
-                    {output0: [res if dtype != np.dtype(object) else str(res)]})
+                    {output0: [res if dtype != np.dtype(object) else str(res)]}
+                )
         data = {"data": [input_data]}
         data["validation_data"] = [output_data]
 
         # Only write to a file if there isn't validation file for the model
         PerfAnalyzerScenario.generation_mutex_.acquire()
         if not os.path.exists(data_filename):
-            with open(data_filename, 'w') as f:
+            with open(data_filename, "w") as f:
                 json.dump(data, f)
         PerfAnalyzerScenario.generation_mutex_.release()
 
@@ -311,43 +330,26 @@ def generate_identity_data(self, trial, dtype, data_filename):
             elif dtype == np.dtype(object):
                 res = str(i)
             else:
-                raise Exception(
-                    "unexpected identity data type {}".format(dtype))
+                raise Exception("unexpected identity data type {}".format(dtype))
             io_data.append(res)
         data = {
-            "data": [{
-                input0: {
-                    "content": io_data,
-                    "shape": [16]
-                }
-            }],
-            "validation_data": [{
-                output0: {
-                    "content": io_data,
-                    "shape": [16]
-                }
-            }]
+            "data": [{input0: {"content": io_data, "shape": [16]}}],
+            "validation_data": [{output0: {"content": io_data, "shape": [16]}}],
         }
         # Only write to a file if there isn't validation file for the model
         PerfAnalyzerScenario.generation_mutex_.acquire()
         if not os.path.exists(data_filename):
-            with open(data_filename, 'w') as f:
+            with open(data_filename, "w") as f:
                 json.dump(data, f)
         PerfAnalyzerScenario.generation_mutex_.release()
 
     def run(self, client_metadata):
         model_option = np.random.choice(self.options_)
-        return model_option.run(self.name_, self.sequence_id_range_,
-                                self.out_stream_)
+        return model_option.run(self.name_, self.sequence_id_range_, self.out_stream_)
 
 
 class ResNetScenario(Scenario):
-
-    def __init__(self,
-                 name,
-                 batch_size=32,
-                 verbose=False,
-                 out_stream=sys.stdout):
+    def __init__(self, name, batch_size=32, verbose=False, out_stream=sys.stdout):
         super().__init__(name, [], verbose, out_stream)
         self.model_name_ = "resnet_v1_50_graphdef_def"
         self.batch_size_ = batch_size
@@ -360,7 +362,7 @@ def __init__(self,
 
     def preprocess(self, filename):
         img = Image.open(filename)
-        resized_img = img.convert('RGB').resize((224, 224), Image.BILINEAR)
+        resized_img = img.convert("RGB").resize((224, 224), Image.BILINEAR)
         np_img = np.array(resized_img).astype(np.float32)
         if np_img.ndim == 2:
             np_img = np_img[:, :, np.newaxis]
@@ -370,31 +372,35 @@ def preprocess(self, filename):
     def postprocess(self, results):
         output_array = results.as_numpy("resnet_v1_50/predictions/Softmax")
         if len(output_array) != self.batch_size_:
-            raise Exception("expected {} results, got {}".format(
-                self.batch_size_, len(output_array)))
+            raise Exception(
+                "expected {} results, got {}".format(
+                    self.batch_size_, len(output_array)
+                )
+            )
 
         for results in output_array:
             for result in results:
                 if output_array.dtype.type == np.object_:
-                    cls = "".join(chr(x) for x in result).split(':')
+                    cls = "".join(chr(x) for x in result).split(":")
                 else:
-                    cls = result.split(':')
+                    cls = result.split(":")
                 if cls[2] != "VULTURE":
                     raise Exception(
-                        "expected VULTURE as classification result, got {}".
-                        format(cls[2]))
+                        "expected VULTURE as classification result, got {}".format(
+                            cls[2]
+                        )
+                    )
 
     def run(self, client_metadata):
         triton_client = client_metadata[0]
 
-        inputs = [
-            grpcclient.InferInput("input", self.image_data_.shape, "FP32")
-        ]
+        inputs = [grpcclient.InferInput("input", self.image_data_.shape, "FP32")]
         inputs[0].set_data_from_numpy(self.image_data_)
 
         outputs = [
-            grpcclient.InferRequestedOutput("resnet_v1_50/predictions/Softmax",
-                                            class_count=1)
+            grpcclient.InferRequestedOutput(
+                "resnet_v1_50/predictions/Softmax", class_count=1
+            )
         ]
         res = triton_client.infer(self.model_name_, inputs, outputs=outputs)
         self.postprocess(res)
@@ -402,14 +408,15 @@ def run(self, client_metadata):
 
 
 class TimeoutScenario(Scenario):
-
-    def __init__(self,
-                 name,
-                 trials,
-                 input_dtype=np.float32,
-                 input_name="INPUT0",
-                 verbose=False,
-                 out_stream=sys.stdout):
+    def __init__(
+        self,
+        name,
+        trials,
+        input_dtype=np.float32,
+        input_name="INPUT0",
+        verbose=False,
+        out_stream=sys.stdout,
+    ):
         super().__init__(name, trials, verbose, out_stream)
         self.input_dtype_ = input_dtype
         self.input_name_ = input_name
@@ -422,12 +429,16 @@ def run(self, client_metadata):
         if "librotch" in trial:
             input_name = "INPUT__0"
 
-        tensor_shape = (math.trunc(1 * (1024 * 1024 * 1024) //
-                                   np.dtype(self.input_dtype_).itemsize),)
+        tensor_shape = (
+            math.trunc(
+                1 * (1024 * 1024 * 1024) // np.dtype(self.input_dtype_).itemsize
+            ),
+        )
         in0 = np.random.random(tensor_shape).astype(self.input_dtype_)
         inputs = [
-            grpcclient.InferInput(input_name, tensor_shape,
-                                  np_to_triton_dtype(self.input_dtype_)),
+            grpcclient.InferInput(
+                input_name, tensor_shape, np_to_triton_dtype(self.input_dtype_)
+            ),
         ]
         inputs[0].set_data_from_numpy(in0)
 
@@ -443,12 +454,11 @@ def run(self, client_metadata):
 
 
 class CrashingScenario(Scenario):
-
     def __init__(self, name, verbose=False, out_stream=sys.stdout):
         super().__init__(name, [], verbose, out_stream)
 
     def run(self, client_metadata):
-        # Only use "custom" model as it simulates exectuion delay which
+        # Only use "custom" model as it simulates execution delay which
         # simplifies "crashing simulation" (client exits while request is being
         # executed)
         trial = "custom"
@@ -456,8 +466,7 @@ def run(self, client_metadata):
         # Call the client as subprocess to avoid crashing stress test
         # and gather logging as string variable
         crashing_client = "crashing_client.py"
-        log = subprocess.check_output(
-            [sys.executable, crashing_client, "-t", trial])
+        log = subprocess.check_output([sys.executable, crashing_client, "-t", trial])
         result = self.parse_result(log.decode("utf-8"))
         if not result[1]:
             assert False, "crashing_client failed {}".format(self.name_)
@@ -472,22 +481,20 @@ def parse_result(self, log):
         if "request_count:" in log:
             idx_start = log.rindex("request_count:")
             idx_start = log.find(" ", idx_start)
-            idx_end = log.find('\n', idx_start)
-            request_count = int(log[idx_start + 1:idx_end])
+            idx_end = log.find("\n", idx_start)
+            request_count = int(log[idx_start + 1 : idx_end])
 
         if "live:" in log:
             idx_start = log.rindex("live:")
             idx_start = log.find(" ", idx_start)
-            idx_end = log.find('\n', idx_start)
-            is_server_live = log[idx_start + 1:idx_end]
+            idx_end = log.find("\n", idx_start)
+            is_server_live = log[idx_start + 1 : idx_end]
 
         return (request_count, is_server_live == "true")
 
 
 class SequenceScenario(Scenario):
-
     class UserData:
-
         def __init__(self):
             self._completed_requests = queue.Queue()
 
@@ -498,51 +505,63 @@ def __init__(self):
     def check_constraints(self, model_name, sequence_id):
         pass
 
-    def __init__(self,
-                 name,
-                 trials,
-                 rng,
-                 sequence_constraints,
-                 verbose=False,
-                 out_stream=sys.stdout):
+    def __init__(
+        self,
+        name,
+        trials,
+        rng,
+        sequence_constraints,
+        verbose=False,
+        out_stream=sys.stdout,
+    ):
         super().__init__(name, trials, verbose, out_stream)
         self.rng_ = rng
         self.sequence_constraints_ = sequence_constraints
 
     def get_expected_result(self, expected_result, value, trial, flag_str=None):
         # Adjust the expected_result for models that
-        # couldn't implement the full accumulator. See
+        # could not implement the full accumulator. See
         # qa/common/gen_qa_sequence_models.py for more
         # information.
-        if (("nobatch" not in trial and
-             ("custom" not in trial)) or ("graphdef" in trial) or
-            ("plan" in trial) or ("onnx" in trial)) or ("libtorch" in trial):
+        if (
+            ("nobatch" not in trial and ("custom" not in trial))
+            or ("graphdef" in trial)
+            or ("plan" in trial)
+            or ("onnx" in trial)
+        ) or ("libtorch" in trial):
             expected_result = value
             if (flag_str is not None) and ("start" in flag_str):
                 expected_result += 1
         return expected_result
 
-    def check_sequence_async(self,
-                             client_metadata,
-                             trial,
-                             model_name,
-                             input_dtype,
-                             steps,
-                             timeout_ms=DEFAULT_TIMEOUT_MS,
-                             batch_size=1,
-                             sequence_name="",
-                             tensor_shape=(1,),
-                             input_name="INPUT",
-                             output_name="OUTPUT"):
+    def check_sequence_async(
+        self,
+        client_metadata,
+        trial,
+        model_name,
+        input_dtype,
+        steps,
+        timeout_ms=DEFAULT_TIMEOUT_MS,
+        batch_size=1,
+        sequence_name="",
+        tensor_shape=(1,),
+        input_name="INPUT",
+        output_name="OUTPUT",
+    ):
         """Perform sequence of inferences using async run. The 'steps' holds
         a list of tuples, one for each inference with format:
 
         (flag_str, value, expected_result, delay_ms)
 
         """
-        if (("savedmodel" not in trial) and ("graphdef" not in trial) and
-            ("custom" not in trial) and ("onnx" not in trial) and
-            ("libtorch" not in trial) and ("plan" not in trial)):
+        if (
+            ("savedmodel" not in trial)
+            and ("graphdef" not in trial)
+            and ("custom" not in trial)
+            and ("onnx" not in trial)
+            and ("libtorch" not in trial)
+            and ("plan" not in trial)
+        ):
             assert False, "unknown trial type: " + trial
 
         if "nobatch" not in trial:
@@ -566,28 +585,30 @@ def check_sequence_async(self,
             seq_start = False
             seq_end = False
             if flag_str is not None:
-                seq_start = ("start" in flag_str)
-                seq_end = ("end" in flag_str)
+                seq_start = "start" in flag_str
+                seq_end = "end" in flag_str
 
             if input_dtype == np.object_:
                 in0 = np.full(tensor_shape, value, dtype=np.int32)
-                in0n = np.array([str(x) for x in in0.reshape(in0.size)],
-                                dtype=object)
+                in0n = np.array([str(x) for x in in0.reshape(in0.size)], dtype=object)
                 in0 = in0n.reshape(tensor_shape)
             else:
                 in0 = np.full(tensor_shape, value, dtype=input_dtype)
 
             inputs = [
-                grpcclient.InferInput(input_name, tensor_shape,
-                                      np_to_triton_dtype(input_dtype)),
+                grpcclient.InferInput(
+                    input_name, tensor_shape, np_to_triton_dtype(input_dtype)
+                ),
             ]
             inputs[0].set_data_from_numpy(in0)
 
-            triton_client.async_stream_infer(model_name,
-                                             inputs,
-                                             sequence_id=sequence_id,
-                                             sequence_start=seq_start,
-                                             sequence_end=seq_end)
+            triton_client.async_stream_infer(
+                model_name,
+                inputs,
+                sequence_id=sequence_id,
+                sequence_start=seq_start,
+                sequence_end=seq_end,
+            )
             sent_count += 1
 
             if delay_ms is not None:
@@ -608,49 +629,62 @@ def check_sequence_async(self,
                 if (now_ms - seq_start_ms) > timeout_ms:
                     raise TimeoutException(
                         "Timeout expired for {}, got {} ms".format(
-                            sequence_name, (now_ms - seq_start_ms)))
-
-            result = results.as_numpy(
-                output_name)[0] if "nobatch" in trial else results.as_numpy(
-                    output_name)[0][0]
+                            sequence_name, (now_ms - seq_start_ms)
+                        )
+                    )
+
+            result = (
+                results.as_numpy(output_name)[0]
+                if "nobatch" in trial
+                else results.as_numpy(output_name)[0][0]
+            )
             if self.verbose_:
-                print("{} {}: + {} = {}".format(sequence_name, sequence_id,
-                                                value, result),
-                      file=self.out_stream_)
+                print(
+                    "{} {}: + {} = {}".format(
+                        sequence_name, sequence_id, value, result
+                    ),
+                    file=self.out_stream_,
+                )
 
             if expected is not None:
                 if input_dtype == np.object_:
-                    assert int(
-                        result
-                    ) == expected, "{}: expected result {}, got {} {} {}".format(
-                        sequence_name, expected, int(result), trial, model_name)
+                    assert (
+                        int(result) == expected
+                    ), "{}: expected result {}, got {} {} {}".format(
+                        sequence_name, expected, int(result), trial, model_name
+                    )
                 else:
-                    assert result == expected, "{}: expected result {}, got {} {} {}".format(
-                        sequence_name, expected, result, trial, model_name)
+                    assert (
+                        result == expected
+                    ), "{}: expected result {}, got {} {} {}".format(
+                        sequence_name, expected, result, trial, model_name
+                    )
         triton_client.stop_stream()
         return sent_count
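
As a usage sketch (not part of the test itself), this is roughly how the run() methods below assemble the (flag_str, value, expected_result, delay_ms) tuples that check_sequence_async consumes; the sequence length and values are illustrative and assume the full-accumulator expectation, before any get_expected_result adjustment:

    import numpy as np

    rng = np.random.RandomState(5)
    values = rng.randint(0, 1024 * 1024, size=3).astype(np.int32)

    steps = []
    expected_result = 0
    for idx, val in enumerate(values):
        # "start" on the first inference, "end" on the last one
        flags = []
        if idx == 0:
            flags.append("start")
        if idx == len(values) - 1:
            flags.append("end")
        flag_str = ",".join(flags) if flags else None

        expected_result += int(val)
        # (flag_str, value, expected_result, delay_ms)
        steps.append((flag_str, int(val), expected_result, None))

    print(steps)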
 
 
 class SequenceNoEndScenario(SequenceScenario):
-
-    def __init__(self,
-                 name,
-                 trials,
-                 rng,
-                 sequence_constraints,
-                 verbose=False,
-                 out_stream=sys.stdout):
-        super().__init__(name, trials, rng, sequence_constraints, verbose,
-                         out_stream)
+    def __init__(
+        self,
+        name,
+        trials,
+        rng,
+        sequence_constraints,
+        verbose=False,
+        out_stream=sys.stdout,
+    ):
+        super().__init__(name, trials, rng, sequence_constraints, verbose, out_stream)
 
     def check_constraints(self, model_name, sequence_id):
         # The scenario can always be run regardless of the previous runs
         return True
 
-    def run(self,
-            client_metadata,
-            len_mean=SEQUENCE_LENGTH_MEAN,
-            len_stddev=SEQUENCE_LENGTH_STDEV):
+    def run(
+        self,
+        client_metadata,
+        len_mean=SEQUENCE_LENGTH_MEAN,
+        len_stddev=SEQUENCE_LENGTH_STDEV,
+    ):
         trial = self.get_trial()
         dtype = self.get_datatype(trial)
         model_name = tu.get_sequence_model_name(trial, dtype)
@@ -666,9 +700,10 @@ def run(self,
         # never ends. The sequence should be aborted by the server and its
         # slot reused for another sequence.
         seqlen = max(1, int(self.rng_.normal(len_mean, len_stddev)))
-        print("{} {}: no-end seqlen = {}".format(self.name_, client_metadata[1],
-                                                 seqlen),
-              file=self.out_stream_)
+        print(
+            "{} {}: no-end seqlen = {}".format(self.name_, client_metadata[1], seqlen),
+            file=self.out_stream_,
+        )
 
         values = self.rng_.randint(0, 1024 * 1024, size=seqlen).astype(dtype)
 
@@ -683,40 +718,42 @@ def run(self,
             val = values[idx]
             delay_ms = None
             expected_result += val
-            expected_result = self.get_expected_result(expected_result, val,
-                                                       trial, flags)
+            expected_result = self.get_expected_result(
+                expected_result, val, trial, flags
+            )
 
             # (flag_str, value, expected_result, delay_ms)
-            steps.append((flags, val, expected_result, delay_ms),)
+            steps.append(
+                (flags, val, expected_result, delay_ms),
+            )
 
-        return self.check_sequence_async(client_metadata,
-                                         trial,
-                                         model_name,
-                                         dtype,
-                                         steps,
-                                         sequence_name=self.name_)
+        return self.check_sequence_async(
+            client_metadata, trial, model_name, dtype, steps, sequence_name=self.name_
+        )
 
 
 class SequenceValidNoEndScenario(SequenceScenario):
-
-    def __init__(self,
-                 name,
-                 trials,
-                 rng,
-                 sequence_constraints,
-                 verbose=False,
-                 out_stream=sys.stdout):
-        super().__init__(name, trials, rng, sequence_constraints, verbose,
-                         out_stream)
+    def __init__(
+        self,
+        name,
+        trials,
+        rng,
+        sequence_constraints,
+        verbose=False,
+        out_stream=sys.stdout,
+    ):
+        super().__init__(name, trials, rng, sequence_constraints, verbose, out_stream)
 
     def check_constraints(self, model_name, sequence_id):
         # The scenario can always be run regardless of the previous runs
         return True
 
-    def run(self,
-            client_metadata,
-            len_mean=SEQUENCE_LENGTH_MEAN,
-            len_stddev=SEQUENCE_LENGTH_STDEV):
+    def run(
+        self,
+        client_metadata,
+        len_mean=SEQUENCE_LENGTH_MEAN,
+        len_stddev=SEQUENCE_LENGTH_STDEV,
+    ):
         trial = self.get_trial()
         dtype = self.get_datatype(trial)
         model_name = tu.get_sequence_model_name(trial, dtype)
@@ -733,15 +770,18 @@ def run(self,
         # sequences use the same correlation ID and are sent back-to-back.
         seqlen = [
             max(1, int(self.rng_.normal(len_mean, len_stddev))),
-            max(1, int(self.rng_.normal(len_mean, len_stddev)))
+            max(1, int(self.rng_.normal(len_mean, len_stddev))),
         ]
-        print("{} {}: valid-no-end seqlen[0] = {}, seqlen[1] = {}".format(
-            self.name_, client_metadata[1], seqlen[0], seqlen[1]),
-              file=self.out_stream_)
+        print(
+            "{} {}: valid-no-end seqlen[0] = {}, seqlen[1] = {}".format(
+                self.name_, client_metadata[1], seqlen[0], seqlen[1]
+            ),
+            file=self.out_stream_,
+        )
 
         values = [
             self.rng_.randint(0, 1024 * 1024, size=seqlen[0]).astype(dtype),
-            self.rng_.randint(0, 1024 * 1024, size=seqlen[1]).astype(dtype)
+            self.rng_.randint(0, 1024 * 1024, size=seqlen[1]).astype(dtype),
         ]
 
         for p in [0, 1]:
@@ -759,39 +799,41 @@ def run(self,
                 delay_ms = None
                 expected_result += val
                 expected_result = self.get_expected_result(
-                    expected_result, val, trial, flags)
+                    expected_result, val, trial, flags
+                )
 
                 # (flag_str, value, expected_result, delay_ms)
-                steps.append((flags, val, expected_result, delay_ms),)
+                steps.append(
+                    (flags, val, expected_result, delay_ms),
+                )
 
-        return self.check_sequence_async(client_metadata,
-                                         trial,
-                                         model_name,
-                                         dtype,
-                                         steps,
-                                         sequence_name=self.name_)
+        return self.check_sequence_async(
+            client_metadata, trial, model_name, dtype, steps, sequence_name=self.name_
+        )
 
 
 class SequenceValidValidScenario(SequenceScenario):
-
-    def __init__(self,
-                 name,
-                 trials,
-                 rng,
-                 sequence_constraints,
-                 verbose=False,
-                 out_stream=sys.stdout):
-        super().__init__(name, trials, rng, sequence_constraints, verbose,
-                         out_stream)
+    def __init__(
+        self,
+        name,
+        trials,
+        rng,
+        sequence_constraints,
+        verbose=False,
+        out_stream=sys.stdout,
+    ):
+        super().__init__(name, trials, rng, sequence_constraints, verbose, out_stream)
 
     def check_constraints(self, model_name, sequence_id):
         # The scenario can always be run regardless of the previous runs
         return True
 
-    def run(self,
-            client_metadata,
-            len_mean=SEQUENCE_LENGTH_MEAN,
-            len_stddev=SEQUENCE_LENGTH_STDEV):
+    def run(
+        self,
+        client_metadata,
+        len_mean=SEQUENCE_LENGTH_MEAN,
+        len_stddev=SEQUENCE_LENGTH_STDEV,
+    ):
         trial = self.get_trial()
         dtype = self.get_datatype(trial)
         model_name = tu.get_sequence_model_name(trial, dtype)
@@ -808,15 +850,18 @@ def run(self,
         # sent back-to-back.
         seqlen = [
             max(1, int(self.rng_.normal(len_mean, len_stddev))),
-            max(1, int(self.rng_.normal(len_mean, len_stddev)))
+            max(1, int(self.rng_.normal(len_mean, len_stddev))),
         ]
-        print("{} {}: valid-valid seqlen[0] = {}, seqlen[1] = {}".format(
-            self.name_, client_metadata[1], seqlen[0], seqlen[1]),
-              file=self.out_stream_)
+        print(
+            "{} {}: valid-valid seqlen[0] = {}, seqlen[1] = {}".format(
+                self.name_, client_metadata[1], seqlen[0], seqlen[1]
+            ),
+            file=self.out_stream_,
+        )
 
         values = [
             self.rng_.randint(0, 1024 * 1024, size=seqlen[0]).astype(dtype),
-            self.rng_.randint(0, 1024 * 1024, size=seqlen[1]).astype(dtype)
+            self.rng_.randint(0, 1024 * 1024, size=seqlen[1]).astype(dtype),
         ]
 
         for p in [0, 1]:
@@ -834,30 +879,30 @@ def run(self,
                 delay_ms = None
                 expected_result += val
                 expected_result = self.get_expected_result(
-                    expected_result, val, trial, flags)
+                    expected_result, val, trial, flags
+                )
 
                 # (flag_str, value, expected_result, delay_ms)
-                steps.append((flags, val, expected_result, delay_ms),)
+                steps.append(
+                    (flags, val, expected_result, delay_ms),
+                )
 
-        return self.check_sequence_async(client_metadata,
-                                         trial,
-                                         model_name,
-                                         dtype,
-                                         steps,
-                                         sequence_name=self.name_)
+        return self.check_sequence_async(
+            client_metadata, trial, model_name, dtype, steps, sequence_name=self.name_
+        )
 
 
 class SequenceNoStartScenario(SequenceScenario):
-
-    def __init__(self,
-                 name,
-                 trials,
-                 rng,
-                 sequence_constraints,
-                 verbose=False,
-                 out_stream=sys.stdout):
-        super().__init__(name, trials, rng, sequence_constraints, verbose,
-                         out_stream)
+    def __init__(
+        self,
+        name,
+        trials,
+        rng,
+        sequence_constraints,
+        verbose=False,
+        out_stream=sys.stdout,
+    ):
+        super().__init__(name, trials, rng, sequence_constraints, verbose, out_stream)
 
     def check_constraints(self, model_name, sequence_id):
         # no-start cannot follow no-end since the server will
@@ -865,7 +910,8 @@ def check_constraints(self, model_name, sequence_id):
         # the no-end sequence instead of being a sequence
         # missing start flag.
         if (model_name in self.sequence_constraints_) and (
-                sequence_id in self.sequence_constraints_[model_name]):
+            sequence_id in self.sequence_constraints_[model_name]
+        ):
             return not self.sequence_constraints_[model_name][sequence_id]
         return True
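
To make the bookkeeping concrete, here is a small self-contained sketch of the nested dictionary this check consults; the shape is inferred from the lookup above, and the model name and correlation IDs are hypothetical:

    # model name -> correlation ID -> "last run left this ID in a no-end sequence"
    is_last_used_no_end = {"plan_sequence_int32": {1001: True, 1002: False}}

    def can_run_no_start(constraints, model_name, sequence_id):
        if (model_name in constraints) and (sequence_id in constraints[model_name]):
            return not constraints[model_name][sequence_id]
        return True

    print(can_run_no_start(is_last_used_no_end, "plan_sequence_int32", 1001))  # False
    print(can_run_no_start(is_last_used_no_end, "plan_sequence_int32", 1002))  # True
    print(can_run_no_start(is_last_used_no_end, "plan_sequence_int32", 9999))  # True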
 
@@ -884,9 +930,12 @@ def run(self, client_metadata):
         # Create a sequence without a "start" flag. Sequence should get an
         # error from the server.
         seqlen = 1
-        print("{} {}: no-start seqlen = {}".format(self.name_,
-                                                   client_metadata[1], seqlen),
-              file=self.out_stream_)
+        print(
+            "{} {}: no-start seqlen = {}".format(
+                self.name_, client_metadata[1], seqlen
+            ),
+            file=self.out_stream_,
+        )
 
         values = self.rng_.randint(0, 1024 * 1024, size=seqlen).astype(dtype)
 
@@ -898,11 +947,12 @@ def run(self, client_metadata):
             delay_ms = None
 
             # (flag_str, value, expected_result, delay_ms)
-            steps.append((flags, val, None, delay_ms),)
+            steps.append(
+                (flags, val, None, delay_ms),
+            )
 
         try:
-            self.check_sequence_async(client_metadata, trial, model_name, dtype,
-                                      steps)
+            self.check_sequence_async(client_metadata, trial, model_name, dtype, steps)
             # Hit this point if sending no-start sequence to sequence id that
             # was used for no-end sequence and that means the constraints check
             # is inaccurate
@@ -915,25 +965,27 @@ def run(self, client_metadata):
 
 
 class SequenceValidScenario(SequenceScenario):
-
-    def __init__(self,
-                 name,
-                 trials,
-                 rng,
-                 sequence_constraints,
-                 verbose=False,
-                 out_stream=sys.stdout):
-        super().__init__(name, trials, rng, sequence_constraints, verbose,
-                         out_stream)
+    def __init__(
+        self,
+        name,
+        trials,
+        rng,
+        sequence_constraints,
+        verbose=False,
+        out_stream=sys.stdout,
+    ):
+        super().__init__(name, trials, rng, sequence_constraints, verbose, out_stream)
 
     def check_constraints(self, model_name, sequence_id):
         # The scenario can always be run regardless of the previous runs
         return True
 
-    def run(self,
-            client_metadata,
-            len_mean=SEQUENCE_LENGTH_MEAN,
-            len_stddev=SEQUENCE_LENGTH_STDEV):
+    def run(
+        self,
+        client_metadata,
+        len_mean=SEQUENCE_LENGTH_MEAN,
+        len_stddev=SEQUENCE_LENGTH_STDEV,
+    ):
         trial = self.get_trial()
         dtype = self.get_datatype(trial)
         model_name = tu.get_sequence_model_name(trial, dtype)
@@ -947,9 +999,10 @@ def run(self,
 
         # Create a variable length sequence with "start" and "end" flags.
         seqlen = max(1, int(self.rng_.normal(len_mean, len_stddev)))
-        print("{} {}: valid seqlen = {}".format(self.name_, client_metadata[1],
-                                                seqlen),
-              file=self.out_stream_)
+        print(
+            "{} {}: valid seqlen = {}".format(self.name_, client_metadata[1], seqlen),
+            file=self.out_stream_,
+        )
 
         values = self.rng_.randint(0, 1024 * 1024, size=seqlen).astype(dtype)
 
@@ -966,15 +1019,15 @@ def run(self,
             val = values[idx]
             delay_ms = None
             expected_result += val
-            expected_result = self.get_expected_result(expected_result, val,
-                                                       trial, flags)
+            expected_result = self.get_expected_result(
+                expected_result, val, trial, flags
+            )
 
             # (flag_str, value, expected_result, delay_ms)
-            steps.append((flags, val, expected_result, delay_ms),)
-
-        return self.check_sequence_async(client_metadata,
-                                         trial,
-                                         model_name,
-                                         dtype,
-                                         steps,
-                                         sequence_name=self.name_)
+            steps.append(
+                (flags, val, expected_result, delay_ms),
+            )
+
+        return self.check_sequence_async(
+            client_metadata, trial, model_name, dtype, steps, sequence_name=self.name_
+        )
diff --git a/qa/L0_long_running_stress/stress.py b/qa/L0_long_running_stress/stress.py
old mode 100644
new mode 100755
index a3713b4b0e..978f204ee6
--- a/qa/L0_long_running_stress/stress.py
+++ b/qa/L0_long_running_stress/stress.py
@@ -1,4 +1,6 @@
-# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -32,20 +34,20 @@
 
 import argparse
 import bisect
-from builtins import range
-from builtins import str
 import os
-import time
 import threading
+import time
 import traceback
-import numpy as np
+from builtins import range, str
 from functools import partial
-import tritonclient.grpc as grpcclient
+
+import numpy as np
 import prettytable
+import tritonclient.grpc as grpcclient
 
 FLAGS = None
 CORRELATION_ID_BLOCK_SIZE = 1024 * 1024
-BACKENDS = os.environ.get('BACKENDS', "graphdef savedmodel onnx plan")
+BACKENDS = os.environ.get("BACKENDS", "graphdef savedmodel onnx plan")
 
 _thread_exceptions = []
 _thread_exceptions_mutex = threading.Lock()
@@ -63,24 +65,26 @@
 def get_trials(is_sequence=True):
     _trials = ()
     if is_sequence:
-        for backend in BACKENDS.split(' '):
-            if (backend != "libtorch") and (backend != 'savedmodel'):
+        for backend in BACKENDS.split(" "):
+            if (backend != "libtorch") and (backend != "savedmodel"):
                 _trials += (backend + "_nobatch",)
             _trials += (backend,)
     else:
         _trials = ()
-        for backend in BACKENDS.split(' '):
-            if (backend != "libtorch"):
+        for backend in BACKENDS.split(" "):
+            if backend != "libtorch":
                 _trials += (backend + "_nobatch",)
     return _trials
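
For the default BACKENDS value above, the helper expands to the following tuples (a worked example of the loop, not additional test code):

    # BACKENDS = "graphdef savedmodel onnx plan"
    #
    # get_trials(True)  -> ("graphdef_nobatch", "graphdef",
    #                       "savedmodel",
    #                       "onnx_nobatch", "onnx",
    #                       "plan_nobatch", "plan")
    #
    # get_trials(False) -> ("graphdef_nobatch", "savedmodel_nobatch",
    #                       "onnx_nobatch", "plan_nobatch")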
 
 
-def update_test_count(test_case_count,
-                      failed_test_case_count,
-                      request_count,
-                      test_case_name,
-                      success=True,
-                      count=1):
+def update_test_count(
+    test_case_count,
+    failed_test_case_count,
+    request_count,
+    test_case_name,
+    success=True,
+    count=1,
+):
     if success:
         # Count the times each test case runs
         if test_case_name in test_case_count:
@@ -102,7 +106,6 @@ def update_test_count(test_case_count,
 
 
 class ScenarioSelector:
-
     def __init__(self, probs, rng):
         self.rng_ = rng
         self.probs_range_ = []
@@ -119,20 +122,24 @@ def __init__(self, probs, rng):
             self.probs_range_[i] /= total_weight
 
     def get_scenario(self):
-        return self.scenarios_[bisect.bisect_left(self.probs_range_,
-                                                  self.rng_.rand())]
+        return self.scenarios_[bisect.bisect_left(self.probs_range_, self.rng_.rand())]
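
A standalone sketch of the same cumulative-weight lookup, assuming the normalized running totals that the constructor appears to build; the weights and scenario labels below are illustrative only:

    import bisect

    import numpy as np

    # probs_range holds normalized running totals, so bisect_left maps a
    # uniform draw in [0, 1) to the matching scenario slot.
    weights = [60, 80, 300]
    scenarios = ["timeout", "resnet", "perf_analyzer"]

    total = float(sum(weights))
    probs_range = []
    running = 0
    for w in weights:
        running += w
        probs_range.append(running / total)

    rng = np.random.RandomState(0)
    r = rng.rand()
    print(r, scenarios[bisect.bisect_left(probs_range, r)])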
 
 
-def stress_thread(name, seed, correlation_id_base, test_case_count,
-                  failed_test_case_count, sequence_request_count):
+def stress_thread(
+    name,
+    seed,
+    correlation_id_base,
+    test_case_count,
+    failed_test_case_count,
+    sequence_request_count,
+):
     # Thread responsible for generating sequences of inference
     # requests.
     global _thread_exceptions
 
     # Write any thread output to dedicated file
-    with open("{}.log".format(name), 'w') as out_file:
-        print("Starting thread {} with seed {}".format(name, seed),
-              file=out_file)
+    with open("{}.log".format(name), "w") as out_file:
+        print("Starting thread {} with seed {}".format(name, seed), file=out_file)
         rng = np.random.RandomState(seed)
 
         # FIXME revisit to check if it is necessary
@@ -151,74 +158,111 @@ def stress_thread(name, seed, correlation_id_base, test_case_count,
         rare_cnt = 8
         is_last_used_no_end = {}
 
-        update_counter_fn = partial(update_test_count, test_case_count,
-                                    failed_test_case_count,
-                                    sequence_request_count)
+        update_counter_fn = partial(
+            update_test_count,
+            test_case_count,
+            failed_test_case_count,
+            sequence_request_count,
+        )
         for c in range(common_cnt + rare_cnt):
             client_metadata_list.append(
-                (grpcclient.InferenceServerClient("localhost:8001",
-                                                  verbose=FLAGS.verbose),
-                 correlation_id_base + c))
+                (
+                    grpcclient.InferenceServerClient(
+                        "localhost:8001", verbose=FLAGS.verbose
+                    ),
+                    correlation_id_base + c,
+                )
+            )
         pa_start_seq_id = correlation_id_base + common_cnt + rare_cnt
         pa_end_seq_id = correlation_id_base + CORRELATION_ID_BLOCK_SIZE
 
         # Weight roughly in thousandth percent
-        ss = ScenarioSelector([
-            (60,
-             TimeoutScenario(name,
-                             get_trials(False),
-                             verbose=FLAGS.verbose,
-                             out_stream=out_file)),
-            (80, ResNetScenario(
-                name, verbose=FLAGS.verbose, out_stream=out_file)),
-            (60,
-             CrashingScenario(name, verbose=FLAGS.verbose,
-                              out_stream=out_file)),
-            (62,
-             SequenceNoEndScenario(name,
-                                   get_trials(),
-                                   rng,
-                                   is_last_used_no_end,
-                                   verbose=FLAGS.verbose,
-                                   out_stream=out_file)),
-            (68,
-             SequenceValidNoEndScenario(name,
-                                        get_trials(),
-                                        rng,
-                                        is_last_used_no_end,
-                                        verbose=FLAGS.verbose,
-                                        out_stream=out_file)),
-            (68,
-             SequenceValidValidScenario(name,
-                                        get_trials(),
-                                        rng,
-                                        is_last_used_no_end,
-                                        verbose=FLAGS.verbose,
-                                        out_stream=out_file)),
-            (7,
-             SequenceNoStartScenario(name,
-                                     get_trials(),
-                                     rng,
-                                     is_last_used_no_end,
-                                     verbose=FLAGS.verbose,
-                                     out_stream=out_file)),
-            (295,
-             SequenceValidScenario(name,
-                                   get_trials(),
-                                   rng,
-                                   is_last_used_no_end,
-                                   verbose=FLAGS.verbose,
-                                   out_stream=out_file)),
-            (300,
-             PerfAnalyzerScenario(
-                 name,
-                 rng,
-                 get_trials(),
-                 get_trials(False),
-                 sequence_id_range=(pa_start_seq_id, pa_end_seq_id),
-                 verbose=FLAGS.verbose,
-                 out_stream=out_file)),
-        ], rng)
+        ss = ScenarioSelector(
+            [
+                (
+                    60,
+                    TimeoutScenario(
+                        name,
+                        get_trials(False),
+                        verbose=FLAGS.verbose,
+                        out_stream=out_file,
+                    ),
+                ),
+                (80, ResNetScenario(name, verbose=FLAGS.verbose, out_stream=out_file)),
+                (
+                    60,
+                    CrashingScenario(name, verbose=FLAGS.verbose, out_stream=out_file),
+                ),
+                (
+                    62,
+                    SequenceNoEndScenario(
+                        name,
+                        get_trials(),
+                        rng,
+                        is_last_used_no_end,
+                        verbose=FLAGS.verbose,
+                        out_stream=out_file,
+                    ),
+                ),
+                (
+                    68,
+                    SequenceValidNoEndScenario(
+                        name,
+                        get_trials(),
+                        rng,
+                        is_last_used_no_end,
+                        verbose=FLAGS.verbose,
+                        out_stream=out_file,
+                    ),
+                ),
+                (
+                    68,
+                    SequenceValidValidScenario(
+                        name,
+                        get_trials(),
+                        rng,
+                        is_last_used_no_end,
+                        verbose=FLAGS.verbose,
+                        out_stream=out_file,
+                    ),
+                ),
+                (
+                    7,
+                    SequenceNoStartScenario(
+                        name,
+                        get_trials(),
+                        rng,
+                        is_last_used_no_end,
+                        verbose=FLAGS.verbose,
+                        out_stream=out_file,
+                    ),
+                ),
+                (
+                    295,
+                    SequenceValidScenario(
+                        name,
+                        get_trials(),
+                        rng,
+                        is_last_used_no_end,
+                        verbose=FLAGS.verbose,
+                        out_stream=out_file,
+                    ),
+                ),
+                (
+                    300,
+                    PerfAnalyzerScenario(
+                        name,
+                        rng,
+                        get_trials(),
+                        get_trials(False),
+                        sequence_id_range=(pa_start_seq_id, pa_end_seq_id),
+                        verbose=FLAGS.verbose,
+                        out_stream=out_file,
+                    ),
+                ),
+            ],
+            rng,
+        )
 
         rare_idx = 0
         common_idx = 0
@@ -241,8 +285,9 @@ def stress_thread(name, seed, correlation_id_base, test_case_count,
                 update_counter_fn(scenario.scenario_name(), False)
                 _thread_exceptions_mutex.acquire()
                 try:
-                    _thread_exceptions.append((name, scenario.scenario_name(),
-                                               traceback.format_exc()))
+                    _thread_exceptions.append(
+                        (name, scenario.scenario_name(), traceback.format_exc())
+                    )
                 finally:
                     _thread_exceptions_mutex.release()
 
@@ -256,36 +301,52 @@ def stress_thread(name, seed, correlation_id_base, test_case_count,
         print("Exiting thread {}".format(name), file=out_file)
 
 
-def load_thread(name, seed, correlation_id_base, test_case_count,
-                failed_test_case_count, sequence_request_count):
+def load_thread(
+    name,
+    seed,
+    correlation_id_base,
+    test_case_count,
+    failed_test_case_count,
+    sequence_request_count,
+):
     # Thread responsible for generating sequences of inference
     # requests.
     global _thread_exceptions
 
     # Write any thread output to dedicated file
-    with open("{}.log".format(name), 'w') as out_file:
-        print("Starting thread {} with seed {}".format(name, seed),
-              file=out_file)
+    with open("{}.log".format(name), "w") as out_file:
+        print("Starting thread {} with seed {}".format(name, seed), file=out_file)
         rng = np.random.RandomState(seed)
 
-        update_counter_fn = partial(update_test_count, test_case_count,
-                                    failed_test_case_count,
-                                    sequence_request_count)
+        update_counter_fn = partial(
+            update_test_count,
+            test_case_count,
+            failed_test_case_count,
+            sequence_request_count,
+        )
         pa_start_seq_id = correlation_id_base
         pa_end_seq_id = correlation_id_base + CORRELATION_ID_BLOCK_SIZE
 
         # Create PerfAnalyzerScenario with no additional trial,
         # the default model 'resnet', more compute intense than the simple
         # models, will be the only choice for generating load
-        ss = ScenarioSelector([
-            (1,
-             PerfAnalyzerScenario(
-                 name,
-                 rng, [], [],
-                 sequence_id_range=(pa_start_seq_id, pa_end_seq_id),
-                 verbose=FLAGS.verbose,
-                 out_stream=out_file)),
-        ], rng)
+        ss = ScenarioSelector(
+            [
+                (
+                    1,
+                    PerfAnalyzerScenario(
+                        name,
+                        rng,
+                        [],
+                        [],
+                        sequence_id_range=(pa_start_seq_id, pa_end_seq_id),
+                        verbose=FLAGS.verbose,
+                        out_stream=out_file,
+                    ),
+                ),
+            ],
+            rng,
+        )
 
         while not STOP_STRESS_THREAD:
             scenario = ss.get_scenario()
@@ -297,8 +358,9 @@ def load_thread(name, seed, correlation_id_base, test_case_count,
                 update_counter_fn(scenario.scenario_name(), False)
                 _thread_exceptions_mutex.acquire()
                 try:
-                    _thread_exceptions.append((name, scenario.scenario_name(),
-                                               traceback.format_exc()))
+                    _thread_exceptions.append(
+                        (name, scenario.scenario_name(), traceback.format_exc())
+                    )
                 finally:
                     _thread_exceptions_mutex.release()
 
@@ -333,47 +395,45 @@ def accumulate_count(dict_list, test_case_name):
     return count
 
 
-def generate_report(elapsed_time, _test_case_count, _failed_test_case_count,
-                    _sequence_request_count):
+def generate_report(
+    elapsed_time, _test_case_count, _failed_test_case_count, _sequence_request_count
+):
     hrs = elapsed_time // 3600
     mins = (elapsed_time / 60) % 60
     secs = elapsed_time % 60
 
     test_case_description = {
-        'SequenceValidScenario':
-            'Send a sequence with "start" and "end" flags.',
-        'SequenceValidValidScenario':
-            'Send two sequences back to back using the same correlation ID'
-            ' with "start" and "end" flags.',
-        'SequenceValidNoEndScenario':
-            'Send two sequences back to back using the same correlation ID.'
-            ' The first with "start" and "end" flags, and the second with no'
-            ' "end" flag.',
-        'SequenceNoStartScenario':
-            'Send a sequence without a "start" flag. Sequence should get an'
-            ' error from the server.',
-        'SequenceNoEndScenario':
-            'Send a sequence with "start" flag but that never ends. The'
-            ' sequence should be aborted by the server and its slot reused'
-            ' for another sequence.',
-        'TimeoutScenario':
-            'Expect an exception for small timeout values.',
-        'ResNetScenario':
-            'Send a request using resnet model.',
-        'CrashingScenario':
-            'Client crashes in the middle of inferences.',
-        'PerfAnalyzerScenario':
-            'Client that maintains a specific load.',
+        "SequenceValidScenario": 'Send a sequence with "start" and "end" flags.',
+        "SequenceValidValidScenario": "Send two sequences back to back using the same correlation ID"
+        ' with "start" and "end" flags.',
+        "SequenceValidNoEndScenario": "Send two sequences back to back using the same correlation ID."
+        ' The first with "start" and "end" flags, and the second with no'
+        ' "end" flag.',
+        "SequenceNoStartScenario": 'Send a sequence without a "start" flag. Sequence should get an'
+        " error from the server.",
+        "SequenceNoEndScenario": 'Send a sequence with "start" flag but that never ends. The'
+        " sequence should be aborted by the server and its slot reused"
+        " for another sequence.",
+        "TimeoutScenario": "Expect an exception for small timeout values.",
+        "ResNetScenario": "Send a request using resnet model.",
+        "CrashingScenario": "Client crashes in the middle of inferences.",
+        "PerfAnalyzerScenario": "Client that maintains a specific load.",
     }
 
     f = open("stress_report.txt", "w")
-    f.write("Test Duration: {:0>2}:{:0>2}:{:0>2} (HH:MM:SS)\n".format(
-        int(hrs), int(mins), int(secs)))
+    f.write(
+        "Test Duration: {:0>2}:{:0>2}:{:0>2} (HH:MM:SS)\n".format(
+            int(hrs), int(mins), int(secs)
+        )
+    )
 
     t = prettytable.PrettyTable(hrules=prettytable.ALL)
     t.field_names = [
-        'Test Case', 'Number of Failures', 'Test Count', 'Request Count',
-        'Test Case Description'
+        "Test Case",
+        "Number of Failures",
+        "Test Count",
+        "Request Count",
+        "Test Case Description",
     ]
 
     t.align["Test Case"] = "l"
@@ -389,33 +449,38 @@ def generate_report(elapsed_time, _test_case_count, _failed_test_case_count,
     for c in test_case_description:
         # Accumulate all the individual thread counts
         acc_test_case_count[c] = accumulate_count(_test_case_count, c)
-        acc_failed_test_case_count[c] = accumulate_count(
-            _failed_test_case_count, c)
-        acc_sequence_request_count[c] = accumulate_count(
-            _sequence_request_count, c)
+        acc_failed_test_case_count[c] = accumulate_count(_failed_test_case_count, c)
+        acc_sequence_request_count[c] = accumulate_count(_sequence_request_count, c)
 
         description = test_case_description[c]
         # Add additional description on scenarios that allow failure
         if c in ALLOW_FAILURE_SCENARIO:
-            description += " Note that this scenario is marked to allow " \
-                           "failure due to subtle edge cases that will be " \
-                           "investigated in the future. However, only a " \
-                           "minimal failure count is expected and we should " \
-                           "take action if the number is concerning."
-        t.add_row([
-            c, acc_failed_test_case_count[c] if c in acc_failed_test_case_count
-            else 0, acc_test_case_count[c] if c in acc_test_case_count else 0,
-            acc_sequence_request_count[c]
-            if c in acc_sequence_request_count else 0,
-            format_content(description, 50)
-        ])
-
-    t.add_row([
-        'TOTAL',
-        sum(acc_failed_test_case_count.values()),
-        sum(acc_test_case_count.values()),
-        sum(acc_sequence_request_count.values()), 'X'
-    ])
+            description += (
+                " Note that this scenario is marked to allow "
+                "failure due to subtle edge cases that will be "
+                "investigated in the future. However, only a "
+                "minimal failure count is expected and we should "
+                "take action if the number is concerning."
+            )
+        t.add_row(
+            [
+                c,
+                acc_failed_test_case_count[c] if c in acc_failed_test_case_count else 0,
+                acc_test_case_count[c] if c in acc_test_case_count else 0,
+                acc_sequence_request_count[c] if c in acc_sequence_request_count else 0,
+                format_content(description, 50),
+            ]
+        )
+
+    t.add_row(
+        [
+            "TOTAL",
+            sum(acc_failed_test_case_count.values()),
+            sum(acc_test_case_count.values()),
+            sum(acc_sequence_request_count.values()),
+            "X",
+        ]
+    )
 
     print(t)
     f.write(str(t))
@@ -423,43 +488,48 @@ def generate_report(elapsed_time, _test_case_count, _failed_test_case_count,
     f.close()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-v',
-                        '--verbose',
-                        action="store_true",
-                        required=False,
-                        default=False,
-                        help='Enable verbose output')
-    parser.add_argument('-r',
-                        '--random-seed',
-                        type=int,
-                        required=False,
-                        help='Random seed.')
-    parser.add_argument('-t',
-                        '--concurrency',
-                        type=int,
-                        required=False,
-                        default=8,
-                        help='Request concurrency. Default is 8.')
-    parser.add_argument('--load-thread',
-                        type=int,
-                        required=False,
-                        default=0,
-                        help='Number of dedicated threads that keep compute '
-                        'device (i.e. GPU/CPUs) under load. The load generated '
-                        'from "--concurrency" often behaves as request spike, '
-                        ' this argument may be used to produce consistent load '
-                        ' to keep devices at high utilization. Default is 0, '
-                        'which means no dedicated load thread will be created.')
     parser.add_argument(
-        '-d',
-        '--test-duration',
+        "-v",
+        "--verbose",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Enable verbose output",
+    )
+    parser.add_argument(
+        "-r", "--random-seed", type=int, required=False, help="Random seed."
+    )
+    parser.add_argument(
+        "-t",
+        "--concurrency",
+        type=int,
+        required=False,
+        default=8,
+        help="Request concurrency. Default is 8.",
+    )
+    parser.add_argument(
+        "--load-thread",
+        type=int,
+        required=False,
+        default=0,
+        help="Number of dedicated threads that keep compute "
+        "device (i.e. GPU/CPUs) under load. The load generated "
+        'from "--concurrency" often behaves as request spike, '
+        " this argument may be used to produce consistent load "
+        " to keep devices at high utilization. Default is 0, "
+        "which means no dedicated load thread will be created.",
+    )
+    parser.add_argument(
+        "-d",
+        "--test-duration",
         type=int,
         required=False,
         default=25000,
-        help='Duration of stress test to run. Default is 25000 seconds ' +
-        '(approximately 7 hours).')
+        help="Duration of stress test to run. Default is 25000 seconds "
+        + "(approximately 7 hours).",
+    )
     FLAGS = parser.parse_args()
 
     # Initialize the random seed. For reproducibility each thread
@@ -476,9 +546,7 @@ def generate_report(elapsed_time, _test_case_count, _failed_test_case_count,
     print("test duration = {}".format(FLAGS.test_duration))
 
     # Create hashes for each thread for generating report
-    _test_case_count = [
-        dict() for _ in range(FLAGS.concurrency + FLAGS.load_thread)
-    ]
+    _test_case_count = [dict() for _ in range(FLAGS.concurrency + FLAGS.load_thread)]
     _failed_test_case_count = [
         dict() for _ in range(FLAGS.concurrency + FLAGS.load_thread)
     ]
@@ -501,11 +569,18 @@ def generate_report(elapsed_time, _test_case_count, _failed_test_case_count,
         correlation_id_base = 1 + (idx * CORRELATION_ID_BLOCK_SIZE)
 
         threads.append(
-            threading.Thread(target=stress_thread,
-                             args=(thread_name, seed, correlation_id_base,
-                                   _test_case_count[idx],
-                                   _failed_test_case_count[idx],
-                                   _sequence_request_count[idx])))
+            threading.Thread(
+                target=stress_thread,
+                args=(
+                    thread_name,
+                    seed,
+                    correlation_id_base,
+                    _test_case_count[idx],
+                    _failed_test_case_count[idx],
+                    _sequence_request_count[idx],
+                ),
+            )
+        )
 
     for idx in range(FLAGS.load_thread):
         thread_name = "load_thread_{}".format(idx)
@@ -518,14 +593,22 @@ def generate_report(elapsed_time, _test_case_count, _failed_test_case_count,
         # Each thread is reserved a block of correlation IDs or size
         # CORRELATION_ID_BLOCK_SIZE
         correlation_id_base = 1 + (
-            (FLAGS.concurrency + idx) * CORRELATION_ID_BLOCK_SIZE)
+            (FLAGS.concurrency + idx) * CORRELATION_ID_BLOCK_SIZE
+        )
 
         threads.append(
-            threading.Thread(target=load_thread,
-                             args=(thread_name, seed, correlation_id_base,
-                                   _test_case_count[idx],
-                                   _failed_test_case_count[idx],
-                                   _sequence_request_count[idx])))
+            threading.Thread(
+                target=load_thread,
+                args=(
+                    thread_name,
+                    seed,
+                    correlation_id_base,
+                    _test_case_count[idx],
+                    _failed_test_case_count[idx],
+                    _sequence_request_count[idx],
+                ),
+            )
+        )
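
A worked example of the correlation ID partitioning used above, with illustrative values for --concurrency and --load-thread:

    CORRELATION_ID_BLOCK_SIZE = 1024 * 1024

    concurrency, load_threads = 8, 2
    stress_bases = [1 + i * CORRELATION_ID_BLOCK_SIZE for i in range(concurrency)]
    load_bases = [
        1 + (concurrency + i) * CORRELATION_ID_BLOCK_SIZE for i in range(load_threads)
    ]

    print(stress_bases[0], stress_bases[1])  # 1, 1048577
    print(load_bases[0])                     # 1 + 8 * 1048576 = 8388609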
 
     exit_code = 0
 
@@ -551,15 +634,18 @@ def generate_report(elapsed_time, _test_case_count, _failed_test_case_count,
         if t.is_alive() and (exit_code == 0):
             exit_code = 1
 
-    generate_report(time.time() - start_time, _test_case_count,
-                    _failed_test_case_count, _sequence_request_count)
+    generate_report(
+        time.time() - start_time,
+        _test_case_count,
+        _failed_test_case_count,
+        _sequence_request_count,
+    )
 
     _thread_exceptions_mutex.acquire()
     try:
         if len(_thread_exceptions) > 0:
             for thread, scenario, ex in _thread_exceptions:
-                print("*********\n* {} {}\n{}*********\n".format(
-                    thread, scenario, ex))
+                print("*********\n* {} {}\n{}*********\n".format(thread, scenario, ex))
                 if scenario not in ALLOW_FAILURE_SCENARIO:
                     exit_code = 1
     finally:
diff --git a/qa/L0_long_running_stress/stress_mail.py b/qa/L0_long_running_stress/stress_mail.py
old mode 100644
new mode 100755
index e240e2a354..36f347c2ac
--- a/qa/L0_long_running_stress/stress_mail.py
+++ b/qa/L0_long_running_stress/stress_mail.py
@@ -30,21 +30,33 @@
 sys.path.append("../common")
 
 import os
-import nightly_email_helper
-
 from datetime import date
 
-CI_JOB_ID = os.environ.get('CI_JOB_ID', '')
+import nightly_email_helper
+
+CI_JOB_ID = os.environ.get("CI_JOB_ID", "")
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     today = date.today().strftime("%Y-%m-%d")
-    subject = "Triton Long-Running Stress Test " + \
-        ((sys.argv[1] + " ") if len(sys.argv) >= 2 else "") + "Summary: " + today
+    subject = (
+        "Triton Long-Running Stress Test "
+        + ((sys.argv[1] + " ") if len(sys.argv) >= 2 else "")
+        + "Summary: "
+        + today
+    )
     stress_report = "stress_report.txt"
     link = "https://gitlab-master.nvidia.com/dl/dgx/tritonserver/-/jobs/" + CI_JOB_ID
     write_up = "

The table below includes results from long-running stress test. Please refer to the description of each test case to see what different kinds of inference requests were sent. Request concurrency is set to 8.

" - write_up += "

Please check the CI output webpage for the details of the failures: " + link + "

" - html_content = "
" + write_up + "
"
+    write_up += (
+        "

Please check the CI output webpage for the details of the failures: " + + link + + "

" + ) + html_content = ( + '
'
+        + write_up
+        + '
'
+    )
     with open(stress_report, "r") as f:
         html_content += f.read() + "\n"
     html_content += "
" diff --git a/qa/L0_memory/test.sh b/qa/L0_memory/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_memory_growth/busy_op_test.py b/qa/L0_memory_growth/busy_op_test.py old mode 100644 new mode 100755 index 537c328047..2814f38d8c --- a/qa/L0_memory_growth/busy_op_test.py +++ b/qa/L0_memory_growth/busy_op_test.py @@ -27,56 +27,63 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse -import numpy as np from builtins import range + +import numpy as np import tritongrpcclient as grpcclient import tritonhttpclient as httpclient from tritonclientutils import np_to_triton_dtype FLAGS = None -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - default='localhost:8000', - help='Inference server URL. Default is localhost:8000.') parser.add_argument( - '-i', - '--protocol', + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-u", + "--url", type=str, required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') - parser.add_argument('-m', - '--model', - type=str, - required=True, - help='Name of model.') - parser.add_argument('-n', - '--num-requests', - type=int, - required=True, - help='Number of asynchronous requests to launch.') - parser.add_argument('-d', - '--delay', - type=int, - required=True, - help='Number of delay cycles to use as input to model.') + default="localhost:8000", + help="Inference server URL. Default is localhost:8000.", + ) + parser.add_argument( + "-i", + "--protocol", + type=str, + required=False, + default="http", + help='Protocol ("http"/"grpc") used to ' + + 'communicate with inference service. 
Default is "http".', + ) + parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.") + parser.add_argument( + "-n", + "--num-requests", + type=int, + required=True, + help="Number of asynchronous requests to launch.", + ) + parser.add_argument( + "-d", + "--delay", + type=int, + required=True, + help="Number of delay cycles to use as input to model.", + ) FLAGS = parser.parse_args() if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) + print( + 'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol) + ) exit(1) client_util = httpclient if FLAGS.protocol == "http" else grpcclient @@ -94,8 +101,9 @@ input_data = np.array([FLAGS.delay], dtype=np.int32) inputs = [ - client_util.InferInput("in", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + client_util.InferInput( + "in", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) diff --git a/qa/L0_memory_growth/server_memory_mail.py b/qa/L0_memory_growth/server_memory_mail.py old mode 100644 new mode 100755 index ef57c8732e..d1307d97a6 --- a/qa/L0_memory_growth/server_memory_mail.py +++ b/qa/L0_memory_growth/server_memory_mail.py @@ -29,19 +29,23 @@ sys.path.append("../common") -import nightly_email_helper - import glob from datetime import date -if __name__ == '__main__': +import nightly_email_helper + +if __name__ == "__main__": today = date.today().strftime("%Y-%m-%d") subject = "Triton Server Memory Growth " + sys.argv[1] + " Summary: " + today memory_graphs_resnet = glob.glob("memory_growth_resnet*.log") memory_graphs_busyop = glob.glob("memory_growth_busyop.log") write_up = "

This test uses perf_analyzer as clients running on 4 different models. The max allowed difference between mean and maximum memory usage is set to 150MB."
     write_up += "• What to look for A linear memory growth in the beginning of the graph is acceptable only when it is followed by a flat memory usage. If a linear memory growth is observed during the entire test then there is possibly a memory leak."
-    html_content = "" + write_up + ""
+    html_content = (
+        ''
+        + write_up
+        + ''
+    )
     for mem_graph in sorted(memory_graphs_resnet):
         html_content += "\n" + mem_graph + "\n"
         with open(mem_graph, "r") as f:
@@ -52,12 +56,18 @@
     # When we see PTX failures in CI, the busyop memory graph is not created.
     if len(memory_graphs_busyop):
         write_up = "

• What to look for
The memory usage should increase continually over time, and a linear growth should be observed in the graph below.

" - html_content += "
" + write_up + "
"
+        html_content += (
+            '
'
+            + write_up
+            + '
'
+        )
         for mem_graph in sorted(memory_graphs_busyop):
             html_content += "\n" + mem_graph + "\n"
             with open(mem_graph, "r") as f:
                 html_content += f.read() + "\n"
     else:
-        html_content += "

The busyop model caused PTX failures when running the CI.

" + html_content += ( + "

The busyop model caused PTX failures when running the CI.

" + ) html_content += "
" nightly_email_helper.send(subject, html_content, is_html=True) diff --git a/qa/L0_metrics/metrics_test.py b/qa/L0_metrics/metrics_test.py index 36d732cdfa..13efdb0d10 100755 --- a/qa/L0_metrics/metrics_test.py +++ b/qa/L0_metrics/metrics_test.py @@ -27,32 +27,38 @@ import os import sys + sys.path.append("../common") -import requests import unittest + +import requests import test_util as tu INF_COUNTER_PATTERNS = [ - 'nv_inference_request_duration', 'nv_inference_queue_duration', - 'nv_inference_compute_input_duration', - 'nv_inference_compute_infer_duration', - 'nv_inference_compute_output_duration' + "nv_inference_request_duration", + "nv_inference_queue_duration", + "nv_inference_compute_input_duration", + "nv_inference_compute_infer_duration", + "nv_inference_compute_output_duration", ] INF_SUMMARY_PATTERNS = [ - 'nv_inference_request_summary', 'nv_inference_queue_summary', - 'nv_inference_compute_input_summary', 'nv_inference_compute_infer_summary', - 'nv_inference_compute_output_summary' + "nv_inference_request_summary", + "nv_inference_queue_summary", + "nv_inference_compute_input_summary", + "nv_inference_compute_infer_summary", + "nv_inference_compute_output_summary", ] CACHE_COUNTER_PATTERNS = [ - 'nv_cache_num_hits_per_model', 'nv_cache_num_misses_per_model', - 'nv_cache_hit_duration_per_model', 'nv_cache_miss_duration_per_model' + "nv_cache_num_hits_per_model", + "nv_cache_num_misses_per_model", + "nv_cache_hit_duration_per_model", + "nv_cache_miss_duration_per_model", ] -CACHE_SUMMARY_PATTERNS = ['nv_cache_hit_summary', 'nv_cache_miss_summary'] +CACHE_SUMMARY_PATTERNS = ["nv_cache_hit_summary", "nv_cache_miss_summary"] class MetricsTest(tu.TestResultCollector): - def _get_metrics(self): metrics_url = "http://localhost:8002/metrics" r = requests.get(metrics_url) @@ -111,7 +117,7 @@ def test_summaries_custom_quantiles(self): print(metrics) for quantile in quantiles: print(quantile) - self.assertIn(f"quantile=\"{quantile}\"", metrics) + self.assertIn(f'quantile="{quantile}"', metrics) # DLIS-4762: Disable request summary when caching enabled for now def test_inf_summaries_exist_with_cache(self): @@ -124,5 +130,5 @@ def test_inf_summaries_exist_with_cache(self): self.assertNotIn(metric, metrics) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_metrics/test.sh b/qa/L0_metrics/test.sh index 6ad17ec6ed..d0cf0193ae 100755 --- a/qa/L0_metrics/test.sh +++ b/qa/L0_metrics/test.sh @@ -290,7 +290,7 @@ python3 ${PYTHON_TEST} MetricsTest.test_summaries_custom_quantiles 2>&1 | tee ${ check_unit_test kill $SERVER_PID wait $SERVER_PID - + if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else diff --git a/qa/L0_mlflow/plugin_test.py b/qa/L0_mlflow/plugin_test.py old mode 100644 new mode 100755 index 65a7cbb248..a5d87a3c19 --- a/qa/L0_mlflow/plugin_test.py +++ b/qa/L0_mlflow/plugin_test.py @@ -30,49 +30,49 @@ sys.path.append("../common") +import json import unittest + +import numpy as np import test_util as tu from mlflow.deployments import get_deploy_client -import json -import numpy as np class PluginTest(tu.TestResultCollector): - def setUp(self): - self.client_ = get_deploy_client('triton') + self.client_ = get_deploy_client("triton") def _validate_deployment(self, model_name): # create - self.client_.create_deployment(model_name, - "models:/{}/1".format(model_name), - flavor="onnx") + self.client_.create_deployment( + model_name, "models:/{}/1".format(model_name), flavor="onnx" + ) # list deployment_list = 
self.client_.list_deployments() self.assertEqual(len(deployment_list), 1) - self.assertEqual(deployment_list[0]['name'], model_name) + self.assertEqual(deployment_list[0]["name"], model_name) # get deployment = self.client_.get_deployment(model_name) - self.assertEqual(deployment['name'], model_name) + self.assertEqual(deployment["name"], model_name) # predict inputs = {} with open("./mlflow-triton-plugin/examples/input.json", "r") as f: input_json = json.load(f) - for key, value in input_json['inputs'].items(): + for key, value in input_json["inputs"].items(): inputs[key] = np.array(value, dtype=np.float32) output = self.client_.predict(model_name, inputs) - with open("./mlflow-triton-plugin/examples/expected_output.json", - "r") as f: + with open("./mlflow-triton-plugin/examples/expected_output.json", "r") as f: output_json = json.load(f) - for key, value in output_json['outputs'].items(): + for key, value in output_json["outputs"].items(): np.testing.assert_allclose( - output['outputs'][key], + output["outputs"][key], np.array(value, dtype=np.int32), - err_msg='Inference result is not correct') + err_msg="Inference result is not correct", + ) # delete self.client_.delete_deployment(model_name) @@ -81,13 +81,12 @@ def test_onnx_flavor(self): # Log the ONNX model to MLFlow import mlflow.onnx import onnx + model = onnx.load( "./mlflow-triton-plugin/examples/onnx_float32_int32_int32/1/model.onnx" ) # Use a different name to ensure the plugin operates on correct model - mlflow.onnx.log_model(model, - "triton", - registered_model_name="onnx_model") + mlflow.onnx.log_model(model, "triton", registered_model_name="onnx_model") self._validate_deployment("onnx_model") @@ -95,24 +94,28 @@ def test_onnx_flavor_with_files(self): # Log the ONNX model and additional Triton config file to MLFlow import mlflow.onnx import onnx + model = onnx.load( "./mlflow-triton-plugin/examples/onnx_float32_int32_int32/1/model.onnx" ) - config_path = "./mlflow-triton-plugin/examples/onnx_float32_int32_int32/config.pbtxt" + config_path = ( + "./mlflow-triton-plugin/examples/onnx_float32_int32_int32/config.pbtxt" + ) # Use a different name to ensure the plugin operates on correct model - mlflow.onnx.log_model(model, - "triton", - registered_model_name="onnx_model_with_files") + mlflow.onnx.log_model( + model, "triton", registered_model_name="onnx_model_with_files" + ) mlflow.log_artifact(config_path, "triton") self._validate_deployment("onnx_model_with_files") # Check if the additional files are properly copied import filecmp + self.assertTrue( - filecmp.cmp(config_path, - "./models/onnx_model_with_files/config.pbtxt")) + filecmp.cmp(config_path, "./models/onnx_model_with_files/config.pbtxt") + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_mlflow/test.sh b/qa/L0_mlflow/test.sh index 2ea7980735..4b5205ba25 100755 --- a/qa/L0_mlflow/test.sh +++ b/qa/L0_mlflow/test.sh @@ -32,12 +32,12 @@ source ../common/util.sh rm -fr *.log *.json # The default version of python 3.10.6 included in -# Ubuntu 22.04 installs blinker 1.4. This doesn't -# work with the awscli which we try to install. -# Uninstalling blinker and allowing pip to install blinker 1.6 -# fixes this issue. The alternative to this is to +# Ubuntu 22.04 installs blinker 1.4. This doesn't +# work with the awscli which we try to install. +# Uninstalling blinker and allowing pip to install blinker 1.6 +# fixes this issue. 
The alternative to this is to # install a higher version of python which uses blinker 1.6, -# but it is unknown whether this test should rely on +# but it is unknown whether this test should rely on # the default installation of python. apt remove -y python3-blinker diff --git a/qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/model.py b/qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/model.py old mode 100644 new mode 100755 index 2810cd9b90..9c5e99e49e --- a/qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/conflicting_max_batch_size/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(4) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/model.py b/qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/model.py old mode 100644 new mode 100755 index 80e9f9d59c..f617ac6faf --- a/qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/conflicting_scheduler_sequence/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(4) auto_complete_model_config.set_dynamic_batching() diff --git a/qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/model.py b/qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/model.py old mode 100644 new mode 100755 index a5e02161f6..ef915705e6 --- a/qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/input_missing_datatype/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/input_missing_dims/model.py b/qa/L0_model_config/autofill_noplatform/python/input_missing_dims/model.py old mode 100644 new mode 100755 index 02a29b9a16..b5f3a0c9fc --- a/qa/L0_model_config/autofill_noplatform/python/input_missing_dims/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/input_missing_dims/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32'} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32"} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/input_missing_name/model.py b/qa/L0_model_config/autofill_noplatform/python/input_missing_name/model.py old mode 100644 new mode 100755 index 10492cc438..78ba70742c --- a/qa/L0_model_config/autofill_noplatform/python/input_missing_name/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/input_missing_name/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/input_wrong_property/model.py b/qa/L0_model_config/autofill_noplatform/python/input_wrong_property/model.py old mode 100644 new mode 100755 index 037339a091..6a83d9fcbd --- a/qa/L0_model_config/autofill_noplatform/python/input_wrong_property/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/input_wrong_property/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,18 +28,17 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} input1 = { - 'name': 'INPUT1', - 'data_type': 'TYPE_FP32', - 'dims': [4], - 'is_shape_tensor:': True + "name": "INPUT1", + "data_type": "TYPE_FP32", + "dims": [4], + "is_shape_tensor:": True, } - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/no_return/model.py b/qa/L0_model_config/autofill_noplatform/python/no_return/model.py old mode 100644 new mode 100755 index 5c90b2bcfb..6bb52bc152 --- a/qa/L0_model_config/autofill_noplatform/python/no_return/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/no_return/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/model.py b/qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/model.py old mode 100644 new mode 100755 index e1af57e747..64a08ca859 --- a/qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/output_missing_datatype/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/output_missing_dims/model.py b/qa/L0_model_config/autofill_noplatform/python/output_missing_dims/model.py old mode 100644 new mode 100755 index 88294cdb97..0ee2d01f1a --- a/qa/L0_model_config/autofill_noplatform/python/output_missing_dims/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/output_missing_dims/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32'} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32"} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/output_missing_name/model.py b/qa/L0_model_config/autofill_noplatform/python/output_missing_name/model.py old mode 100644 new mode 100755 index 130e854e05..12c777c613 --- a/qa/L0_model_config/autofill_noplatform/python/output_missing_name/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/output_missing_name/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform/python/output_wrong_property/model.py b/qa/L0_model_config/autofill_noplatform/python/output_wrong_property/model.py old mode 100644 new mode 100755 index 4d3298f866..40874ab404 --- a/qa/L0_model_config/autofill_noplatform/python/output_wrong_property/model.py +++ b/qa/L0_model_config/autofill_noplatform/python/output_wrong_property/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,17 +28,16 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} output1 = { - 'name': 'OUTPUT1', - 'data_type': 'TYPE_FP32', - 'dims': [4], - 'is_shape_tensor:': True + "name": "OUTPUT1", + "data_type": "TYPE_FP32", + "dims": [4], + "is_shape_tensor:": True, } auto_complete_model_config.set_max_batch_size(0) diff --git a/qa/L0_model_config/autofill_noplatform_success/onnx/cpu_instance/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/onnx/cpu_instance/config.pbtxt old mode 100755 new mode 100644 diff --git a/qa/L0_model_config/autofill_noplatform_success/openvino/partial_config/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/openvino/partial_config/config.pbtxt old mode 100755 new mode 100644 diff --git a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/model.py b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/model.py old mode 100644 new mode 100755 index 723c343702..14ca01ee47 --- a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/model.py +++ b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/conflicting_scheduler_ensemble/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,11 +28,10 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(4) auto_complete_model_config.set_dynamic_batching() diff --git a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_first_step/model.py b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_first_step/model.py old mode 100644 new mode 100755 index 723c343702..14ca01ee47 --- a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_first_step/model.py +++ b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_first_step/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,11 +28,10 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(4) auto_complete_model_config.set_dynamic_batching() diff --git a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_second_step/model.py b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_second_step/model.py old mode 100644 new mode 100755 index 723c343702..14ca01ee47 --- a/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_second_step/model.py +++ b/qa/L0_model_config/autofill_noplatform_success/python/conflicting_scheduler_ensemble/ensemble_second_step/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,11 +28,10 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(4) auto_complete_model_config.set_dynamic_batching() diff --git a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/model.py b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/model.py old mode 100644 new mode 100755 index 80e9f9d59c..f617ac6faf --- a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/model.py +++ b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(4) auto_complete_model_config.set_dynamic_batching() diff --git a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/model.py b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/model.py old mode 100644 new mode 100755 index 80e9f9d59c..f617ac6faf --- a/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/model.py +++ b/qa/L0_model_config/autofill_noplatform_success/python/dynamic_batching_no_op/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,13 +28,12 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - input1 = {'name': 'INPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + input1 = {"name": "INPUT1", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(4) auto_complete_model_config.set_dynamic_batching() diff --git a/qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/model.py b/qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/model.py old mode 100644 new mode 100755 index fc150ff497..e951a2ef35 --- a/qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/model.py +++ b/qa/L0_model_config/autofill_noplatform_success/python/incomplete_input/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -26,12 +28,11 @@ class TritonPythonModel: - @staticmethod def auto_complete_config(auto_complete_model_config): - input0 = {'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output0 = {'name': 'OUTPUT0', 'data_type': 'TYPE_FP32', 'dims': [4]} - output1 = {'name': 'OUTPUT1', 'data_type': 'TYPE_FP32', 'dims': [4]} + input0 = {"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output0 = {"name": "OUTPUT0", "data_type": "TYPE_FP32", "dims": [4]} + output1 = {"name": "OUTPUT1", "data_type": "TYPE_FP32", "dims": [4]} auto_complete_model_config.set_max_batch_size(0) auto_complete_model_config.add_input(input0) diff --git a/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/config.pbtxt b/qa/L0_model_config/autofill_noplatform_success/tensorflow_savedmodel/reshape_config_provided/config.pbtxt old mode 100755 new mode 100644 diff --git a/qa/L0_model_config/compare_status.py b/qa/L0_model_config/compare_status.py old mode 100644 new mode 100755 index f1548e6de4..dbed05772a --- a/qa/L0_model_config/compare_status.py +++ b/qa/L0_model_config/compare_status.py @@ -1,4 +1,6 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,43 +27,46 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import argparse +import json import os import sys -import json + import tritonclient.grpc as grpcclient +import tritonclient.grpc.model_config_pb2 as mc import tritonclient.http as httpclient +from google.protobuf import json_format, text_format from tritonclient.utils import * -from google.protobuf import text_format -from google.protobuf import json_format -import tritonclient.grpc.model_config_pb2 as mc FLAGS = None -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--expected_dir', - type=str, - required=True, - help='Directory containing expected output files') - parser.add_argument('--model', type=str, required=True, help='Model name') + parser.add_argument( + "--expected_dir", + type=str, + required=True, + help="Directory containing expected output files", + ) + parser.add_argument("--model", type=str, required=True, help="Model name") FLAGS, unparsed = parser.parse_known_args() for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: model_name = FLAGS.model if pair[1] == "http": - triton_client = httpclient.InferenceServerClient(url=pair[0], - verbose=False) + triton_client = httpclient.InferenceServerClient(url=pair[0], verbose=False) model_config = triton_client.get_model_config(model_name) else: - triton_client = grpcclient.InferenceServerClient(url=pair[0], - verbose=False) + triton_client = grpcclient.InferenceServerClient(url=pair[0], verbose=False) model_config = triton_client.get_model_config(model_name) nonmatch = list() expected_files = [ - f for f in os.listdir(FLAGS.expected_dir) - if (os.path.isfile(os.path.join(FLAGS.expected_dir, f)) and - (f.startswith("expected"))) + f + for f in os.listdir(FLAGS.expected_dir) + if ( + os.path.isfile(os.path.join(FLAGS.expected_dir, f)) + and (f.startswith("expected")) + ) ] for efile in expected_files: with open(os.path.join(FLAGS.expected_dir, efile)) as f: @@ -69,8 +74,8 @@ if pair[1] == "http": config_json = json.loads( - json_format.MessageToJson(config, - preserving_proto_field_name=True)) + json_format.MessageToJson(config, preserving_proto_field_name=True) + ) if config_json == model_config: sys.exit(0) else: diff --git a/qa/L0_model_config/noautofill_test.py b/qa/L0_model_config/noautofill_test.py old mode 100644 new mode 100755 index 926e4d850e..d89e306eb8 --- a/qa/L0_model_config/noautofill_test.py +++ b/qa/L0_model_config/noautofill_test.py @@ -30,13 +30,13 @@ sys.path.append("../common") import unittest + import test_util as tu import tritonclient.http as httpclient from tritonclient.utils import InferenceServerException class NoAutoFillTest(tu.TestResultCollector): - def setUp(self): self._model_name = "noautofill_noconfig" self._triton_client = httpclient.InferenceServerClient("localhost:8000") @@ -45,12 +45,12 @@ def tearDown(self): self._triton_client.unload_model(self._model_name) def test_load_no_autofill_model_with_config(self): - config = "{\"max_batch_size\":\"16\"}" + config = '{"max_batch_size":"16"}' self._triton_client.load_model(self._model_name, config=config) # Check if the model config is correct model_config = self._triton_client.get_model_config(self._model_name) - self.assertEqual(model_config['max_batch_size'], 16) + self.assertEqual(model_config["max_batch_size"], 16) def test_load_no_autofill_model_with_no_config(self): with self.assertRaises(InferenceServerException) as ex: @@ -58,5 +58,5 @@ def test_load_no_autofill_model_with_no_config(self): self.assertIn("model configuration is not provided", str(ex.exception)) -if 
__name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_model_config/test.sh b/qa/L0_model_config/test.sh index 8b1a318c51..f3bc98fe87 100755 --- a/qa/L0_model_config/test.sh +++ b/qa/L0_model_config/test.sh @@ -291,14 +291,14 @@ cp /data/inferenceserver/${REPO_VERSION}/qa_model_repository/openvino_int8_int8_ rm -f $SERVER_LOG_BASE* $CLIENT_LOG RET=0 -# Run tests for logs which do not have a timestamp on them +# Run tests for logs which do not have a timestamp on them for TARGET in `ls cli_messages`; do case $TARGET in "cli_override") EXTRA_ARGS="--disable-auto-complete-config --strict-model-config=false" ;; - "cli_deprecation") + "cli_deprecation") EXTRA_ARGS="--strict-model-config=true" ;; - *) + *) EXTRA_ARGS="" ;; esac diff --git a/qa/L0_model_namespacing/python_addsub/__init__.py b/qa/L0_model_namespacing/python_addsub/__init__.py old mode 100644 new mode 100755 index e14880ceba..a664eafef0 --- a/qa/L0_model_namespacing/python_addsub/__init__.py +++ b/qa/L0_model_namespacing/python_addsub/__init__.py @@ -1,4 +1,6 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,8 +26,9 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import numpy as np import json + +import numpy as np import triton_python_backend_utils as pb_utils @@ -36,67 +39,85 @@ class TritonPythonModel: def auto_complete_config(auto_complete_model_config): # Only use packaged config if config is not explicitly provided config = auto_complete_model_config.as_dict() - if (len(config['input']) != 0) or (len(config['output']) != 0): + if (len(config["input"]) != 0) or (len(config["output"]) != 0): return auto_complete_model_config - auto_complete_model_config.add_input({ - 'name': 'INPUT0', - 'data_type': 'TYPE_INT32', - 'dims': [16,] - }) - auto_complete_model_config.add_input({ - 'name': 'INPUT1', - 'data_type': 'TYPE_INT32', - 'dims': [16,] - }) - auto_complete_model_config.add_output({ - 'name': 'OUTPUT0', - 'data_type': 'TYPE_INT32', - 'dims': [16,] - }) - auto_complete_model_config.add_output({ - 'name': 'OUTPUT1', - 'data_type': 'TYPE_INT32', - 'dims': [16,] - }) + auto_complete_model_config.add_input( + { + "name": "INPUT0", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_input( + { + "name": "INPUT1", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_output( + { + "name": "OUTPUT0", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_output( + { + "name": "OUTPUT1", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) return auto_complete_model_config def initialize(self, args): - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT0") - output1_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT1") + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") self.output0_dtype = pb_utils.triton_string_to_numpy( - 
output0_config['data_type']) + output0_config["data_type"] + ) self.output1_dtype = pb_utils.triton_string_to_numpy( - output1_config['data_type']) + output1_config["data_type"] + ) def execute(self, requests): - """ This function is called on inference request. - """ + """This function is called on inference request.""" responses = [] for request in requests: in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") - responses.append(pb_utils.InferenceResponse(self.addsub(in_0, - in_1))) + responses.append(pb_utils.InferenceResponse(self.addsub(in_0, in_1))) return responses def addsub(self, in_0, in_1): - if in_0.as_numpy().dtype.type is np.bytes_ or in_0.as_numpy( - ).dtype == np.object_: - out_0, out_1 = (in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32),\ - in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32)) + if ( + in_0.as_numpy().dtype.type is np.bytes_ + or in_0.as_numpy().dtype == np.object_ + ): + out_0, out_1 = ( + in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32), + in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32), + ) else: - out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(), - in_0.as_numpy() - in_1.as_numpy()) + out_0, out_1 = ( + in_0.as_numpy() + in_1.as_numpy(), + in_0.as_numpy() - in_1.as_numpy(), + ) - out_tensor_0 = pb_utils.Tensor("OUTPUT0", - out_0.astype(self.output0_dtype)) - out_tensor_1 = pb_utils.Tensor("OUTPUT1", - out_1.astype(self.output1_dtype)) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(self.output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(self.output1_dtype)) return [out_tensor_0, out_tensor_1] diff --git a/qa/L0_model_namespacing/python_subadd/__init__.py b/qa/L0_model_namespacing/python_subadd/__init__.py old mode 100644 new mode 100755 index 6d38542bf0..bd3ddefe9e --- a/qa/L0_model_namespacing/python_subadd/__init__.py +++ b/qa/L0_model_namespacing/python_subadd/__init__.py @@ -1,4 +1,6 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,8 +26,9 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import numpy as np import json + +import numpy as np import triton_python_backend_utils as pb_utils @@ -36,67 +39,85 @@ class TritonPythonModel: def auto_complete_config(auto_complete_model_config): # Only use packaged config if config is not explicitly provided config = auto_complete_model_config.as_dict() - if (len(config['input']) != 0) or (len(config['output']) != 0): + if (len(config["input"]) != 0) or (len(config["output"]) != 0): return auto_complete_model_config - auto_complete_model_config.add_input({ - 'name': 'INPUT0', - 'data_type': 'TYPE_INT32', - 'dims': [16,] - }) - auto_complete_model_config.add_input({ - 'name': 'INPUT1', - 'data_type': 'TYPE_INT32', - 'dims': [16,] - }) - auto_complete_model_config.add_output({ - 'name': 'OUTPUT0', - 'data_type': 'TYPE_INT32', - 'dims': [16,] - }) - auto_complete_model_config.add_output({ - 'name': 'OUTPUT1', - 'data_type': 'TYPE_INT32', - 'dims': [16,] - }) + auto_complete_model_config.add_input( + { + "name": "INPUT0", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_input( + { + "name": "INPUT1", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_output( + { + "name": "OUTPUT0", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) + auto_complete_model_config.add_output( + { + "name": "OUTPUT1", + "data_type": "TYPE_INT32", + "dims": [ + 16, + ], + } + ) return auto_complete_model_config def initialize(self, args): - self.model_config = model_config = json.loads(args['model_config']) + self.model_config = model_config = json.loads(args["model_config"]) - output0_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT0") - output1_config = pb_utils.get_output_config_by_name( - model_config, "OUTPUT1") + output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0") + output1_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT1") self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) + output0_config["data_type"] + ) self.output1_dtype = pb_utils.triton_string_to_numpy( - output1_config['data_type']) + output1_config["data_type"] + ) def execute(self, requests): - """ This function is called on inference request. 
- """ + """This function is called on inference request.""" responses = [] for request in requests: in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0") in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1") - responses.append(pb_utils.InferenceResponse(self.subadd(in_0, - in_1))) + responses.append(pb_utils.InferenceResponse(self.subadd(in_0, in_1))) return responses def subadd(self, in_0, in_1): - if in_0.as_numpy().dtype.type is np.bytes_ or in_0.as_numpy( - ).dtype == np.object_: - out_0, out_1 = (in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32),\ - in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32)) + if ( + in_0.as_numpy().dtype.type is np.bytes_ + or in_0.as_numpy().dtype == np.object_ + ): + out_0, out_1 = ( + in_0.as_numpy().astype(np.int32) - in_1.as_numpy().astype(np.int32), + in_0.as_numpy().astype(np.int32) + in_1.as_numpy().astype(np.int32), + ) else: - out_0, out_1 = (in_0.as_numpy() - in_1.as_numpy(), - in_0.as_numpy() + in_1.as_numpy()) + out_0, out_1 = ( + in_0.as_numpy() - in_1.as_numpy(), + in_0.as_numpy() + in_1.as_numpy(), + ) - out_tensor_0 = pb_utils.Tensor("OUTPUT0", - out_0.astype(self.output0_dtype)) - out_tensor_1 = pb_utils.Tensor("OUTPUT1", - out_1.astype(self.output1_dtype)) + out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(self.output0_dtype)) + out_tensor_1 = pb_utils.Tensor("OUTPUT1", out_1.astype(self.output1_dtype)) return [out_tensor_0, out_tensor_1] diff --git a/qa/L0_model_namespacing/test.py b/qa/L0_model_namespacing/test.py old mode 100644 new mode 100755 index 9de6ac749c..f45300d4fd --- a/qa/L0_model_namespacing/test.py +++ b/qa/L0_model_namespacing/test.py @@ -25,17 +25,17 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import sys import os +import sys sys.path.append(os.path.join(os.environ["TRITON_QA_ROOT_DIR"], "common")) -import numpy as np -import unittest -import time import shutil -import test_util as tu +import time +import unittest +import numpy as np +import test_util as tu import tritonclient.http as httpclient from tritonclient.utils import InferenceServerException @@ -57,16 +57,14 @@ def __init__(self, checker_client=None): if checker_client is None: import tritonclient.http as checker_client if "http" in checker_client.__name__: - self.client_ = checker_client.InferenceServerClient( - "localhost:8000") + self.client_ = checker_client.InferenceServerClient("localhost:8000") else: - self.client_ = checker_client.InferenceServerClient( - "localhost:8001") + self.client_ = checker_client.InferenceServerClient("localhost:8001") # Create infer input tensors self.inputs_ = [] - self.inputs_.append(checker_client.InferInput('INPUT0', [16], "INT32")) - self.inputs_.append(checker_client.InferInput('INPUT1', [16], "INT32")) + self.inputs_.append(checker_client.InferInput("INPUT0", [16], "INT32")) + self.inputs_.append(checker_client.InferInput("INPUT1", [16], "INT32")) # Initialize the data and expected output input_data = np.arange(start=0, stop=16, dtype=np.int32) @@ -74,15 +72,17 @@ def __init__(self, checker_client=None): self.inputs_[1].set_data_from_numpy(input_data) self.expected_outputs_ = { "add": (input_data + input_data), - "sub": (input_data - input_data) + "sub": (input_data - input_data), } def infer(self, model): res = self.client_.infer(model, self.inputs_) - np.testing.assert_allclose(res.as_numpy('OUTPUT0'), - self.expected_outputs_["add"]) - np.testing.assert_allclose(res.as_numpy('OUTPUT1'), - self.expected_outputs_["sub"]) + np.testing.assert_allclose( + res.as_numpy("OUTPUT0"), self.expected_outputs_["add"] + ) + np.testing.assert_allclose( + res.as_numpy("OUTPUT1"), self.expected_outputs_["sub"] + ) # Checker to perform inference on given model, expecting model to have @@ -90,13 +90,14 @@ def infer(self, model): # OUTPUT0 = INPUT0 - INPUT1 # OUTPUT1 = INPUT0 + INPUT1 class SubAddChecker(AddSubChecker): - def infer(self, model): res = self.client_.infer(model, self.inputs_) - np.testing.assert_allclose(res.as_numpy('OUTPUT0'), - self.expected_outputs_["sub"]) - np.testing.assert_allclose(res.as_numpy('OUTPUT1'), - self.expected_outputs_["add"]) + np.testing.assert_allclose( + res.as_numpy("OUTPUT0"), self.expected_outputs_["sub"] + ) + np.testing.assert_allclose( + res.as_numpy("OUTPUT1"), self.expected_outputs_["add"] + ) # @@ -105,7 +106,6 @@ def infer(self, model): class ModelNamespacePoll(tu.TestResultCollector): - def setUp(self): self.addsub_ = AddSubChecker() self.subadd_ = SubAddChecker() @@ -138,19 +138,18 @@ def test_duplication(self): # infer check for model in [ - "simple_addsub", + "simple_addsub", ]: self.addsub_.infer(model) for model in [ - "simple_subadd", + "simple_subadd", ]: self.subadd_.infer(model) # error check try: self.addsub_.infer("composing_model") - self.assertTrue( - False, "expected error for inferring ambiguous named model") + self.assertTrue(False, "expected error for inferring ambiguous named model") except InferenceServerException as ex: self.assertIn("ambiguity", ex.message()) @@ -165,34 +164,32 @@ def test_ensemble_duplication(self): # infer for model in [ - "composing_addsub", + "composing_addsub", ]: self.addsub_.infer(model) for model in [ - "composing_subadd", + "composing_subadd", ]: self.subadd_.infer(model) # error check try: 
self.addsub_.infer("simple_ensemble") - self.assertTrue( - False, "expected error for inferring ambiguous named model") + self.assertTrue(False, "expected error for inferring ambiguous named model") except InferenceServerException as ex: self.assertIn("ambiguity", ex.message()) def test_dynamic_resolution(self): # Same model setup as 'test_duplication', will remove / add one of the # composing model at runtime and expect the ensemble to be properly - # linked to exisiting composing model at different steps. + # linked to existing composing model at different steps. # 1. Remove 'composing_model' in addsub_repo, expect both ensembles use # 'composing_model' in subadd_repo and act as subadd # 2. Add back 'composing_model' in addsub_repo, expect the ensembles to behave the # same as before the removal. self.assertTrue("NAMESPACE_TESTING_DIRCTORY" in os.environ) td = os.environ["NAMESPACE_TESTING_DIRCTORY"] - composing_before_path = os.path.join(td, "addsub_repo", - "composing_model") + composing_before_path = os.path.join(td, "addsub_repo", "composing_model") composing_after_path = os.path.join(td, "composing_model") self.check_health() @@ -210,25 +207,23 @@ def test_dynamic_resolution(self): # infer for model in [ - "simple_addsub", + "simple_addsub", ]: self.addsub_.infer(model) for model in [ - "simple_subadd", + "simple_subadd", ]: self.subadd_.infer(model) # error check try: self.addsub_.infer("composing_model") - self.assertTrue( - False, "expected error for inferring ambiguous named model") + self.assertTrue(False, "expected error for inferring ambiguous named model") except InferenceServerException as ex: self.assertIn("ambiguity", ex.message()) class ModelNamespaceExplicit(tu.TestResultCollector): - def setUp(self): self.addsub_ = AddSubChecker() self.subadd_ = SubAddChecker() @@ -267,19 +262,18 @@ def test_duplication(self): # infer for model in [ - "simple_addsub", + "simple_addsub", ]: self.addsub_.infer(model) for model in [ - "simple_subadd", + "simple_subadd", ]: self.subadd_.infer(model) # error check try: self.addsub_.infer("composing_model") - self.assertTrue( - False, "expected error for inferring ambiguous named model") + self.assertTrue(False, "expected error for inferring ambiguous named model") except InferenceServerException as ex: self.assertIn("ambiguity", ex.message()) @@ -297,34 +291,32 @@ def test_ensemble_duplication(self): # infer for model in [ - "composing_addsub", + "composing_addsub", ]: self.addsub_.infer(model) for model in [ - "composing_subadd", + "composing_subadd", ]: self.subadd_.infer(model) # error check try: self.addsub_.infer("simple_ensemble") - self.assertTrue( - False, "expected error for inferring ambiguous named model") + self.assertTrue(False, "expected error for inferring ambiguous named model") except InferenceServerException as ex: self.assertIn("ambiguity", ex.message()) def test_dynamic_resolution(self): # Same model setup as 'test_duplication', will remove / add one of the # composing model at runtime and expect the ensemble to be properly - # linked to exisiting composing model at different steps. + # linked to existing composing model at different steps. # 1. Remove 'composing_model' in addsub_repo, expect both ensembles use # 'composing_model' in subadd_repo and act as subadd. # 2. Add back 'composing_model' in addsub_repo, expect the ensembles to behave the # same as before the removal. 
self.assertTrue("NAMESPACE_TESTING_DIRCTORY" in os.environ) td = os.environ["NAMESPACE_TESTING_DIRCTORY"] - composing_before_path = os.path.join(td, "addsub_repo", - "composing_model") + composing_before_path = os.path.join(td, "addsub_repo", "composing_model") composing_after_path = os.path.join(td, "composing_model") self.check_health() @@ -343,28 +335,27 @@ def test_dynamic_resolution(self): # Explicitly load one of the ensembel, should still trigger cascading # (re-)load for model in [ - "simple_addsub", + "simple_addsub", ]: self.client_.load_model(model) # infer for model in [ - "simple_addsub", + "simple_addsub", ]: self.addsub_.infer(model) for model in [ - "simple_subadd", + "simple_subadd", ]: self.subadd_.infer(model) # error check try: self.addsub_.infer("composing_model") - self.assertTrue( - False, "expected error for inferring ambiguous named model") + self.assertTrue(False, "expected error for inferring ambiguous named model") except InferenceServerException as ex: self.assertIn("ambiguity", ex.message()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_model_namespacing/test.sh b/qa/L0_model_namespacing/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_model_namespacing/test_duplication/addsub_repo/composing_model/1/model.py b/qa/L0_model_namespacing/test_duplication/addsub_repo/composing_model/1/model.py old mode 100644 new mode 100755 index 8a184619b0..71f89a1659 --- a/qa/L0_model_namespacing/test_duplication/addsub_repo/composing_model/1/model.py +++ b/qa/L0_model_namespacing/test_duplication/addsub_repo/composing_model/1/model.py @@ -1,5 +1,7 @@ -import sys +#!/usr/bin/env python3 + import os +import sys # load pre-defined QA model sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) diff --git a/qa/L0_model_namespacing/test_duplication/addsub_repo/simple_addsub/config.pbtxt b/qa/L0_model_namespacing/test_duplication/addsub_repo/simple_addsub/config.pbtxt old mode 100755 new mode 100644 index 944adcecc2..245e256976 --- a/qa/L0_model_namespacing/test_duplication/addsub_repo/simple_addsub/config.pbtxt +++ b/qa/L0_model_namespacing/test_duplication/addsub_repo/simple_addsub/config.pbtxt @@ -35,7 +35,7 @@ input [ name: "INPUT0" data_type: TYPE_INT32 dims: [ 16 ] - + } ] input [ @@ -43,7 +43,7 @@ input [ name: "INPUT1" data_type: TYPE_INT32 dims: [ 16 ] - + } ] output [ @@ -51,8 +51,8 @@ output [ name: "OUTPUT0" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] output [ @@ -60,8 +60,8 @@ output [ name: "OUTPUT1" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] ensemble_scheduling { diff --git a/qa/L0_model_namespacing/test_duplication/subadd_repo/composing_model/1/model.py b/qa/L0_model_namespacing/test_duplication/subadd_repo/composing_model/1/model.py old mode 100644 new mode 100755 index b21b24fd4e..4eed1f9a40 --- a/qa/L0_model_namespacing/test_duplication/subadd_repo/composing_model/1/model.py +++ b/qa/L0_model_namespacing/test_duplication/subadd_repo/composing_model/1/model.py @@ -1,5 +1,7 @@ -import sys +#!/usr/bin/env python3 + import os +import sys # load pre-defined QA model sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) diff --git a/qa/L0_model_namespacing/test_duplication/subadd_repo/simple_subadd/config.pbtxt b/qa/L0_model_namespacing/test_duplication/subadd_repo/simple_subadd/config.pbtxt old mode 100755 new mode 100644 index fc9fe34081..85d8ec0051 --- a/qa/L0_model_namespacing/test_duplication/subadd_repo/simple_subadd/config.pbtxt +++ 
b/qa/L0_model_namespacing/test_duplication/subadd_repo/simple_subadd/config.pbtxt @@ -33,7 +33,7 @@ input [ name: "INPUT0" data_type: TYPE_INT32 dims: [ 16 ] - + } ] input [ @@ -41,7 +41,7 @@ input [ name: "INPUT1" data_type: TYPE_INT32 dims: [ 16 ] - + } ] output [ @@ -49,8 +49,8 @@ output [ name: "OUTPUT0" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] output [ @@ -58,8 +58,8 @@ output [ name: "OUTPUT1" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] ensemble_scheduling { diff --git a/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/composing_model/1/model.py b/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/composing_model/1/model.py old mode 100644 new mode 100755 index 8a184619b0..71f89a1659 --- a/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/composing_model/1/model.py +++ b/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/composing_model/1/model.py @@ -1,5 +1,7 @@ -import sys +#!/usr/bin/env python3 + import os +import sys # load pre-defined QA model sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) diff --git a/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/simple_addsub/config.pbtxt b/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/simple_addsub/config.pbtxt old mode 100755 new mode 100644 index 944adcecc2..245e256976 --- a/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/simple_addsub/config.pbtxt +++ b/qa/L0_model_namespacing/test_dynamic_resolution/addsub_repo/simple_addsub/config.pbtxt @@ -35,7 +35,7 @@ input [ name: "INPUT0" data_type: TYPE_INT32 dims: [ 16 ] - + } ] input [ @@ -43,7 +43,7 @@ input [ name: "INPUT1" data_type: TYPE_INT32 dims: [ 16 ] - + } ] output [ @@ -51,8 +51,8 @@ output [ name: "OUTPUT0" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] output [ @@ -60,8 +60,8 @@ output [ name: "OUTPUT1" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] ensemble_scheduling { diff --git a/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/composing_model/1/model.py b/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/composing_model/1/model.py old mode 100644 new mode 100755 index b21b24fd4e..4eed1f9a40 --- a/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/composing_model/1/model.py +++ b/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/composing_model/1/model.py @@ -1,5 +1,7 @@ -import sys +#!/usr/bin/env python3 + import os +import sys # load pre-defined QA model sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) diff --git a/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/simple_subadd/config.pbtxt b/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/simple_subadd/config.pbtxt old mode 100755 new mode 100644 index 944adcecc2..245e256976 --- a/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/simple_subadd/config.pbtxt +++ b/qa/L0_model_namespacing/test_dynamic_resolution/subadd_repo/simple_subadd/config.pbtxt @@ -35,7 +35,7 @@ input [ name: "INPUT0" data_type: TYPE_INT32 dims: [ 16 ] - + } ] input [ @@ -43,7 +43,7 @@ input [ name: "INPUT1" data_type: TYPE_INT32 dims: [ 16 ] - + } ] output [ @@ -51,8 +51,8 @@ output [ name: "OUTPUT0" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] output [ @@ -60,8 +60,8 @@ output [ name: "OUTPUT1" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] ensemble_scheduling { diff --git a/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/composing_addsub/1/model.py b/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/composing_addsub/1/model.py old mode 100644 new 
mode 100755 index 8a184619b0..71f89a1659 --- a/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/composing_addsub/1/model.py +++ b/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/composing_addsub/1/model.py @@ -1,5 +1,7 @@ -import sys +#!/usr/bin/env python3 + import os +import sys # load pre-defined QA model sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) diff --git a/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/simple_ensemble/config.pbtxt b/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/simple_ensemble/config.pbtxt old mode 100755 new mode 100644 index 2bf341b364..2a9f0003a3 --- a/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/simple_ensemble/config.pbtxt +++ b/qa/L0_model_namespacing/test_ensemble_duplication/addsub_repo/simple_ensemble/config.pbtxt @@ -35,7 +35,7 @@ input [ name: "INPUT0" data_type: TYPE_INT32 dims: [ 16 ] - + } ] input [ @@ -43,7 +43,7 @@ input [ name: "INPUT1" data_type: TYPE_INT32 dims: [ 16 ] - + } ] output [ @@ -51,8 +51,8 @@ output [ name: "OUTPUT0" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] output [ @@ -60,8 +60,8 @@ output [ name: "OUTPUT1" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] ensemble_scheduling { diff --git a/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/composing_subadd/1/model.py b/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/composing_subadd/1/model.py old mode 100644 new mode 100755 index b21b24fd4e..4eed1f9a40 --- a/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/composing_subadd/1/model.py +++ b/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/composing_subadd/1/model.py @@ -1,5 +1,7 @@ -import sys +#!/usr/bin/env python3 + import os +import sys # load pre-defined QA model sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) diff --git a/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/simple_ensemble/config.pbtxt b/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/simple_ensemble/config.pbtxt old mode 100755 new mode 100644 index aa79a7bd08..0ee1015f25 --- a/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/simple_ensemble/config.pbtxt +++ b/qa/L0_model_namespacing/test_ensemble_duplication/subadd_repo/simple_ensemble/config.pbtxt @@ -35,7 +35,7 @@ input [ name: "INPUT0" data_type: TYPE_INT32 dims: [ 16 ] - + } ] input [ @@ -43,7 +43,7 @@ input [ name: "INPUT1" data_type: TYPE_INT32 dims: [ 16 ] - + } ] output [ @@ -51,8 +51,8 @@ output [ name: "OUTPUT0" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] output [ @@ -60,8 +60,8 @@ output [ name: "OUTPUT1" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] ensemble_scheduling { diff --git a/qa/L0_model_namespacing/test_no_duplication/addsub_repo/composing_addsub/1/model.py b/qa/L0_model_namespacing/test_no_duplication/addsub_repo/composing_addsub/1/model.py old mode 100644 new mode 100755 index 8a184619b0..71f89a1659 --- a/qa/L0_model_namespacing/test_no_duplication/addsub_repo/composing_addsub/1/model.py +++ b/qa/L0_model_namespacing/test_no_duplication/addsub_repo/composing_addsub/1/model.py @@ -1,5 +1,7 @@ -import sys +#!/usr/bin/env python3 + import os +import sys # load pre-defined QA model sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) diff --git a/qa/L0_model_namespacing/test_no_duplication/addsub_repo/simple_addsub/config.pbtxt b/qa/L0_model_namespacing/test_no_duplication/addsub_repo/simple_addsub/config.pbtxt old mode 100755 new mode 100644 index 2bf341b364..2a9f0003a3 --- 
a/qa/L0_model_namespacing/test_no_duplication/addsub_repo/simple_addsub/config.pbtxt +++ b/qa/L0_model_namespacing/test_no_duplication/addsub_repo/simple_addsub/config.pbtxt @@ -35,7 +35,7 @@ input [ name: "INPUT0" data_type: TYPE_INT32 dims: [ 16 ] - + } ] input [ @@ -43,7 +43,7 @@ input [ name: "INPUT1" data_type: TYPE_INT32 dims: [ 16 ] - + } ] output [ @@ -51,8 +51,8 @@ output [ name: "OUTPUT0" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] output [ @@ -60,8 +60,8 @@ output [ name: "OUTPUT1" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] ensemble_scheduling { diff --git a/qa/L0_model_namespacing/test_no_duplication/subadd_repo/composing_subadd/1/model.py b/qa/L0_model_namespacing/test_no_duplication/subadd_repo/composing_subadd/1/model.py old mode 100644 new mode 100755 index b21b24fd4e..4eed1f9a40 --- a/qa/L0_model_namespacing/test_no_duplication/subadd_repo/composing_subadd/1/model.py +++ b/qa/L0_model_namespacing/test_no_duplication/subadd_repo/composing_subadd/1/model.py @@ -1,5 +1,7 @@ -import sys +#!/usr/bin/env python3 + import os +import sys # load pre-defined QA model sys.path.append(os.environ["TRITON_QA_PYTHON_MODEL_DIR"]) diff --git a/qa/L0_model_namespacing/test_no_duplication/subadd_repo/simple_subadd/config.pbtxt b/qa/L0_model_namespacing/test_no_duplication/subadd_repo/simple_subadd/config.pbtxt old mode 100755 new mode 100644 index aa79a7bd08..0ee1015f25 --- a/qa/L0_model_namespacing/test_no_duplication/subadd_repo/simple_subadd/config.pbtxt +++ b/qa/L0_model_namespacing/test_no_duplication/subadd_repo/simple_subadd/config.pbtxt @@ -35,7 +35,7 @@ input [ name: "INPUT0" data_type: TYPE_INT32 dims: [ 16 ] - + } ] input [ @@ -43,7 +43,7 @@ input [ name: "INPUT1" data_type: TYPE_INT32 dims: [ 16 ] - + } ] output [ @@ -51,8 +51,8 @@ output [ name: "OUTPUT0" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] output [ @@ -60,8 +60,8 @@ output [ name: "OUTPUT1" data_type: TYPE_INT32 dims: [ 16 ] - - + + } ] ensemble_scheduling { diff --git a/qa/L0_model_queue/model_queue_test.py b/qa/L0_model_queue/model_queue_test.py old mode 100644 new mode 100755 index e0875205ff..14d2349c8c --- a/qa/L0_model_queue/model_queue_test.py +++ b/qa/L0_model_queue/model_queue_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,15 +30,16 @@ sys.path.append("../common") -from builtins import range -import time import threading +import time import unittest -import numpy as np +from builtins import range +from ctypes import * + import infer_util as iu +import numpy as np import test_util as tu from tritonclientutils import InferenceServerException -from ctypes import * _max_queue_delay_ms = 10000 @@ -45,15 +48,11 @@ class ModelQueueTest(tu.TestResultCollector): - def setUp(self): self.trials_ = [] for base in ["custom", "ensemble"]: for is_http_trial in [True, False]: - self.trials_.append({ - "base": base, - "is_http_trial": is_http_trial - }) + self.trials_.append({"base": base, "is_http_trial": is_http_trial}) global _deferred_exceptions _deferred_exceptions = [] @@ -70,33 +69,41 @@ def check_deferred_exception(self): _deferred_exceptions.pop(0) raise first_exception - def check_response(self, - bs, - dtype, - shapes, - priority, - timeout_us, - thresholds, - base="custom", - is_http_trial=True): - full_shapes = [[ - bs, - ] + shape for shape in shapes] + def check_response( + self, + bs, + dtype, + shapes, + priority, + timeout_us, + thresholds, + base="custom", + is_http_trial=True, + ): + full_shapes = [ + [ + bs, + ] + + shape + for shape in shapes + ] try: start_ms = int(round(time.time() * 1000)) - iu.infer_zero(self, - base, - bs, - dtype, - full_shapes, - full_shapes, - model_version=1, - use_http_json_tensors=False, - use_http=is_http_trial, - use_grpc=(not is_http_trial), - use_streaming=False, - priority=priority, - timeout_us=timeout_us) + iu.infer_zero( + self, + base, + bs, + dtype, + full_shapes, + full_shapes, + model_version=1, + use_http_json_tensors=False, + use_http=is_http_trial, + use_grpc=(not is_http_trial), + use_streaming=False, + priority=priority, + timeout_us=timeout_us, + ) end_ms = int(round(time.time() * 1000)) @@ -105,13 +112,21 @@ def check_response(self, if lt_ms is not None: self.assertTrue( (end_ms - start_ms) < lt_ms, - "expected less than " + str(lt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected less than " + + str(lt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) if gt_ms is not None: self.assertTrue( (end_ms - start_ms) > gt_ms, - "expected greater than " + str(gt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected greater than " + + str(gt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) except Exception as ex: self.add_deferred_exception(ex) @@ -130,10 +145,12 @@ def test_max_queue_size(self): threads = [] for i in range(10): threads.append( - threading.Thread(target=self.check_response, - args=(1, dtype, shapes, 0, 0, (None, - None)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 0, (None, None)), + kwargs=trial, + ) + ) preceding_thread.start() time.sleep(0.5) for t in threads: @@ -150,8 +167,10 @@ def test_max_queue_size(self): except InferenceServerException as ex: self.assertTrue( "Exceeds maximum queue size" in ex.message(), - "Expected error message \"Exceeds maximum queue size\", got: {}" - .format(ex)) + 'Expected error message "Exceeds maximum queue size", got: {}'.format( + ex + ), + ) try: self.check_deferred_exception() except InferenceServerException as ex: @@ -170,18 +189,26 @@ def test_policy_delay(self): try: threads = [] threads.append( - 
threading.Thread(target=self.check_response, - args=(1, dtype, shapes, 0, 0, (15000, - 10000)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 0, (15000, 10000)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (100, 0)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (100, 0)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) threads[0].start() time.sleep(0.2) threads[1].start() @@ -203,17 +230,26 @@ def test_policy_reject(self): for trial in self.trials_: threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(1, dtype, shapes, 0, 0, (None, None)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 0, (None, None)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (100, 0)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (100, 0)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) threads[0].start() time.sleep(0.2) threads[1].start() @@ -228,8 +264,10 @@ def test_policy_reject(self): except InferenceServerException as ex: self.assertTrue( "Request timeout expired" in ex.message(), - "Expected error message \"Request timeout expired\", got: {}" - .format(ex)) + 'Expected error message "Request timeout expired", got: {}'.format( + ex + ), + ) try: self.check_deferred_exception() @@ -238,7 +276,7 @@ def test_policy_reject(self): def test_timeout_override(self): # Send requests with batch sizes 1, 1, 3 where the first request - # overrides the timout to be less than 'default_timeout_microseconds', + # overrides the timeout to be less than 'default_timeout_microseconds', # and the second and third requests are sent after the overridden # timeout. 
Expect the first request is timed-out and rejected before # 'default_timeout_microseconds', which makes the second and third @@ -250,18 +288,26 @@ def test_timeout_override(self): for trial in self.trials_: threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(1, dtype, shapes, 0, 100000, (None, - None)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 100000, (None, None)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (100, 0)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (100, 0)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (100, 0)), + kwargs=trial, + ) + ) threads[0].start() time.sleep(0.2) threads[1].start() @@ -276,8 +322,10 @@ def test_timeout_override(self): except InferenceServerException as ex: self.assertTrue( "Request timeout expired" in ex.message(), - "Expected error message \"Request timeout expired\", got: {}" - .format(ex)) + 'Expected error message "Request timeout expired", got: {}'.format( + ex + ), + ) try: self.check_deferred_exception() @@ -289,18 +337,26 @@ def test_timeout_override(self): # 'default_timeout_microseconds' and before queue delay. threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(1, dtype, shapes, 0, 10000000, (None, - None)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 10000000, (None, None)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (1100, 700)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (1100, 700)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (1100, 700)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (1100, 700)), + kwargs=trial, + ) + ) threads[0].start() time.sleep(0.2) threads[1].start() @@ -315,8 +371,10 @@ def test_timeout_override(self): except InferenceServerException as ex: self.assertTrue( "Request timeout expired" in ex.message(), - "Expected error message \"Request timeout expired\", got: {}" - .format(ex)) + 'Expected error message "Request timeout expired", got: {}'.format( + ex + ), + ) try: self.check_deferred_exception() @@ -327,17 +385,26 @@ def test_timeout_override(self): # processed only after 'default_timeout_microseconds' threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(1, dtype, shapes, 0, 0, (None, None)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 0, (None, None)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (1100, 700)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (1100, 700)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (1100, 700)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (1100, 700)), + kwargs=trial, + ) + ) threads[0].start() time.sleep(0.2) 
threads[1].start() @@ -352,8 +419,10 @@ def test_timeout_override(self): except InferenceServerException as ex: self.assertTrue( "Request timeout expired" in ex.message(), - "Expected error message \"Request timeout expired\", got: {}" - .format(ex)) + 'Expected error message "Request timeout expired", got: {}'.format( + ex + ), + ) try: self.check_deferred_exception() @@ -370,17 +439,26 @@ def test_priority_levels(self): for trial in self.trials_: threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (500, 200)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (500, 200)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(1, dtype, shapes, 0, 0, (15000, 10000)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 0, 0, (15000, 10000)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 1, 0, (100, 0)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 1, 0, (100, 0)), + kwargs=trial, + ) + ) threads[0].start() # wait to make sure the order is correct time.sleep(0.1) @@ -407,18 +485,26 @@ def test_max_priority_levels(self): for trial in self.trials_: threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 0, 0, (500, 200)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 0, 0, (500, 200)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(1, dtype, shapes, MAX_UINT32_PLUS_1, 0, - (15000, 10000)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, MAX_UINT32_PLUS_1, 0, (15000, 10000)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 1, 0, (100, 0)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 1, 0, (100, 0)), + kwargs=trial, + ) + ) threads[0].start() # wait to make sure the order is correct time.sleep(0.1) @@ -464,31 +550,47 @@ def test_priority_with_policy(self): # The expected ranges may not be rounded to accommodate # the sleep between sending requests threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 1, 0, (2000, 1000)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 1, 0, (2000, 1000)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(1, dtype, shapes, 1, 1000000, (3400, - 2400)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(1, dtype, shapes, 1, 1000000, (3400, 2400)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 1, 0, (1700, 700)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 1, 0, (1700, 700)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, dtype, shapes, 2, 2000000, (None, - None)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(2, dtype, shapes, 2, 2000000, (None, None)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(3, dtype, shapes, 2, 0, (2700, 1700)), - kwargs=trial)) + threading.Thread( + 
target=self.check_response, + args=(3, dtype, shapes, 2, 0, (2700, 1700)), + kwargs=trial, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(6, dtype, shapes, 2, 0, (15000, 10000)), - kwargs=trial)) + threading.Thread( + target=self.check_response, + args=(6, dtype, shapes, 2, 0, (15000, 10000)), + kwargs=trial, + ) + ) for t in threads: t.start() time.sleep(0.2) @@ -502,8 +604,10 @@ def test_priority_with_policy(self): except InferenceServerException as ex: self.assertTrue( "Request timeout expired" in ex.message(), - "Expected error message \"Request timeout expired\", got: {}" - .format(ex)) + 'Expected error message "Request timeout expired", got: {}'.format( + ex + ), + ) try: self.check_deferred_exception() @@ -511,5 +615,5 @@ def test_priority_with_policy(self): self.assertTrue(False, "unexpected error {}".format(ex)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_model_update/instance_update_test.py b/qa/L0_model_update/instance_update_test.py old mode 100644 new mode 100755 index 39f5bfc8d4..27a09486d9 --- a/qa/L0_model_update/instance_update_test.py +++ b/qa/L0_model_update/instance_update_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -24,23 +26,28 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import unittest +import concurrent.futures +import json import os import random import time -import concurrent.futures -import json +import unittest + import numpy as np import tritonclient.grpc as grpcclient +from models.model_init_del.util import ( + disable_batching, + enable_batching, + get_count, + reset_count, + set_delay, + update_instance_group, + update_model_file, +) from tritonclient.utils import InferenceServerException -from models.model_init_del.util import (get_count, reset_count, set_delay, - update_instance_group, - update_model_file, enable_batching, - disable_batching) class TestInstanceUpdate(unittest.TestCase): - __model_name = "model_init_del" def setUp(self): @@ -99,17 +106,18 @@ def __load_model(self, instance_count, instance_config="", batching=False): set_delay("initialize", 0) set_delay("infer", 0) # Load model - self.__update_instance_count(instance_count, - 0, - instance_config, - batching=batching) - - def __update_instance_count(self, - add_count, - del_count, - instance_config="", - wait_for_finalize=False, - batching=False): + self.__update_instance_count( + instance_count, 0, instance_config, batching=batching + ) + + def __update_instance_count( + self, + add_count, + del_count, + instance_config="", + wait_for_finalize=False, + batching=False, + ): self.assertIsInstance(add_count, int) self.assertGreaterEqual(add_count, 0) self.assertIsInstance(del_count, int) @@ -122,8 +130,7 @@ def __update_instance_count(self, if len(instance_config) == 0: prev_count = prev_initialize_count - prev_finalize_count new_count = prev_count + add_count - del_count - instance_config = ("{\ncount: " + str(new_count) + - "\nkind: KIND_CPU\n}") + instance_config = "{\ncount: " + str(new_count) + "\nkind: KIND_CPU\n}" update_instance_group(instance_config) self.__triton.load_model(self.__model_name) self.__check_count("initialize", new_initialize_count) @@ -190,20 +197,20 @@ def test_gpu_instance_update(self): def test_gpu_cpu_instance_update(self): # Load model 
with 1 GPU instance and 2 CPU instance self.__load_model( - 3, - "{\ncount: 2\nkind: KIND_CPU\n},\n{\ncount: 1\nkind: KIND_GPU\n}") + 3, "{\ncount: 2\nkind: KIND_CPU\n},\n{\ncount: 1\nkind: KIND_GPU\n}" + ) # Add 2 GPU instance and remove 1 CPU instance self.__update_instance_count( - 2, 1, - "{\ncount: 1\nkind: KIND_CPU\n},\n{\ncount: 3\nkind: KIND_GPU\n}") + 2, 1, "{\ncount: 1\nkind: KIND_CPU\n},\n{\ncount: 3\nkind: KIND_GPU\n}" + ) # Shuffle the instances self.__update_instance_count( - 0, 0, - "{\ncount: 3\nkind: KIND_GPU\n},\n{\ncount: 1\nkind: KIND_CPU\n}") + 0, 0, "{\ncount: 3\nkind: KIND_GPU\n},\n{\ncount: 1\nkind: KIND_CPU\n}" + ) # Remove 1 GPU instance and add 1 CPU instance self.__update_instance_count( - 1, 1, - "{\ncount: 2\nkind: KIND_GPU\n},\n{\ncount: 2\nkind: KIND_CPU\n}") + 1, 1, "{\ncount: 2\nkind: KIND_GPU\n},\n{\ncount: 2\nkind: KIND_CPU\n}" + ) # Unload model self.__unload_model() @@ -212,12 +219,13 @@ def test_instance_name_update(self): # Load 3 instances with 2 different names self.__load_model( 3, - "{\nname: \"old_1\"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: \"old_2\"\ncount: 2\nkind: KIND_GPU\n}" + '{\nname: "old_1"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: "old_2"\ncount: 2\nkind: KIND_GPU\n}', ) # Change the instance names self.__update_instance_count( - 0, 0, - "{\nname: \"new_1\"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: \"new_2\"\ncount: 2\nkind: KIND_GPU\n}" + 0, + 0, + '{\nname: "new_1"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: "new_2"\ncount: 2\nkind: KIND_GPU\n}', ) # Unload model self.__unload_model() @@ -227,24 +235,27 @@ def test_instance_signature(self): # Load 2 GPU instances and 3 CPU instances self.__load_model( 5, - "{\nname: \"GPU_group\"\ncount: 2\nkind: KIND_GPU\n},\n{\nname: \"CPU_group\"\ncount: 3\nkind: KIND_CPU\n}" + '{\nname: "GPU_group"\ncount: 2\nkind: KIND_GPU\n},\n{\nname: "CPU_group"\ncount: 3\nkind: KIND_CPU\n}', ) # Flatten the instances representation self.__update_instance_count( - 0, 0, - "{\nname: \"CPU_1\"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: \"CPU_2_3\"\ncount: 2\nkind: KIND_CPU\n},\n{\nname: \"GPU_1\"\ncount: 1\nkind: KIND_GPU\n},\n{\nname: \"GPU_2\"\ncount: 1\nkind: KIND_GPU\n}" + 0, + 0, + '{\nname: "CPU_1"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: "CPU_2_3"\ncount: 2\nkind: KIND_CPU\n},\n{\nname: "GPU_1"\ncount: 1\nkind: KIND_GPU\n},\n{\nname: "GPU_2"\ncount: 1\nkind: KIND_GPU\n}', ) time.sleep(0.1) # larger the gap for config.pbtxt timestamp to update # Consolidate different representations self.__update_instance_count( - 0, 0, - "{\nname: \"CPU_group\"\ncount: 3\nkind: KIND_CPU\n},\n{\nname: \"GPU_group\"\ncount: 2\nkind: KIND_GPU\n}" + 0, + 0, + '{\nname: "CPU_group"\ncount: 3\nkind: KIND_CPU\n},\n{\nname: "GPU_group"\ncount: 2\nkind: KIND_GPU\n}', ) time.sleep(0.1) # larger the gap for config.pbtxt timestamp to update # Flatten the instances representation self.__update_instance_count( - 0, 0, - "{\nname: \"GPU_1\"\ncount: 1\nkind: KIND_GPU\n},\n{\nname: \"GPU_2\"\ncount: 1\nkind: KIND_GPU\n},\n{\nname: \"CPU_1\"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: \"CPU_2\"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: \"CPU_3\"\ncount: 1\nkind: KIND_CPU\n}" + 0, + 0, + '{\nname: "GPU_1"\ncount: 1\nkind: KIND_GPU\n},\n{\nname: "GPU_2"\ncount: 1\nkind: KIND_GPU\n},\n{\nname: "CPU_1"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: "CPU_2"\ncount: 1\nkind: KIND_CPU\n},\n{\nname: "CPU_3"\ncount: 1\nkind: KIND_CPU\n}', ) # Unload model self.__unload_model() @@ -266,21 +277,22 @@ def test_invalid_config(self): def test_model_file_update(self): 
self.__load_model(5) update_model_file() - self.__update_instance_count(6, - 5, - "{\ncount: 6\nkind: KIND_CPU\n}", - wait_for_finalize=True) + self.__update_instance_count( + 6, 5, "{\ncount: 6\nkind: KIND_CPU\n}", wait_for_finalize=True + ) self.__unload_model() # Test instance update with non instance config changed in config.pbtxt def test_non_instance_config_update(self): self.__load_model(4, batching=False) enable_batching() - self.__update_instance_count(2, - 4, - "{\ncount: 2\nkind: KIND_CPU\n}", - wait_for_finalize=True, - batching=True) + self.__update_instance_count( + 2, + 4, + "{\ncount: 2\nkind: KIND_CPU\n}", + wait_for_finalize=True, + batching=True, + ) self.__unload_model(batching=True) # Test passing new instance config via load API @@ -320,8 +332,7 @@ def test_update_while_inferencing(self): infer_thread = pool.submit(self.__infer) time.sleep(2) # make sure inference has started update_start_time = time.time() - update_thread = pool.submit(self.__triton.load_model, - self.__model_name) + update_thread = pool.submit(self.__triton.load_model, self.__model_name) update_thread.result() update_end_time = time.time() infer_thread.result() @@ -347,8 +358,7 @@ def test_infer_while_updating(self): update_instance_group("{\ncount: 2\nkind: KIND_CPU\n}") with concurrent.futures.ThreadPoolExecutor() as pool: update_start_time = time.time() - update_thread = pool.submit(self.__triton.load_model, - self.__model_name) + update_thread = pool.submit(self.__triton.load_model, self.__model_name) time.sleep(2) # make sure update has started infer_start_time = time.time() infer_thread = pool.submit(self.__infer) @@ -369,18 +379,21 @@ def test_infer_while_updating(self): self.__unload_model() # Test instance resource requirement increase - @unittest.skipUnless("execution_count" in os.environ["RATE_LIMIT_MODE"], - "Rate limiter precondition not met for this test") + @unittest.skipUnless( + "execution_count" in os.environ["RATE_LIMIT_MODE"], + "Rate limiter precondition not met for this test", + ) def test_instance_resource_increase(self): # Load model self.__load_model( 1, - "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 2\n}\n]\n}\n}" + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 2\n}\n]\n}\n}', ) # Increase resource requirement self.__update_instance_count( - 1, 1, - "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 8\n}\n]\n}\n}" + 1, + 1, + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 8\n}\n]\n}\n}', ) # Check the model is not blocked from infer due to the default resource # possibly not updated to the larger resource requirement. 
@@ -401,42 +414,48 @@ def infer(): self.__unload_model() # Test instance resource requirement increase above explicit resource - @unittest.skipUnless(os.environ["RATE_LIMIT_MODE"] == - "execution_count_with_explicit_resource", - "Rate limiter precondition not met for this test") + @unittest.skipUnless( + os.environ["RATE_LIMIT_MODE"] == "execution_count_with_explicit_resource", + "Rate limiter precondition not met for this test", + ) def test_instance_resource_increase_above_explicit(self): # Load model self.__load_model( 1, - "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 2\n}\n]\n}\n}" + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 2\n}\n]\n}\n}', ) # Increase resource requirement with self.assertRaises(InferenceServerException): self.__update_instance_count( - 0, 0, - "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 32\n}\n]\n}\n}" + 0, + 0, + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 32\n}\n]\n}\n}', ) # Correct the resource requirement to match the explicit resource self.__update_instance_count( - 1, 1, - "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 10\n}\n]\n}\n}" + 1, + 1, + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 10\n}\n]\n}\n}', ) # Unload model self.__unload_model() # Test instance resource requirement decrease - @unittest.skipUnless("execution_count" in os.environ["RATE_LIMIT_MODE"], - "Rate limiter precondition not met for this test") + @unittest.skipUnless( + "execution_count" in os.environ["RATE_LIMIT_MODE"], + "Rate limiter precondition not met for this test", + ) def test_instance_resource_decrease(self): # Load model self.__load_model( 1, - "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 4\n}\n]\n}\n}" + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 4\n}\n]\n}\n}', ) # Decrease resource requirement self.__update_instance_count( - 1, 1, - "{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: \"R1\"\ncount: 3\n}\n]\n}\n}" + 1, + 1, + '{\ncount: 1\nkind: KIND_CPU\nrate_limiter {\nresources [\n{\nname: "R1"\ncount: 3\n}\n]\n}\n}', ) # Unload model self.__unload_model() @@ -445,8 +464,11 @@ def test_instance_resource_decrease(self): # max resource is actually decreased. time.sleep(1) # make sure the log file is updated log_path = os.path.join( - os.environ["MODEL_LOG_DIR"], "instance_update_test.rate_limit_" + - os.environ["RATE_LIMIT_MODE"] + ".server.log") + os.environ["MODEL_LOG_DIR"], + "instance_update_test.rate_limit_" + + os.environ["RATE_LIMIT_MODE"] + + ".server.log", + ) with open(log_path, mode="r", encoding="utf-8", errors="strict") as f: if os.environ["RATE_LIMIT_MODE"] == "execution_count": # Make sure the previous max resource limit of 4 is reduced to 3 diff --git a/qa/L0_multi_server/test.sh b/qa/L0_multi_server/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_nan_inf/models/nan_inf_output/1/model.py b/qa/L0_nan_inf/models/nan_inf_output/1/model.py old mode 100644 new mode 100755 index df269edf52..d85c3b4702 --- a/qa/L0_nan_inf/models/nan_inf_output/1/model.py +++ b/qa/L0_nan_inf/models/nan_inf_output/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,24 +27,22 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import json + import numpy as np import triton_python_backend_utils as pb_utils class TritonPythonModel: - def initialize(self, args): - self.model_config = json.loads(args['model_config']) + self.model_config = json.loads(args["model_config"]) def execute(self, requests): - """ This function is called on inference request. - """ + """This function is called on inference request.""" responses = [] for _ in requests: # Include one of each specially parsed JSON value: nan, inf, and -inf - out_0 = np.array([np.nan, np.inf, np.NINF, 1, 2, 3], - dtype=np.float32) + out_0 = np.array([np.nan, np.inf, np.NINF, 1, 2, 3], dtype=np.float32) out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0) responses.append(pb_utils.InferenceResponse([out_tensor_0])) diff --git a/qa/L0_nan_inf/nan_inf_test.py b/qa/L0_nan_inf/nan_inf_test.py old mode 100644 new mode 100755 index 630c573a1b..3013b03850 --- a/qa/L0_nan_inf/nan_inf_test.py +++ b/qa/L0_nan_inf/nan_inf_test.py @@ -27,37 +27,34 @@ import sys -sys.path.append('../common') +sys.path.append("../common") import json -import unittest import traceback +import unittest -import requests import numpy as np -import tritonclient.http as tritonhttpclient +import requests +import test_util as tu import tritonclient.grpc as tritongrpcclient +import tritonclient.http as tritonhttpclient from tritonclient.utils import InferenceServerException -import test_util as tu class NanInfTest(tu.TestResultCollector): - expected_output = np.array([np.nan, np.inf, np.NINF, 1, 2, 3], - dtype=np.float32) + expected_output = np.array([np.nan, np.inf, np.NINF, 1, 2, 3], dtype=np.float32) model_name = "nan_inf_output" def test_http_raw(self): payload = { - "inputs": [{ - "name": "INPUT0", - "datatype": "FP32", - "shape": [1], - "data": [1] - }] + "inputs": [ + {"name": "INPUT0", "datatype": "FP32", "shape": [1], "data": [1]} + ] } response = requests.post( "http://localhost:8000/v2/models/nan_inf_output/infer", - data=json.dumps(payload)) + data=json.dumps(payload), + ) if not response.ok: self.assertTrue(False, "Response not OK: {}".format(response.text)) @@ -65,40 +62,40 @@ def test_http_raw(self): print(response.json()) except: self.assertTrue( - False, "Response was not valid JSON:\n{}".format(response.text)) + False, "Response was not valid JSON:\n{}".format(response.text) + ) def test_http(self): triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") inputs = [] - inputs.append(tritonhttpclient.InferInput('INPUT0', [1], "FP32")) + inputs.append(tritonhttpclient.InferInput("INPUT0", [1], "FP32")) self.infer_helper(triton_client, inputs) def test_grpc(self): triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") inputs = [] - inputs.append(tritongrpcclient.InferInput('INPUT0', [1], "FP32")) + inputs.append(tritongrpcclient.InferInput("INPUT0", [1], "FP32")) self.infer_helper(triton_client, inputs) def infer_helper(self, triton_client, inputs): inputs[0].set_data_from_numpy(np.arange(1, dtype=np.float32)) try: - results = triton_client.infer(model_name=self.model_name, - inputs=inputs) - output0_data = results.as_numpy('OUTPUT0') + results = triton_client.infer(model_name=self.model_name, inputs=inputs) + output0_data = results.as_numpy("OUTPUT0") # Verify output is as expected # Make sure nan's are equivalent when compared - 
output_correct = np.array_equal(output0_data, - self.expected_output, - equal_nan=True) + output_correct = np.array_equal( + output0_data, self.expected_output, equal_nan=True + ) self.assertTrue( - output_correct, - "didn't get expected output0: {}".format(output0_data)) + output_correct, "didn't get expected output0: {}".format(output0_data) + ) except InferenceServerException as ex: self.assertTrue(False, ex.message()) except: self.assertTrue(False, traceback.format_exc()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_nullchar_string/nullchar_string_client.py b/qa/L0_nullchar_string/nullchar_string_client.py old mode 100644 new mode 100755 index d90304856d..2d69b41b3d --- a/qa/L0_nullchar_string/nullchar_string_client.py +++ b/qa/L0_nullchar_string/nullchar_string_client.py @@ -26,47 +26,51 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse -import numpy as np +import numpy as np import tritongrpcclient as grpcclient import tritonhttpclient as httpclient from tritonclientutils import np_to_triton_dtype FLAGS = None -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-m', - '--model-name', - type=str, - required=True, - help='Name of model') - parser.add_argument('-u', - '--url', - type=str, - required=False, - default='localhost:8000', - help='Inference server URL. Default is localhost:8000.') parser.add_argument( - '-i', - '--protocol', + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-m", "--model-name", type=str, required=True, help="Name of model" + ) + parser.add_argument( + "-u", + "--url", + type=str, + required=False, + default="localhost:8000", + help="Inference server URL. Default is localhost:8000.", + ) + parser.add_argument( + "-i", + "--protocol", type=str, required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') + default="http", + help='Protocol ("http"/"grpc") used to ' + + 'communicate with inference service. Default is "http".', + ) FLAGS = parser.parse_args() if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) + print( + 'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol) + ) exit(1) client_util = httpclient if FLAGS.protocol == "http" else grpcclient @@ -86,8 +90,9 @@ # Send inference request to the inference server. Get results for # output tensor. inputs = [ - client_util.InferInput("INPUT0", input0_data.shape, - np_to_triton_dtype(np.object_)) + client_util.InferInput( + "INPUT0", input0_data.shape, np_to_triton_dtype(np.object_) + ) ] inputs[0].set_data_from_numpy(input0_data) @@ -95,7 +100,7 @@ # We expect there to be 1 result (with batch-size 1). Compare the input # and output tensor calculated by the model. They must be the same. 
- output0_data = results.as_numpy('OUTPUT0') + output0_data = results.as_numpy("OUTPUT0") print(input0_data, "?=?", output0_data) assert np.equal(input0_data.astype(np.bytes_), output0_data).all() diff --git a/qa/L0_nullchar_string/test.sh b/qa/L0_nullchar_string/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_optional_input/models/ensemble_identity_2_float32/config.pbtxt b/qa/L0_optional_input/models/ensemble_identity_2_float32/config.pbtxt old mode 100755 new mode 100644 diff --git a/qa/L0_optional_input/models/identity_2_float32/config.pbtxt b/qa/L0_optional_input/models/identity_2_float32/config.pbtxt old mode 100755 new mode 100644 diff --git a/qa/L0_optional_input/models/pipeline_identity_2_float32/config.pbtxt b/qa/L0_optional_input/models/pipeline_identity_2_float32/config.pbtxt old mode 100755 new mode 100644 diff --git a/qa/L0_optional_input/optional_input_test.py b/qa/L0_optional_input/optional_input_test.py old mode 100644 new mode 100755 index c813146ecd..308efebf45 --- a/qa/L0_optional_input/optional_input_test.py +++ b/qa/L0_optional_input/optional_input_test.py @@ -30,13 +30,14 @@ sys.path.append("../common") -import numpy as np import sys -import time import threading +import time import unittest -import tritonclient.grpc as grpcclient + +import numpy as np import test_util as tu +import tritonclient.grpc as grpcclient _deferred_exceptions_lock = threading.Lock() _deferred_exceptions = [] @@ -44,31 +45,30 @@ # Similar set up as dynamic batcher tests class OptionalInputTest(tu.TestResultCollector): - def setUp(self): global _deferred_exceptions _deferred_exceptions = [] # The helper client for setup will be GRPC for simplicity. self.triton_client_ = grpcclient.InferenceServerClient("localhost:8001") - self.model_name_ = 'identity_2_float32' + self.model_name_ = "identity_2_float32" # This will not be changed even when ensemble is under test, # as the dynamic batching is performed within the composing model - self.check_status_model = 'identity_2_float32' + self.check_status_model = "identity_2_float32" self.tensor_shape_ = (1, 1) self.inputs_ = { - "INPUT0": grpcclient.InferInput('INPUT0', [1, 1], "FP32"), - "INPUT1": grpcclient.InferInput('INPUT1', [1, 1], "FP32") + "INPUT0": grpcclient.InferInput("INPUT0", [1, 1], "FP32"), + "INPUT1": grpcclient.InferInput("INPUT1", [1, 1], "FP32"), } self.input_data_ = { "INPUT0": np.ones(shape=(1, 1), dtype=np.float32), - "INPUT1": np.zeros(shape=(1, 1), dtype=np.float32) + "INPUT1": np.zeros(shape=(1, 1), dtype=np.float32), } self.inputs_["INPUT0"].set_data_from_numpy(self.input_data_["INPUT0"]) self.inputs_["INPUT1"].set_data_from_numpy(self.input_data_["INPUT1"]) self.outputs_ = { - "INPUT0": grpcclient.InferRequestedOutput('OUTPUT0'), - "INPUT1": grpcclient.InferRequestedOutput('OUTPUT1') + "INPUT0": grpcclient.InferRequestedOutput("OUTPUT0"), + "INPUT1": grpcclient.InferRequestedOutput("OUTPUT1"), } def add_deferred_exception(self, ex): @@ -93,9 +93,9 @@ def check_response(self, thresholds, provided_inputs=("INPUT0", "INPUT1")): outputs.append(self.outputs_[provided_input]) triton_client = grpcclient.InferenceServerClient("localhost:8001") - results = triton_client.infer(model_name=self.model_name_, - inputs=inputs, - outputs=outputs) + results = triton_client.infer( + model_name=self.model_name_, inputs=inputs, outputs=outputs + ) end_ms = int(round(time.time() * 1000)) @@ -106,20 +106,30 @@ def check_response(self, thresholds, provided_inputs=("INPUT0", "INPUT1")): self.assertTrue( np.array_equal(output_data, 
expected), "{}, {}, expected: {}, got {}".format( - self.model_name_, output_name, expected, output_data)) + self.model_name_, output_name, expected, output_data + ), + ) gt_ms = thresholds[0] lt_ms = thresholds[1] if lt_ms is not None: self.assertTrue( (end_ms - start_ms) < lt_ms, - "expected less than " + str(lt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected less than " + + str(lt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) if gt_ms is not None: self.assertTrue( (end_ms - start_ms) > gt_ms, - "expected greater than " + str(gt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected greater than " + + str(gt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) except Exception as ex: self.add_deferred_exception(ex) @@ -129,56 +139,75 @@ def check_status(self, model_name, batch_exec, request_cnt, infer_cnt): # inference statistics to be ready. num_tries = 10 for i in range(num_tries): - stats = self.triton_client_.get_inference_statistics( - model_name, "1") + stats = self.triton_client_.get_inference_statistics(model_name, "1") self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") actual_exec_cnt = stats.model_stats[0].execution_count if actual_exec_cnt == exec_cnt: break - print("WARNING: expect {} executions, got {} (attempt {})".format( - exec_cnt, actual_exec_cnt, i)) + print( + "WARNING: expect {} executions, got {} (attempt {})".format( + exec_cnt, actual_exec_cnt, i + ) + ) time.sleep(1) - self.assertEqual(stats.model_stats[0].name, model_name, - "expect model stats for model {}".format(model_name)) self.assertEqual( - stats.model_stats[0].version, "1", - "expect model stats for model {} version 1".format(model_name)) + stats.model_stats[0].name, + model_name, + "expect model stats for model {}".format(model_name), + ) + self.assertEqual( + stats.model_stats[0].version, + "1", + "expect model stats for model {} version 1".format(model_name), + ) batch_stats = stats.model_stats[0].batch_stats self.assertEqual( - len(batch_stats), len(batch_exec), + len(batch_stats), + len(batch_exec), "expected {} different batch-sizes, got {}".format( - len(batch_exec), len(batch_stats))) + len(batch_exec), len(batch_stats) + ), + ) for batch_stat in batch_stats: bs = batch_stat.batch_size bc = batch_stat.compute_infer.count - self.assertTrue(bs in batch_exec, - "unexpected batch-size {}".format(bs)) + self.assertTrue(bs in batch_exec, "unexpected batch-size {}".format(bs)) # Get count from one of the stats self.assertEqual( - bc, batch_exec[bs], - "expected model-execution-count {} for batch size {}, got {}". 
- format(batch_exec[bs], bs, bc)) + bc, + batch_exec[bs], + "expected model-execution-count {} for batch size {}, got {}".format( + batch_exec[bs], bs, bc + ), + ) actual_request_cnt = stats.model_stats[0].inference_stats.success.count self.assertEqual( - actual_request_cnt, request_cnt, + actual_request_cnt, + request_cnt, "expected model-request-count {}, got {}".format( - request_cnt, actual_request_cnt)) + request_cnt, actual_request_cnt + ), + ) actual_exec_cnt = stats.model_stats[0].execution_count self.assertEqual( - actual_request_cnt, request_cnt, - "expected model-exec-count {}, got {}".format( - request_cnt, actual_exec_cnt)) + actual_request_cnt, + request_cnt, + "expected model-exec-count {}, got {}".format(request_cnt, actual_exec_cnt), + ) actual_infer_cnt = stats.model_stats[0].inference_count self.assertEqual( - actual_infer_cnt, infer_cnt, + actual_infer_cnt, + infer_cnt, "expected model-inference-count {}, got {}".format( - infer_cnt, actual_infer_cnt)) + infer_cnt, actual_infer_cnt + ), + ) def test_all_inputs(self): # Provide all inputs, send requests that don't form preferred batch @@ -186,11 +215,11 @@ def test_all_inputs(self): try: threads = [] threads.append( - threading.Thread(target=self.check_response, - args=((4000, None),))) + threading.Thread(target=self.check_response, args=((4000, None),)) + ) threads.append( - threading.Thread(target=self.check_response, - args=((4000, None),))) + threading.Thread(target=self.check_response, args=((4000, None),)) + ) threads[0].start() threads[1].start() for t in threads: @@ -207,13 +236,19 @@ def test_optional_same_input(self): try: threads = [] threads.append( - threading.Thread(target=self.check_response, - args=((4000, None),), - kwargs={'provided_inputs': ("INPUT1",)})) + threading.Thread( + target=self.check_response, + args=((4000, None),), + kwargs={"provided_inputs": ("INPUT1",)}, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=((4000, None),), - kwargs={'provided_inputs': ("INPUT1",)})) + threading.Thread( + target=self.check_response, + args=((4000, None),), + kwargs={"provided_inputs": ("INPUT1",)}, + ) + ) threads[0].start() threads[1].start() for t in threads: @@ -231,22 +266,34 @@ def test_optional_mix_inputs(self): try: threads = [] threads.append( - threading.Thread(target=self.check_response, - args=((0, 4000),), - kwargs={'provided_inputs': ("INPUT0",)})) + threading.Thread( + target=self.check_response, + args=((0, 4000),), + kwargs={"provided_inputs": ("INPUT0",)}, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=((0, 4000),), - kwargs={'provided_inputs': ("INPUT1",)})) + threading.Thread( + target=self.check_response, + args=((0, 4000),), + kwargs={"provided_inputs": ("INPUT1",)}, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=((0, 4000),), - kwargs={'provided_inputs': ("INPUT0",)})) + threading.Thread( + target=self.check_response, + args=((0, 4000),), + kwargs={"provided_inputs": ("INPUT0",)}, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=((4000, None),), - kwargs={'provided_inputs': ("INPUT1",)})) + threading.Thread( + target=self.check_response, + args=((4000, None),), + kwargs={"provided_inputs": ("INPUT1",)}, + ) + ) for t in threads: t.start() time.sleep(0.5) @@ -266,19 +313,26 @@ def test_optional_mix_inputs_2(self): try: threads = [] threads.append( - threading.Thread(target=self.check_response, - args=((0, 4000),), - kwargs={'provided_inputs': ("INPUT0",)})) + 
threading.Thread( + target=self.check_response, + args=((0, 4000),), + kwargs={"provided_inputs": ("INPUT0",)}, + ) + ) threads.append( - threading.Thread(target=self.check_response, args=((0, 4000),))) + threading.Thread(target=self.check_response, args=((0, 4000),)) + ) threads.append( - threading.Thread(target=self.check_response, - args=((0, 4000),), - kwargs={'provided_inputs': ("INPUT0",)})) + threading.Thread( + target=self.check_response, + args=((0, 4000),), + kwargs={"provided_inputs": ("INPUT0",)}, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=((4000, None),))) + threading.Thread(target=self.check_response, args=((4000, None),)) + ) for t in threads: t.start() time.sleep(0.5) @@ -292,28 +346,28 @@ def test_optional_mix_inputs_2(self): def test_ensemble_all_inputs(self): # The ensemble is only a wrapper over 'identity_2_float32' - self.model_name_ = 'ensemble_identity_2_float32' + self.model_name_ = "ensemble_identity_2_float32" self.test_all_inputs() # From the ensemble's perspective, the requests are processed as it is self.check_status(self.model_name_, {1: 2}, 2, 2) def test_ensemble_optional_same_input(self): # The ensemble is only a wrapper over 'identity_2_float32' - self.model_name_ = 'ensemble_identity_2_float32' + self.model_name_ = "ensemble_identity_2_float32" self.test_optional_same_input() # From the ensemble's perspective, the requests are processed as it is self.check_status(self.model_name_, {1: 2}, 2, 2) def test_ensemble_optional_mix_inputs(self): # The ensemble is only a wrapper over 'identity_2_float32' - self.model_name_ = 'ensemble_identity_2_float32' + self.model_name_ = "ensemble_identity_2_float32" self.test_optional_mix_inputs() # From the ensemble's perspective, the requests are processed as it is self.check_status(self.model_name_, {1: 4}, 4, 4) def test_ensemble_optional_mix_inputs_2(self): # The ensemble is only a wrapper over 'identity_2_float32' - self.model_name_ = 'ensemble_identity_2_float32' + self.model_name_ = "ensemble_identity_2_float32" self.test_optional_mix_inputs_2() # From the ensemble's perspective, the requests are processed as it is self.check_status(self.model_name_, {1: 4}, 4, 4) @@ -323,7 +377,7 @@ def test_ensemble_optional_pipeline(self): # inputs, where the ensemble step only connects a subset of inputs # for the second model (which is valid because the disconnected inputs # are marked optional). See 'config.pbtxt' for detail. 
- self.model_name_ = 'pipeline_identity_2_float32' + self.model_name_ = "pipeline_identity_2_float32" # Provide all inputs, send requests that don't form preferred batch # so all requests should be returned after the queue delay @@ -334,28 +388,29 @@ def test_ensemble_optional_pipeline(self): inputs.append(self.inputs_[provided_input]) triton_client = grpcclient.InferenceServerClient("localhost:8001") - results = triton_client.infer(model_name=self.model_name_, - inputs=inputs) + results = triton_client.infer(model_name=self.model_name_, inputs=inputs) # OUTPU0 is always zero, OUTPUT1 = INPUT0 output_data = results.as_numpy("OUTPUT0") expected = np.zeros(shape=(1, 1), dtype=np.float32) self.assertTrue( np.array_equal(output_data, expected), - "{}, {}, expected: {}, got {}".format(self.model_name_, - "OUTPUT0", expected, - output_data)) + "{}, {}, expected: {}, got {}".format( + self.model_name_, "OUTPUT0", expected, output_data + ), + ) expected = self.input_data_["INPUT0"] output_data = results.as_numpy("OUTPUT1") self.assertTrue( np.array_equal(output_data, expected), - "{}, {}, expected: {}, got {}".format(self.model_name_, - "OUTPUT1", expected, - output_data)) + "{}, {}, expected: {}, got {}".format( + self.model_name_, "OUTPUT1", expected, output_data + ), + ) except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_output_name/output_name_test.py b/qa/L0_output_name/output_name_test.py old mode 100644 new mode 100755 index 46a464ada0..905174640c --- a/qa/L0_output_name/output_name_test.py +++ b/qa/L0_output_name/output_name_test.py @@ -30,17 +30,16 @@ sys.path.append("../common") import unittest + import test_util as tu +from tritongrpcclient import grpc_service_pb2, grpc_service_pb2_grpc import grpc -from tritongrpcclient import grpc_service_pb2 -from tritongrpcclient import grpc_service_pb2_grpc _trials = ("graphdef", "libtorch", "onnx", "plan", "savedmodel") class OutputNameValidationTest(tu.TestResultCollector): - def requestGenerator(self, model_name, output_name): request = grpc_service_pb2.ModelInferRequest() request.model_name = model_name @@ -53,12 +52,11 @@ def requestGenerator(self, model_name, output_name): request.inputs.extend([input]) - output = grpc_service_pb2.ModelInferRequest( - ).InferRequestedOutputTensor() + output = grpc_service_pb2.ModelInferRequest().InferRequestedOutputTensor() output.name = output_name request.outputs.extend([output]) - request.raw_input_contents.extend([bytes(4 * 'a', 'utf-8')]) + request.raw_input_contents.extend([bytes(4 * "a", "utf-8")]) return request @@ -73,14 +71,14 @@ def test_grpc(self): try: response = grpc_stub.ModelInfer(request) self.assertTrue( - False, - "unexpected success for unknown output " + model_name) + False, "unexpected success for unknown output " + model_name + ) except grpc.RpcError as rpc_error: msg = rpc_error.details() self.assertTrue( - msg.startswith( - "unexpected inference output 'DUMMY' for model")) + msg.startswith("unexpected inference output 'DUMMY' for model") + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_output_name/test.sh b/qa/L0_output_name/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_output_validation/lt_op_val_client.py b/qa/L0_output_validation/lt_op_val_client.py old mode 100644 new mode 100755 index 220ae42e64..77b5a16e3f --- a/qa/L0_output_validation/lt_op_val_client.py +++ 
b/qa/L0_output_validation/lt_op_val_client.py @@ -30,41 +30,44 @@ sys.path.append("../common") -import requests import unittest + +import requests import test_util as tu class OutputValidationTest(tu.TestResultCollector): # for datatype mismatch def test_datatype(self): - url = 'http://localhost:8000/v2/models/libtorch_datatype_1_float32/infer' + url = "http://localhost:8000/v2/models/libtorch_datatype_1_float32/infer" body = '{"inputs":[{"name":"INPUT__0","shape":[1,1],"datatype":"FP32","data":[1.0]}],"outputs":[{"name":"OUTPUT__0"}]}' response = requests.post(url, data=body) msg = response.json()["error"] self.assertTrue( msg.startswith( "configuration expects datatype TYPE_INT32 for output 'OUTPUT__0', model provides TYPE_FP32" - )) + ) + ) # for output mismatch def test_index(self): - url = 'http://localhost:8000/v2/models/libtorch_index_1_float32/infer' + url = "http://localhost:8000/v2/models/libtorch_index_1_float32/infer" body = '{"inputs":[{"name":"INPUT__0","shape":[1,1],"datatype":"FP32","data":[1.0]}],"outputs":[{"name":"OUTPUT__1"}]}' response = requests.post(url, data=body) msg = response.json()["error"] self.assertTrue( msg.startswith( "The output OUTPUT__1 in the model configuration refers to an output index which doesn't exist. This model has 1 outputs" - )) + ) + ) # successful run def test_success(self): - url = 'http://localhost:8000/v2/models/libtorch_zero_1_float32/infer' + url = "http://localhost:8000/v2/models/libtorch_zero_1_float32/infer" body = '{"inputs":[{"name":"INPUT__0","shape":[1,1],"datatype":"FP32","data":[1.0]}],"outputs":[{"name":"OUTPUT__0"}]}' response = requests.post(url, data=body) self.assertEqual(response.status_code, 200) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_output_validation/test.sh b/qa/L0_output_validation/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_parallel_copy/parallel_copy_test.py b/qa/L0_parallel_copy/parallel_copy_test.py old mode 100644 new mode 100755 index 4fdf406cc1..6748fee006 --- a/qa/L0_parallel_copy/parallel_copy_test.py +++ b/qa/L0_parallel_copy/parallel_copy_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,33 +30,36 @@ sys.path.append("../common") -from builtins import range +import functools import time import unittest +from builtins import range + import numpy as np import test_util as tu -import functools import tritonclient.grpc as grpcclient from tritonclient.utils import InferenceServerException class ParallelCopyTest(tu.TestResultCollector): - def setUp(self): self.client_ = grpcclient.InferenceServerClient("localhost:8001") self.dtype_ = np.float32 - self.model_name_ = tu.get_zero_model_name('plan', 1, self.dtype_) + self.model_name_ = tu.get_zero_model_name("plan", 1, self.dtype_) def _batch_input_duration(self, batch_size): stats = self.client_.get_inference_statistics(self.model_name_, "1") self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") self.assertEqual( - stats.model_stats[0].name, self.model_name_, - "expect model stats for model {}".format(self.model_name_)) + stats.model_stats[0].name, + self.model_name_, + "expect model stats for model {}".format(self.model_name_), + ) self.assertEqual( - stats.model_stats[0].version, "1", - "expect model stats for model {} version 1".format( - self.model_name_)) + stats.model_stats[0].version, + "1", + "expect model stats for model {} version 1".format(self.model_name_), + ) batch_stats = stats.model_stats[0].batch_stats @@ -70,10 +75,11 @@ def _run(self, batch_sizes): np.random.random([bs, 16 * 1024 * 1024]).astype(self.dtype_) for bs in batch_sizes ] - inputs = [[ - grpcclient.InferInput('INPUT0', [bs, 16 * 1024 * 1024], "FP32") - ] for bs in batch_sizes] - output = [grpcclient.InferRequestedOutput('OUTPUT0')] + inputs = [ + [grpcclient.InferInput("INPUT0", [bs, 16 * 1024 * 1024], "FP32")] + for bs in batch_sizes + ] + output = [grpcclient.InferRequestedOutput("OUTPUT0")] for idx in range(len(inputs)): inputs[idx][0].set_data_from_numpy(input_data[idx]) @@ -89,11 +95,12 @@ def callback(user_data, idx, result, error): before_compute_input_duration = self._batch_input_duration(batch_size) for idx in range(len(batch_sizes)): - self.client_.async_infer(model_name=self.model_name_, - inputs=inputs[idx], - callback=functools.partial( - callback, user_data, idx), - outputs=output) + self.client_.async_infer( + model_name=self.model_name_, + inputs=inputs[idx], + callback=functools.partial(callback, user_data, idx), + outputs=output, + ) # Wait until the results are available in user_data time_out = 20 @@ -108,19 +115,24 @@ def callback(user_data, idx, result, error): time_out = time_out - 1 time.sleep(1) done_cnt = functools.reduce( - lambda dc, x: dc + 1 if x is not None else dc, user_data, 0) + lambda dc, x: dc + 1 if x is not None else dc, user_data, 0 + ) self.assertEqual( - done_cnt, len(batch_sizes), - "expected {} responses, got {}".format(len(batch_sizes), done_cnt)) + done_cnt, + len(batch_sizes), + "expected {} responses, got {}".format(len(batch_sizes), done_cnt), + ) for idx in range(len(batch_sizes)): res = user_data[idx] self.assertFalse( type(res) == InferenceServerException, - "expected response for request {}, got exception {}".format( - idx, res)) - output_data = res.as_numpy('OUTPUT0') - self.assertTrue(np.array_equal(output_data, input_data[idx]), - "Mismatched output data for request {}".format(idx)) + "expected response for request {}, got exception {}".format(idx, res), + ) + output_data = res.as_numpy("OUTPUT0") + self.assertTrue( + 
np.array_equal(output_data, input_data[idx]), + "Mismatched output data for request {}".format(idx), + ) after_compute_input_duration = self._batch_input_duration(batch_size) return after_compute_input_duration - before_compute_input_duration @@ -135,13 +147,17 @@ def test_performance(self): # The following check is loose, local runs show that the speedup is not # significant (~15%), may be due to the dispatch overhead - # which cancels part of the improvment + # which cancels part of the improvement self.assertTrue( serialized_time > parallelized_time, - "Expected parallelized copy is faster than serialized copy") - print("serialized v.s. parallelized : {} v.s. {}".format( - serialized_time, parallelized_time)) + "Expected parallelized copy is faster than serialized copy", + ) + print( + "serialized v.s. parallelized : {} v.s. {}".format( + serialized_time, parallelized_time + ) + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_parameters/model_repository/parameter/1/model.py b/qa/L0_parameters/model_repository/parameter/1/model.py old mode 100644 new mode 100755 index 70388d6c40..458d5467c8 --- a/qa/L0_parameters/model_repository/parameter/1/model.py +++ b/qa/L0_parameters/model_repository/parameter/1/model.py @@ -1,4 +1,6 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -24,39 +26,34 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import triton_python_backend_utils as pb_utils -import numpy as np import json +import numpy as np +import triton_python_backend_utils as pb_utils -class TritonPythonModel: +class TritonPythonModel: @staticmethod def auto_complete_config(auto_complete_model_config): - inputs = [{'name': 'INPUT0', 'data_type': 'TYPE_FP32', 'dims': [1]}] - outputs = [{ - 'name': 'key', - 'data_type': 'TYPE_STRING', - 'dims': [-1] - }, { - 'name': 'value', - 'data_type': 'TYPE_STRING', - 'dims': [-1] - }] + inputs = [{"name": "INPUT0", "data_type": "TYPE_FP32", "dims": [1]}] + outputs = [ + {"name": "key", "data_type": "TYPE_STRING", "dims": [-1]}, + {"name": "value", "data_type": "TYPE_STRING", "dims": [-1]}, + ] config = auto_complete_model_config.as_dict() input_names = [] output_names = [] - for input in config['input']: - input_names.append(input['name']) - for output in config['output']: - output_names.append(output['name']) + for input in config["input"]: + input_names.append(input["name"]) + for output in config["output"]: + output_names.append(output["name"]) for input in inputs: - if input['name'] not in input_names: + if input["name"] not in input_names: auto_complete_model_config.add_input(input) for output in outputs: - if output['name'] not in output_names: + if output["name"] not in output_names: auto_complete_model_config.add_output(output) auto_complete_model_config.set_max_batch_size(0) @@ -73,10 +70,10 @@ def execute(self, requests): keys.append(key) values.append(value) key_output = pb_utils.Tensor("key", np.asarray(keys, dtype=object)) - value_output = pb_utils.Tensor("value", - np.asarray(values, dtype=object)) + value_output = pb_utils.Tensor("value", np.asarray(values, dtype=object)) inference_response = pb_utils.InferenceResponse( - 
output_tensors=[key_output, value_output]) + output_tensors=[key_output, value_output] + ) responses.append(inference_response) return responses diff --git a/qa/L0_parameters/parameters_test.py b/qa/L0_parameters/parameters_test.py old mode 100644 new mode 100755 index 5cbc2c7586..0a2f142e34 --- a/qa/L0_parameters/parameters_test.py +++ b/qa/L0_parameters/parameters_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,50 +30,49 @@ sys.path.append("../common") +import os +import queue +import unittest +from functools import partial +from unittest import IsolatedAsyncioTestCase + import numpy as np -import tritonclient.http as httpclient import tritonclient.grpc as grpcclient -import tritonclient.http.aio as asynchttpclient import tritonclient.grpc.aio as asyncgrpcclient +import tritonclient.http as httpclient +import tritonclient.http.aio as asynchttpclient from tritonclient.utils import InferenceServerException -from unittest import IsolatedAsyncioTestCase -import unittest -import queue -from functools import partial -import os -TEST_HEADER = os.environ.get('TEST_HEADER') +TEST_HEADER = os.environ.get("TEST_HEADER") -class InferenceParametersTest(IsolatedAsyncioTestCase): +class InferenceParametersTest(IsolatedAsyncioTestCase): async def asyncSetUp(self): - self.http = httpclient.InferenceServerClient(url='localhost:8000') - self.async_http = asynchttpclient.InferenceServerClient( - url='localhost:8000') - self.grpc = grpcclient.InferenceServerClient(url='localhost:8001') - self.async_grpc = asyncgrpcclient.InferenceServerClient( - url='localhost:8001') + self.http = httpclient.InferenceServerClient(url="localhost:8000") + self.async_http = asynchttpclient.InferenceServerClient(url="localhost:8000") + self.grpc = grpcclient.InferenceServerClient(url="localhost:8001") + self.async_grpc = asyncgrpcclient.InferenceServerClient(url="localhost:8001") self.parameter_list = [] - self.parameter_list.append({'key1': 'value1', 'key2': 'value2'}) - self.parameter_list.append({'key1': 1, 'key2': 2}) - self.parameter_list.append({'key1': True, 'key2': 'value2'}) - self.parameter_list.append({'triton_': True, 'key2': 'value2'}) + self.parameter_list.append({"key1": "value1", "key2": "value2"}) + self.parameter_list.append({"key1": 1, "key2": 2}) + self.parameter_list.append({"key1": True, "key2": "value2"}) + self.parameter_list.append({"triton_": True, "key2": "value2"}) if TEST_HEADER == "1": self.headers = { - 'header_1': 'value_1', - 'header_2': 'value_2', - 'my_header_1': 'my_value_1', - 'my_header_2': 'my_value_2', - 'my_header_3': 'This is a "quoted" string with a backslash\ ' + "header_1": "value_1", + "header_2": "value_2", + "my_header_1": "my_value_1", + "my_header_2": "my_value_2", + "my_header_3": 'This is a "quoted" string with a backslash\ ', } # only these headers should be forwarded to the model. 
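For orientation, a minimal sketch of the client-side call the tests below exercise: an inference request carrying custom parameters and headers. The server address and sample parameter values are assumptions; the model name "parameter" is the test fixture used here.

import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")
inputs = [httpclient.InferInput("INPUT0", [1], "FP32")]
inputs[0].set_data_from_numpy(np.asarray([1], dtype=np.float32))

# Keys prefixed with "triton_" are reserved by the server and rejected,
# which is what the should_error path in the tests checks.
result = client.infer(
    model_name="parameter",
    inputs=inputs,
    parameters={"key1": "value1", "key2": 2},
    headers={"my_header_1": "my_value_1"},
)
print(result.as_numpy("key"), result.as_numpy("value"))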
self.expected_headers = { - 'my_header_1': 'my_value_1', - 'my_header_2': 'my_value_2', - 'my_header_3': 'This is a "quoted" string with a backslash\ ' + "my_header_1": "my_value_1", + "my_header_2": "my_value_2", + "my_header_3": 'This is a "quoted" string with a backslash\ ', } else: self.headers = {} @@ -87,60 +88,63 @@ def callback(user_data, result, error): def create_inputs(self, client_type): inputs = [] - inputs.append(client_type.InferInput('INPUT0', [1], "FP32")) + inputs.append(client_type.InferInput("INPUT0", [1], "FP32")) # Initialize the data inputs[0].set_data_from_numpy(np.asarray([1], dtype=np.float32)) return inputs - async def send_request_and_verify(self, - client_type, - client, - is_async=False): - + async def send_request_and_verify(self, client_type, client, is_async=False): inputs = self.create_inputs(client_type) for parameters in self.parameter_list: # The `triton_` prefix is reserved for Triton usage should_error = False - if 'triton_' in parameters.keys(): + if "triton_" in parameters.keys(): should_error = True if is_async: if should_error: with self.assertRaises(InferenceServerException): - result = await client.infer(model_name='parameter', - inputs=inputs, - parameters=parameters, - headers=self.headers) + result = await client.infer( + model_name="parameter", + inputs=inputs, + parameters=parameters, + headers=self.headers, + ) return else: - result = await client.infer(model_name='parameter', - inputs=inputs, - parameters=parameters, - headers=self.headers) + result = await client.infer( + model_name="parameter", + inputs=inputs, + parameters=parameters, + headers=self.headers, + ) else: if should_error: with self.assertRaises(InferenceServerException): - result = client.infer(model_name='parameter', - inputs=inputs, - parameters=parameters, - headers=self.headers) + result = client.infer( + model_name="parameter", + inputs=inputs, + parameters=parameters, + headers=self.headers, + ) return else: - result = client.infer(model_name='parameter', - inputs=inputs, - parameters=parameters, - headers=self.headers) + result = client.infer( + model_name="parameter", + inputs=inputs, + parameters=parameters, + headers=self.headers, + ) self.verify_outputs(result, parameters) def verify_outputs(self, result, parameters): - keys = result.as_numpy('key') - values = result.as_numpy('value') + keys = result.as_numpy("key") + values = result.as_numpy("value") keys = keys.astype(str).tolist() - expected_keys = list(parameters.keys()) + list( - self.expected_headers.keys()) + expected_keys = list(parameters.keys()) + list(self.expected_headers.keys()) self.assertEqual(set(keys), set(expected_keys)) # We have to convert the parameter values to string @@ -158,24 +162,26 @@ async def test_http_parameter(self): await self.send_request_and_verify(httpclient, self.http) async def test_async_http_parameter(self): - await self.send_request_and_verify(asynchttpclient, - self.async_http, - is_async=True) + await self.send_request_and_verify( + asynchttpclient, self.async_http, is_async=True + ) async def test_async_grpc_parameter(self): - await self.send_request_and_verify(asyncgrpcclient, - self.async_grpc, - is_async=True) + await self.send_request_and_verify( + asyncgrpcclient, self.async_grpc, is_async=True + ) def test_http_async_parameter(self): inputs = self.create_inputs(httpclient) # Skip the parameter that returns an error parameter_list = self.parameter_list[:-1] for parameters in parameter_list: - result = self.http.async_infer(model_name='parameter', - inputs=inputs, - 
parameters=parameters, - headers=self.headers).get_result() + result = self.http.async_infer( + model_name="parameter", + inputs=inputs, + parameters=parameters, + headers=self.headers, + ).get_result() self.verify_outputs(result, parameters) def test_grpc_async_parameter(self): @@ -184,28 +190,30 @@ def test_grpc_async_parameter(self): # Skip the parameter that returns an error parameter_list = self.parameter_list[:-1] for parameters in parameter_list: - self.grpc.async_infer(model_name='parameter', - inputs=inputs, - parameters=parameters, - headers=self.headers, - callback=partial(self.grpc_callback, - user_data)) + self.grpc.async_infer( + model_name="parameter", + inputs=inputs, + parameters=parameters, + headers=self.headers, + callback=partial(self.grpc_callback, user_data), + ) result = user_data.get() self.assertFalse(result is InferenceServerException) self.verify_outputs(result, parameters) def test_grpc_stream_parameter(self): user_data = queue.Queue() - self.grpc.start_stream(callback=partial(self.grpc_callback, user_data), - headers=self.headers) + self.grpc.start_stream( + callback=partial(self.grpc_callback, user_data), headers=self.headers + ) inputs = self.create_inputs(grpcclient) # Skip the parameter that returns an error parameter_list = self.parameter_list[:-1] for parameters in parameter_list: # async stream infer - self.grpc.async_stream_infer(model_name='parameter', - inputs=inputs, - parameters=parameters) + self.grpc.async_stream_infer( + model_name="parameter", inputs=inputs, parameters=parameters + ) result = user_data.get() self.assertFalse(result is InferenceServerException) self.verify_outputs(result, parameters) @@ -218,5 +226,5 @@ async def asyncTearDown(self): await self.async_http.close() -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_parameters/test.sh b/qa/L0_parameters/test.sh old mode 100644 new mode 100755 index 48513cb61a..4c8ac00931 --- a/qa/L0_parameters/test.sh +++ b/qa/L0_parameters/test.sh @@ -50,7 +50,7 @@ source ../common/util.sh RET=0 for i in {0..1}; do - + # TEST_HEADER is a parameter used by `parameters_test.py` that controls # whether the script will test for inclusion of headers in parameters or not. if [ $i == 1 ]; then @@ -64,7 +64,7 @@ for i in {0..1}; do cat $SERVER_LOG exit 1 fi - + set +e TEST_HEADER=$i python3 $TEST_SCRIPT_PY >$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then @@ -72,9 +72,9 @@ for i in {0..1}; do echo -e "\n***\n*** Test Failed\n***" RET=1 fi - + set -e - + kill $SERVER_PID wait $SERVER_PID done diff --git a/qa/L0_passive_instance/models/distributed_int32_int32_int32/config.pbtxt b/qa/L0_passive_instance/models/distributed_int32_int32_int32/config.pbtxt old mode 100755 new mode 100644 diff --git a/qa/L0_passive_instance/passive_instance_test.py b/qa/L0_passive_instance/passive_instance_test.py old mode 100644 new mode 100755 index b96055b0b3..d7cdfffa7b --- a/qa/L0_passive_instance/passive_instance_test.py +++ b/qa/L0_passive_instance/passive_instance_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
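A compact sketch of the gRPC streaming pattern used in test_grpc_stream_parameter above, where a callback pushes each response (or error) into a queue and the caller blocks on queue.get(). The address and model name come from the tests; everything else is illustrative:

import queue
from functools import partial

import numpy as np
import tritonclient.grpc as grpcclient


def callback(user_data, result, error):
    # Store either the result or the error; the consumer sorts it out.
    user_data.put(error if error is not None else result)


client = grpcclient.InferenceServerClient(url="localhost:8001")
user_data = queue.Queue()
client.start_stream(callback=partial(callback, user_data))

inputs = [grpcclient.InferInput("INPUT0", [1], "FP32")]
inputs[0].set_data_from_numpy(np.asarray([1], dtype=np.float32))
client.async_stream_infer(model_name="parameter", inputs=inputs)

result = user_data.get()  # blocks until the streamed response arrives
client.stop_stream()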
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,20 +31,21 @@ sys.path.append("../common") import unittest -import numpy as np + import infer_util as iu +import numpy as np import test_util as tu class PassiveInstanceTest(tu.TestResultCollector): - def test_inference(self): try: - iu.infer_exact(self, "distributed", (1, 16), 1, np.int32, np.int32, - np.int32) + iu.infer_exact( + self, "distributed", (1, 16), 1, np.int32, np.int32, np.int32 + ) except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_passive_instance/test.sh b/qa/L0_passive_instance/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_perf_analyzer/test.sh b/qa/L0_perf_analyzer/test.sh index 8a6d0cfff4..42b80f009f 100755 --- a/qa/L0_perf_analyzer/test.sh +++ b/qa/L0_perf_analyzer/test.sh @@ -409,7 +409,7 @@ for PROTOCOL in grpc http; do RET=1 fi set -e - + # Binary search for concurrency range mode and make sure it doesn't hang $PERF_ANALYZER -v -a --request-distribution "poisson" --shared-memory none \ --percentile 99 --binary-search --concurrency-range 1:8:2 -l 5 \ @@ -809,8 +809,8 @@ set -e # Test with optional inputs missing and invalid set +e -OPTIONAL_INPUT_ERROR_STRING="For batch sizes larger than 1, the same set of -inputs must be specified for each batch. You cannot use different set of +OPTIONAL_INPUT_ERROR_STRING="For batch sizes larger than 1, the same set of +inputs must be specified for each batch. You cannot use different set of optional inputs for each individual batch." $PERF_ANALYZER -v -m optional -b 2 --measurement-mode "count_windows" \ --input-data=${INT_OPTIONAL_JSONDATAFILE} -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 @@ -854,7 +854,7 @@ if [ $(cat $CLIENT_LOG | grep "Request Rate: 40" | wc -l) -eq 0 ]; then fi set -e -# Test --serial-sequences mode +# Test --serial-sequences mode set +e $PERF_ANALYZER -v -i $PROTOCOL -m simple_savedmodel_sequence_object -p 1000 --request-rate-range 100:200:50 --serial-sequences \ --input-data=$SEQ_JSONDATAFILE -s ${STABILITY_THRESHOLD} >$CLIENT_LOG 2>&1 @@ -880,7 +880,7 @@ if [ $(cat $CLIENT_LOG | grep "${ERROR_STRING}" | wc -l) -ne 0 ]; then cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" RET=1 -fi +fi set -e ## Test perf_analyzer with MPI / multiple models @@ -984,23 +984,23 @@ wait $SERVER_PID # Generate valid CA openssl genrsa -passout pass:1234 -des3 -out ca.key 4096 -openssl req -passin pass:1234 -new -x509 -days 365 -key ca.key -out ca.crt -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Test/CN=Root CA" +openssl req -passin pass:1234 -new -x509 -days 365 -key ca.key -out ca.crt -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Test/CN=Root CA" # Generate valid Server Key/Cert openssl genrsa -passout pass:1234 -des3 -out server.key 4096 -openssl req -passin pass:1234 -new -key server.key -out server.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Server/CN=localhost" -openssl x509 -req -passin pass:1234 -days 365 -in server.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out server.crt +openssl req -passin pass:1234 -new -key server.key -out server.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Server/CN=localhost" +openssl x509 -req -passin pass:1234 -days 365 -in server.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out server.crt # Remove passphrase from the Server Key -openssl rsa -passin pass:1234 -in server.key -out server.key +openssl 
rsa -passin pass:1234 -in server.key -out server.key # Generate valid Client Key/Cert openssl genrsa -passout pass:1234 -des3 -out client.key 4096 -openssl req -passin pass:1234 -new -key client.key -out client.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Client/CN=localhost" -openssl x509 -passin pass:1234 -req -days 365 -in client.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out client.crt +openssl req -passin pass:1234 -new -key client.key -out client.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Client/CN=localhost" +openssl x509 -passin pass:1234 -req -days 365 -in client.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out client.crt # Remove passphrase from Client Key -openssl rsa -passin pass:1234 -in client.key -out client.key +openssl rsa -passin pass:1234 -in client.key -out client.key # Create mutated client key (Make first char of each like capital) cp client.key client2.key && sed -i "s/\b\(.\)/\u\1/g" client2.key diff --git a/qa/L0_perf_analyzer_doc_links/test.sh b/qa/L0_perf_analyzer_doc_links/test.sh old mode 100644 new mode 100755 index 52e3e76e12..ec6eeef057 --- a/qa/L0_perf_analyzer_doc_links/test.sh +++ b/qa/L0_perf_analyzer_doc_links/test.sh @@ -1,3 +1,4 @@ +#!/bin/bash # Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -33,14 +34,14 @@ python3 -m pip install mkdocs python3 -m pip install mkdocs-htmlproofer-plugin==0.10.3 #Download perf_analyzer docs -TRITON_CLIENT_REPO_TAG="${TRITON_CLIENT_REPO_TAG:=main}" +TRITON_CLIENT_REPO_TAG="${TRITON_CLIENT_REPO_TAG:=main}" git clone -b ${TRITON_CLIENT_REPO_TAG} https://github.com/triton-inference-server/client.git cp `pwd`/client/src/c++/perf_analyzer/README.md . cp -rf `pwd`/client/src/c++/perf_analyzer/docs . -# Need to remove all links that start with -- or -. Mkdocs converts all -- to - for anchor links. -# This breaks all links to cli commands throughout the docs. This will iterate over all -# files in the docs directory and remove -- and - at the start of options, which allows the +# Need to remove all links that start with -- or -. Mkdocs converts all -- to - for anchor links. +# This breaks all links to cli commands throughout the docs. This will iterate over all +# files in the docs directory and remove -- and - at the start of options, which allows the # tool to check links for correctness. 
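Purely as an illustration of the cleanup those comments describe; the exact substitution the script performs is not shown here, so the path and the regex below are assumptions:

import re
from pathlib import Path

for path in Path("docs").glob("**/*.md"):
    text = path.read_text()
    # e.g. an anchor like "#--concurrency-range" becomes "#concurrency-range"
    cleaned = re.sub(r"#-{1,2}(?=\w)", "#", text)
    path.write_text(cleaned)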
for file in `pwd`/docs/* do diff --git a/qa/L0_perf_analyzer_ground_truth/test.sh b/qa/L0_perf_analyzer_ground_truth/test.sh index f01d1a0ec2..d5d78e63f4 100755 --- a/qa/L0_perf_analyzer_ground_truth/test.sh +++ b/qa/L0_perf_analyzer_ground_truth/test.sh @@ -92,7 +92,7 @@ function check_grpc_time { done } -# Create input_data.json to communicate the requested model delay +# Create input_data.json to communicate the requested model delay # $1: desired model delay function create_input_data { echo "{\"data\":[{\"INPUT0\" : [${1}]}]}" > input_data.json @@ -134,7 +134,7 @@ TOLERANCE="0.05" for model_delay in ${MODEL_DELAYS[@]}; do create_input_data ${model_delay} - EXPECTED_RESULT=$(python3 -c "print(1 / ${model_delay})") + EXPECTED_RESULT=$(python3 -c "print(1 / ${model_delay})") for protocol in ${PROTOCOLS}; do for model in ${MODELS}; do echo "================================================================" diff --git a/qa/L0_perf_analyzer_report/test.sh b/qa/L0_perf_analyzer_report/test.sh index c6f3d210f1..7a04905842 100755 --- a/qa/L0_perf_analyzer_report/test.sh +++ b/qa/L0_perf_analyzer_report/test.sh @@ -125,7 +125,7 @@ done sed -i "s/${COMPOSING_MODEL}/${COMPOSING_MODEL_CACHE_ENABLED}/g" "${MODEL_DIR}/${ENSEMBLE_MODEL_CACHE_ENABLED}/config.pbtxt" sed -i "s/${COMPOSING_MODEL}/${COMPOSING_MODEL_CACHE_DISABLED}/g" "${MODEL_DIR}/${ENSEMBLE_MODEL_CACHE_DISABLED}/config.pbtxt" -## Append cache config to each model config +## Append cache config to each model config echo -e "response_cache { enable: True }" >> "${MODEL_DIR}/${ENSEMBLE_MODEL_CACHE_ENABLED}/config.pbtxt" echo -e "response_cache { enable: False }" >> "${MODEL_DIR}/${ENSEMBLE_MODEL_CACHE_DISABLED}/config.pbtxt" echo -e "response_cache { enable: True }" >> "${MODEL_DIR}/${COMPOSING_MODEL_CACHE_ENABLED}/config.pbtxt" diff --git a/qa/L0_perf_kaldi/create_data.sh b/qa/L0_perf_kaldi/create_data.sh old mode 100644 new mode 100755 index 68b32a4099..849b56d906 --- a/qa/L0_perf_kaldi/create_data.sh +++ b/qa/L0_perf_kaldi/create_data.sh @@ -25,7 +25,7 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# Needs to be run in asr_kaldi main directory and must be copied to +# Needs to be run in asr_kaldi main directory and must be copied to # draco for benchmark test TRITON_VERSION="20.05" diff --git a/qa/L0_perf_kaldi/test.sh b/qa/L0_perf_kaldi/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_perf_nomodel/run_test.sh b/qa/L0_perf_nomodel/run_test.sh index 1efe38cc97..b1e2702ecb 100755 --- a/qa/L0_perf_nomodel/run_test.sh +++ b/qa/L0_perf_nomodel/run_test.sh @@ -83,7 +83,7 @@ PERF_CLIENT_PERCENTILE_ARGS="" && PERF_CLIENT_PERCENTILE_ARGS="--percentile=${PERF_CLIENT_PERCENTILE}" PERF_CLIENT_EXTRA_ARGS="$PERF_CLIENT_PERCENTILE_ARGS --shared-memory ${SHARED_MEMORY}" -# Overload use of PERF_CLIENT_PROTOCOL for convenience with existing test and +# Overload use of PERF_CLIENT_PROTOCOL for convenience with existing test and # reporting structure, though "triton_c_api" is not strictly a "protocol". if [[ "${PERF_CLIENT_PROTOCOL}" == "triton_c_api" ]]; then # Server will be run in-process with C API diff --git a/qa/L0_perf_pyclients/simple_perf_client.py b/qa/L0_perf_pyclients/simple_perf_client.py old mode 100644 new mode 100755 index f73f774c27..fd02f94887 --- a/qa/L0_perf_pyclients/simple_perf_client.py +++ b/qa/L0_perf_pyclients/simple_perf_client.py @@ -26,14 +26,13 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
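The ground-truth check above reduces to simple arithmetic: a model that delays each inference by model_delay seconds should sustain roughly 1/model_delay inferences per second, within the 5% TOLERANCE set in the script. A worked instance with made-up numbers:

model_delay = 0.05           # seconds per inference (assumed example value)
measured_throughput = 19.6   # infer/sec reported by perf_analyzer (assumed)

expected_result = 1.0 / model_delay   # 20 infer/sec, as the script computes
tolerance = 0.05                      # TOLERANCE="0.05" above
assert abs(measured_throughput - expected_result) <= expected_result * tolerance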
import argparse -import numpy as np import sys import time +import numpy as np import tritonclient.grpc as grpcclient import tritonclient.http as httpclient -from tritonclient.utils import triton_to_np_dtype -from tritonclient.utils import InferenceServerException +from tritonclient.utils import InferenceServerException, triton_to_np_dtype FLAGS = None @@ -44,47 +43,59 @@ def parse_model_grpc(model_metadata, model_config): by this client. """ if len(model_metadata.inputs) != 1: - raise Exception("expecting 1 input, got {}".format( - len(model_metadata.inputs))) + raise Exception("expecting 1 input, got {}".format(len(model_metadata.inputs))) if len(model_metadata.outputs) != 1: - raise Exception("expecting 1 output, got {}".format( - len(model_metadata.outputs))) + raise Exception( + "expecting 1 output, got {}".format(len(model_metadata.outputs)) + ) if len(model_config.input) != 1: raise Exception( "expecting 1 input in model configuration, got {}".format( - len(model_config.input))) + len(model_config.input) + ) + ) input_metadata = model_metadata.inputs[0] output_metadata = model_metadata.outputs[0] - batch_dim = (model_config.max_batch_size > 0) + batch_dim = model_config.max_batch_size > 0 expected_dims = 1 + (1 if batch_dim else 0) if len(input_metadata.shape) != expected_dims: raise Exception( - "expecting input to have {} dimensions, model '{}' input has {}". - format(expected_dims, model_metadata.name, - len(input_metadata.shape))) + "expecting input to have {} dimensions, model '{}' input has {}".format( + expected_dims, model_metadata.name, len(input_metadata.shape) + ) + ) if len(output_metadata.shape) != expected_dims: raise Exception( - "expecting output to have {} dimensions, model '{}' output has {}". - format(expected_dims, model_metadata.name, - len(output_metadata.shape))) + "expecting output to have {} dimensions, model '{}' output has {}".format( + expected_dims, model_metadata.name, len(output_metadata.shape) + ) + ) if input_metadata.shape[-1] != -1: raise Exception( - "expecting input to have variable shape [-1], model '{}' input has {}" - .format(model_metadata.name, input_metadata.shape)) + "expecting input to have variable shape [-1], model '{}' input has {}".format( + model_metadata.name, input_metadata.shape + ) + ) if output_metadata.shape[-1] != -1: raise Exception( - "expecting output to have variable shape [-1], model '{}' output has {}" - .format(model_metadata.name, output_metadata.shape)) + "expecting output to have variable shape [-1], model '{}' output has {}".format( + model_metadata.name, output_metadata.shape + ) + ) - return (model_config.max_batch_size, input_metadata.name, - output_metadata.name, input_metadata.datatype) + return ( + model_config.max_batch_size, + input_metadata.name, + output_metadata.name, + input_metadata.datatype, + ) def parse_model_http(model_metadata, model_config): @@ -92,151 +103,176 @@ def parse_model_http(model_metadata, model_config): Check the configuration of a model to make sure it is supported by this client. 
""" - if len(model_metadata['inputs']) != 1: - raise Exception("expecting 1 input, got {}".format( - len(model_metadata['inputs']))) - if len(model_metadata['outputs']) != 1: - raise Exception("expecting 1 output, got {}".format( - len(model_metadata['outputs']))) - - if len(model_config['input']) != 1: + if len(model_metadata["inputs"]) != 1: + raise Exception( + "expecting 1 input, got {}".format(len(model_metadata["inputs"])) + ) + if len(model_metadata["outputs"]) != 1: + raise Exception( + "expecting 1 output, got {}".format(len(model_metadata["outputs"])) + ) + + if len(model_config["input"]) != 1: raise Exception( "expecting 1 input in model configuration, got {}".format( - len(model_config['input']))) + len(model_config["input"]) + ) + ) - input_metadata = model_metadata['inputs'][0] - output_metadata = model_metadata['outputs'][0] + input_metadata = model_metadata["inputs"][0] + output_metadata = model_metadata["outputs"][0] max_batch_size = 0 - if 'max_batch_size' in model_config: - max_batch_size = model_config['max_batch_size'] + if "max_batch_size" in model_config: + max_batch_size = model_config["max_batch_size"] - batch_dim = (max_batch_size > 0) + batch_dim = max_batch_size > 0 expected_dims = 1 + (1 if batch_dim else 0) - if len(input_metadata['shape']) != expected_dims: + if len(input_metadata["shape"]) != expected_dims: raise Exception( - "expecting input to have {} dimensions, model '{}' input has {}". - format(expected_dims, model_metadata.name, - len(input_metadata['shape']))) + "expecting input to have {} dimensions, model '{}' input has {}".format( + expected_dims, model_metadata.name, len(input_metadata["shape"]) + ) + ) - if len(output_metadata['shape']) != expected_dims: + if len(output_metadata["shape"]) != expected_dims: raise Exception( - "expecting output to have {} dimensions, model '{}' output has {}". 
- format(expected_dims, model_metadata.name, - len(output_metadata['shape']))) + "expecting output to have {} dimensions, model '{}' output has {}".format( + expected_dims, model_metadata.name, len(output_metadata["shape"]) + ) + ) - if input_metadata['shape'][-1] != -1: + if input_metadata["shape"][-1] != -1: raise Exception( - "expecting input to have variable shape [-1], model '{}' input has {}" - .format(model_metadata.name, input_metadata['shape'])) + "expecting input to have variable shape [-1], model '{}' input has {}".format( + model_metadata.name, input_metadata["shape"] + ) + ) - if output_metadata['shape'][-1] != -1: + if output_metadata["shape"][-1] != -1: raise Exception( - "expecting output to have variable shape [-1], model '{}' output has {}" - .format(model_metadata.name, output_metadata['shape'])) + "expecting output to have variable shape [-1], model '{}' output has {}".format( + model_metadata.name, output_metadata["shape"] + ) + ) - return (max_batch_size, input_metadata['name'], output_metadata['name'], - input_metadata['datatype']) + return ( + max_batch_size, + input_metadata["name"], + output_metadata["name"], + input_metadata["datatype"], + ) def requestGenerator(input_name, input_data, output_name, dtype, protocol): - # Set the input data inputs = [] if protocol.lower() == "grpc": - inputs.append(grpcclient.InferInput(input_name, input_data.shape, - dtype)) + inputs.append(grpcclient.InferInput(input_name, input_data.shape, dtype)) inputs[0].set_data_from_numpy(input_data) else: - inputs.append(httpclient.InferInput(input_name, input_data.shape, - dtype)) + inputs.append(httpclient.InferInput(input_name, input_data.shape, dtype)) inputs[0].set_data_from_numpy(input_data, binary_data=True) outputs = [] if protocol.lower() == "grpc": outputs.append(grpcclient.InferRequestedOutput(output_name)) else: - outputs.append( - httpclient.InferRequestedOutput(output_name, binary_data=True)) + outputs.append(httpclient.InferRequestedOutput(output_name, binary_data=True)) return inputs, outputs -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-m', - '--model-name', - type=str, - required=True, - help='Name of model') parser.add_argument( - '-x', - '--model-version', + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-m", "--model-name", type=str, required=True, help="Name of model" + ) + parser.add_argument( + "-x", + "--model-version", type=str, required=False, default="", - help='Version of model. Default is to use latest version.') - parser.add_argument('-b', - '--batch-size', - type=int, - required=False, - default=1, - help='Batch size. Default is 1.') - parser.add_argument('-s', - '--shape', - type=int, - required=False, - default=1, - help='The shape of the tensor. Default is 1.') - parser.add_argument('-u', - '--url', - type=str, - required=False, - default='localhost:8000', - help='Inference server URL. Default is localhost:8000.') - parser.add_argument('-i', - '--protocol', - type=str, - required=False, - default='HTTP', - help='Protocol (HTTP/gRPC) used to communicate with ' + - 'the inference service. Default is HTTP.') - parser.add_argument('-c', - '--iteration_count', - type=int, - required=False, - default=1000, - help='The number of iterations. 
Default is 1000.') + help="Version of model. Default is to use latest version.", + ) parser.add_argument( - '-w', - '--warmup_count', + "-b", + "--batch-size", + type=int, + required=False, + default=1, + help="Batch size. Default is 1.", + ) + parser.add_argument( + "-s", + "--shape", + type=int, + required=False, + default=1, + help="The shape of the tensor. Default is 1.", + ) + parser.add_argument( + "-u", + "--url", + type=str, + required=False, + default="localhost:8000", + help="Inference server URL. Default is localhost:8000.", + ) + parser.add_argument( + "-i", + "--protocol", + type=str, + required=False, + default="HTTP", + help="Protocol (HTTP/gRPC) used to communicate with " + + "the inference service. Default is HTTP.", + ) + parser.add_argument( + "-c", + "--iteration_count", + type=int, + required=False, + default=1000, + help="The number of iterations. Default is 1000.", + ) + parser.add_argument( + "-w", + "--warmup_count", type=int, required=False, default=500, - help='The number of warm-up iterations. Default is 500.') + help="The number of warm-up iterations. Default is 500.", + ) parser.add_argument( - '--csv', + "--csv", type=str, required=False, default=None, - help='The name of the file to store the results in CSV format') + help="The name of the file to store the results in CSV format", + ) FLAGS = parser.parse_args() try: if FLAGS.protocol.lower() == "grpc": # Create gRPC client for communicating with the server triton_client = grpcclient.InferenceServerClient( - url=FLAGS.url, verbose=FLAGS.verbose) + url=FLAGS.url, verbose=FLAGS.verbose + ) else: triton_client = httpclient.InferenceServerClient( - url=FLAGS.url, verbose=FLAGS.verbose, concurrency=1) + url=FLAGS.url, verbose=FLAGS.verbose, concurrency=1 + ) except Exception as e: print("client creation failed: " + str(e)) sys.exit(1) @@ -245,7 +281,8 @@ def requestGenerator(input_name, input_data, output_name, dtype, protocol): # properties of the model that we need for preprocessing try: model_metadata = triton_client.get_model_metadata( - model_name=FLAGS.model_name, model_version=FLAGS.model_version) + model_name=FLAGS.model_name, model_version=FLAGS.model_version + ) except InferenceServerException as e: print("failed to retrieve the metadata: " + str(e)) sys.exit(1) @@ -254,36 +291,41 @@ def requestGenerator(input_name, input_data, output_name, dtype, protocol): # properties of the model that we need for preprocessing try: model_metadata = triton_client.get_model_metadata( - model_name=FLAGS.model_name, model_version=FLAGS.model_version) + model_name=FLAGS.model_name, model_version=FLAGS.model_version + ) except InferenceServerException as e: print("failed to retrieve the metadata: " + str(e)) sys.exit(1) try: model_config = triton_client.get_model_config( - model_name=FLAGS.model_name, model_version=FLAGS.model_version) + model_name=FLAGS.model_name, model_version=FLAGS.model_version + ) except InferenceServerException as e: print("failed to retrieve the config: " + str(e)) sys.exit(1) if FLAGS.protocol.lower() == "grpc": max_batch_size, input_name, output_name, dtype = parse_model_grpc( - model_metadata, model_config.config) + model_metadata, model_config.config + ) else: max_batch_size, input_name, output_name, dtype = parse_model_http( - model_metadata, model_config) + model_metadata, model_config + ) - input_data = np.zeros([FLAGS.batch_size, FLAGS.shape], - dtype=triton_to_np_dtype(dtype)) + input_data = np.zeros( + [FLAGS.batch_size, FLAGS.shape], dtype=triton_to_np_dtype(dtype) + ) # 
--------------------------- Warm-Up -------------------------------------------------------- for i in range(FLAGS.warmup_count): - inputs, outputs = requestGenerator(input_name, input_data, output_name, - dtype, FLAGS.protocol.lower()) - triton_client.infer(FLAGS.model_name, - inputs, - model_version=FLAGS.model_version, - outputs=outputs) + inputs, outputs = requestGenerator( + input_name, input_data, output_name, dtype, FLAGS.protocol.lower() + ) + triton_client.infer( + FLAGS.model_name, inputs, model_version=FLAGS.model_version, outputs=outputs + ) latencies = [] @@ -293,12 +335,12 @@ def requestGenerator(input_name, input_data, output_name, dtype, protocol): for i in range(FLAGS.iteration_count): t0 = time.time() - inputs, outputs = requestGenerator(input_name, input_data, output_name, - dtype, FLAGS.protocol.lower()) - triton_client.infer(FLAGS.model_name, - inputs, - model_version=FLAGS.model_version, - outputs=outputs) + inputs, outputs = requestGenerator( + input_name, input_data, output_name, dtype, FLAGS.protocol.lower() + ) + triton_client.infer( + FLAGS.model_name, inputs, model_version=FLAGS.model_version, outputs=outputs + ) latencies.append(time.time() - t0) end_time = time.time() @@ -321,12 +363,17 @@ def requestGenerator(input_name, input_data, output_name, dtype, protocol): # --------------------------- Write CSV -------------------------------------------------------- if FLAGS.csv != None: - file = open(FLAGS.csv, 'w') + file = open(FLAGS.csv, "w") file.write( "Concurrency,Inferences/Second,p50 latency,p90 latency,p95 latency,p99 latency\n" ) - file.write("1,{},{},{},{},{}".format(throughput, p50_latency * 1000, - p90_latency * 1000, - p95_latency * 1000, - p99_latency * 1000)) + file.write( + "1,{},{},{},{},{}".format( + throughput, + p50_latency * 1000, + p90_latency * 1000, + p95_latency * 1000, + p99_latency * 1000, + ) + ) file.close() diff --git a/qa/L0_perf_resnet/run_test.sh b/qa/L0_perf_resnet/run_test.sh index bbd9b33c42..579d00c0e5 100755 --- a/qa/L0_perf_resnet/run_test.sh +++ b/qa/L0_perf_resnet/run_test.sh @@ -63,7 +63,7 @@ if [ "$ARCH" == "aarch64" ]; then fi fi -# Overload use of PERF_CLIENT_PROTOCOL for convenience with existing test and +# Overload use of PERF_CLIENT_PROTOCOL for convenience with existing test and # reporting structure, though "triton_c_api" is not strictly a "protocol". 
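A condensed sketch of the measurement logic in simple_perf_client.py above: per-request wall-clock latencies are collected in a loop, then reduced to throughput plus latency percentiles. The script's own percentile selection is not reproduced here, so np.percentile stands in for it:

import time

import numpy as np


def measure(run_once, iteration_count=1000):
    latencies = []
    start = time.time()
    for _ in range(iteration_count):
        t0 = time.time()
        run_once()                      # one blocking infer call
        latencies.append(time.time() - t0)
    elapsed = time.time() - start

    throughput = iteration_count / elapsed  # infer/sec
    p50, p90, p95, p99 = np.percentile(latencies, [50, 90, 95, 99])
    # Report latencies in milliseconds, as the CSV above does.
    return throughput, p50 * 1000, p90 * 1000, p95 * 1000, p99 * 1000


# Example with a dummy workload standing in for triton_client.infer(...):
print(measure(lambda: time.sleep(0.001), iteration_count=100))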
if [[ "${PERF_CLIENT_PROTOCOL}" == "triton_c_api" ]]; then # Server will be run in-process with C API diff --git a/qa/L0_query/query_e2e.py b/qa/L0_query/query_e2e.py old mode 100644 new mode 100755 index 9e301002a1..048a4a8d41 --- a/qa/L0_query/query_e2e.py +++ b/qa/L0_query/query_e2e.py @@ -27,23 +27,23 @@ import sys -sys.path.append('../common') +sys.path.append("../common") + +import unittest import numpy as np -import tritonclient.http as tritonhttpclient +import test_util as tu import tritonclient.grpc as tritongrpcclient +import tritonclient.http as tritonhttpclient from tritonclient.utils import InferenceServerException from tritonclient.utils import cuda_shared_memory as cudashm -import unittest -import test_util as tu class QueryTest(tu.TestResultCollector): - def test_http(self): triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") inputs = [] - inputs.append(tritonhttpclient.InferInput('INPUT', [1], "UINT8")) + inputs.append(tritonhttpclient.InferInput("INPUT", [1], "UINT8")) inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) try: @@ -56,33 +56,33 @@ def test_http(self): def test_http_shared_memory(self): triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") inputs = [] - inputs.append(tritonhttpclient.InferInput('INPUT', [1], "UINT8")) + inputs.append(tritonhttpclient.InferInput("INPUT", [1], "UINT8")) inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) # Set up CUDA shared memory for outputs triton_client.unregister_system_shared_memory() triton_client.unregister_cuda_shared_memory() - shm_op0_handle = cudashm.create_shared_memory_region( - "output0_data", 4, 0) - shm_op1_handle = cudashm.create_shared_memory_region( - "output1_data", 4, 0) + shm_op0_handle = cudashm.create_shared_memory_region("output0_data", 4, 0) + shm_op1_handle = cudashm.create_shared_memory_region("output1_data", 4, 0) triton_client.register_cuda_shared_memory( - "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 4) + "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 4 + ) triton_client.register_cuda_shared_memory( - "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 4) + "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 4 + ) outputs = [] outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) outputs[-1].set_shared_memory("output0_data", 4) outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) outputs[-1].set_shared_memory("output1_data", 4) try: - triton_client.infer(model_name="query", - inputs=inputs, - outputs=outputs) + triton_client.infer(model_name="query", inputs=inputs, outputs=outputs) self.assertTrue(False, "expect error with query information") except InferenceServerException as ex: self.assertTrue("OUTPUT0 GPU 0" in ex.message()) @@ -96,34 +96,34 @@ def test_http_shared_memory(self): def test_http_out_of_shared_memory(self): triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") inputs = [] - inputs.append(tritonhttpclient.InferInput('INPUT', [1], "UINT8")) + inputs.append(tritonhttpclient.InferInput("INPUT", [1], "UINT8")) inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) # Set up too small CUDA shared memory for outputs, expect query # returns default value triton_client.unregister_system_shared_memory() triton_client.unregister_cuda_shared_memory() - shm_op0_handle 
= cudashm.create_shared_memory_region( - "output0_data", 1, 0) - shm_op1_handle = cudashm.create_shared_memory_region( - "output1_data", 1, 0) + shm_op0_handle = cudashm.create_shared_memory_region("output0_data", 1, 0) + shm_op1_handle = cudashm.create_shared_memory_region("output1_data", 1, 0) triton_client.register_cuda_shared_memory( - "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1) + "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1 + ) triton_client.register_cuda_shared_memory( - "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1) + "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1 + ) outputs = [] outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) outputs[-1].set_shared_memory("output0_data", 1) outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) outputs[-1].set_shared_memory("output1_data", 1) try: - triton_client.infer(model_name="query", - inputs=inputs, - outputs=outputs) + triton_client.infer(model_name="query", inputs=inputs, outputs=outputs) self.assertTrue(False, "expect error with query information") except InferenceServerException as ex: self.assertTrue("OUTPUT0 CPU 0" in ex.message()) @@ -137,7 +137,7 @@ def test_http_out_of_shared_memory(self): def test_grpc(self): triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") inputs = [] - inputs.append(tritongrpcclient.InferInput('INPUT', [1], "UINT8")) + inputs.append(tritongrpcclient.InferInput("INPUT", [1], "UINT8")) inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) try: @@ -150,31 +150,29 @@ def test_grpc(self): def test_grpc_shared_memory(self): triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") inputs = [] - inputs.append(tritongrpcclient.InferInput('INPUT', [1], "UINT8")) + inputs.append(tritongrpcclient.InferInput("INPUT", [1], "UINT8")) inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) # Set up CUDA shared memory for outputs triton_client.unregister_system_shared_memory() triton_client.unregister_cuda_shared_memory() - shm_op0_handle = cudashm.create_shared_memory_region( - "output0_data", 4, 0) - shm_op1_handle = cudashm.create_shared_memory_region( - "output1_data", 4, 0) + shm_op0_handle = cudashm.create_shared_memory_region("output0_data", 4, 0) + shm_op1_handle = cudashm.create_shared_memory_region("output1_data", 4, 0) triton_client.register_cuda_shared_memory( - "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 4) + "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 4 + ) triton_client.register_cuda_shared_memory( - "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 4) + "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 4 + ) outputs = [] - outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0')) + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0")) outputs[-1].set_shared_memory("output0_data", 4) - outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1')) + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1")) outputs[-1].set_shared_memory("output1_data", 4) try: - triton_client.infer(model_name="query", - inputs=inputs, - outputs=outputs) + triton_client.infer(model_name="query", inputs=inputs, outputs=outputs) self.assertTrue(False, "expect error with query information") except InferenceServerException as ex: 
self.assertTrue("OUTPUT0 GPU 0" in ex.message()) @@ -188,32 +186,30 @@ def test_grpc_shared_memory(self): def test_grpc_out_of_shared_memory(self): triton_client = tritongrpcclient.InferenceServerClient("localhost:8001") inputs = [] - inputs.append(tritongrpcclient.InferInput('INPUT', [1], "UINT8")) + inputs.append(tritongrpcclient.InferInput("INPUT", [1], "UINT8")) inputs[0].set_data_from_numpy(np.arange(1, dtype=np.uint8)) # Set up too small CUDA shared memory for outputs, expect query # returns default value triton_client.unregister_system_shared_memory() triton_client.unregister_cuda_shared_memory() - shm_op0_handle = cudashm.create_shared_memory_region( - "output0_data", 1, 0) - shm_op1_handle = cudashm.create_shared_memory_region( - "output1_data", 1, 0) + shm_op0_handle = cudashm.create_shared_memory_region("output0_data", 1, 0) + shm_op1_handle = cudashm.create_shared_memory_region("output1_data", 1, 0) triton_client.register_cuda_shared_memory( - "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1) + "output0_data", cudashm.get_raw_handle(shm_op0_handle), 0, 1 + ) triton_client.register_cuda_shared_memory( - "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1) + "output1_data", cudashm.get_raw_handle(shm_op1_handle), 0, 1 + ) outputs = [] - outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT0')) + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT0")) outputs[-1].set_shared_memory("output0_data", 1) - outputs.append(tritongrpcclient.InferRequestedOutput('OUTPUT1')) + outputs.append(tritongrpcclient.InferRequestedOutput("OUTPUT1")) outputs[-1].set_shared_memory("output1_data", 1) try: - triton_client.infer(model_name="query", - inputs=inputs, - outputs=outputs) + triton_client.infer(model_name="query", inputs=inputs, outputs=outputs) self.assertTrue(False, "expect error with query information") except InferenceServerException as ex: self.assertTrue("OUTPUT0 CPU 0" in ex.message()) @@ -225,5 +221,5 @@ def test_grpc_out_of_shared_memory(self): triton_client.unregister_cuda_shared_memory() -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_query/test.sh b/qa/L0_query/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_rate_limiter/rate_limiter_test.py b/qa/L0_rate_limiter/rate_limiter_test.py old mode 100644 new mode 100755 index c02c50b61e..4bc7b82e70 --- a/qa/L0_rate_limiter/rate_limiter_test.py +++ b/qa/L0_rate_limiter/rate_limiter_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,11 +31,12 @@ sys.path.append("../common") import functools -import numpy as np import os -import unittest import threading import time +import unittest + +import numpy as np import sequence_util as su import tritongrpcclient as grpcclient from tritonclientutils import * @@ -46,7 +49,6 @@ class AsyncGrpcRunner: - def __init__(self, tester, server_url, model_name, delay_ms): self._tester = tester self._server_url = server_url @@ -79,18 +81,17 @@ def req_loop(self): client = grpcclient.InferenceServerClient(self._server_url) inputs = [ - grpcclient.InferInput("INPUT0", self._shape, - np_to_triton_dtype(self._dtype)) + grpcclient.InferInput( + "INPUT0", self._shape, np_to_triton_dtype(self._dtype) + ) ] self._inflight_requests = 0 - start_stat = client.get_inference_statistics( - model_name=self._model_name) + start_stat = client.get_inference_statistics(model_name=self._model_name) global _exit_signal while not _exit_signal: - input_numpy = np.random.random_sample(self._shape).astype( - self._dtype) + input_numpy = np.random.random_sample(self._shape).astype(self._dtype) inputs[0].set_data_from_numpy(input_numpy) self._input_data.append(input_numpy) @@ -99,12 +100,15 @@ def req_loop(self): def _check_can_send(): return self._inflight_requests < _inference_concurrency - can_send = self._sync.wait_for(_check_can_send, - timeout=_response_wait_time_s) + can_send = self._sync.wait_for( + _check_can_send, timeout=_response_wait_time_s + ) self._tester.assertTrue( can_send, "client didn't receive a response within {}s".format( - _response_wait_time_s)) + _response_wait_time_s + ), + ) callback = functools.partial(AsyncGrpcRunner._on_result, self) client.async_infer( @@ -115,7 +119,7 @@ def _check_can_send(): ) self._inflight_requests += 1 self._num_sent_request += 1 - if (self._num_sent_request == _inference_count): + if self._num_sent_request == _inference_count: _exit_signal = True time.sleep(self._delay_ms / 1000.0) @@ -125,17 +129,21 @@ def _check_can_send(): def _all_processed(): return self._inflight_requests == 0 - self._processed_all = self._sync.wait_for(_all_processed, - _finish_wait_time_s) + self._processed_all = self._sync.wait_for( + _all_processed, _finish_wait_time_s + ) self._tester.assertTrue( self._processed_all, - "the processing didn't complete even after waiting for {}s". 
- format(_finish_wait_time_s)) + "the processing didn't complete even after waiting for {}s".format( + _finish_wait_time_s + ), + ) end_stat = client.get_inference_statistics(model_name=self._model_name) - self._processed_request_count = end_stat.model_stats[ - 0].inference_stats.success.count - start_stat.model_stats[ - 0].inference_stats.success.count + self._processed_request_count = ( + end_stat.model_stats[0].inference_stats.success.count + - start_stat.model_stats[0].inference_stats.success.count + ) def start(self): self._req_thread.start() @@ -144,13 +152,15 @@ def _validate_run(self): if len(self._errors) != 0: raise self._errors[0] self._tester.assertEqual( - len(self._input_data), len(self._results.keys()), - "the number of inputs and output should match") + len(self._input_data), + len(self._results.keys()), + "the number of inputs and output should match", + ) for i in range(len(self._input_data)): self._tester.assertFalse( - (self._input_data[i] != - self._results[i].as_numpy('OUTPUT0')).any(), - "the output data should match with the input data") + (self._input_data[i] != self._results[i].as_numpy("OUTPUT0")).any(), + "the output data should match with the input data", + ) def join(self): self._req_thread.join() @@ -158,17 +168,16 @@ def join(self): class RateLimiterTest(su.SequenceBatcherTestUtil): - def stress_models(self, model_names, delay_ms=0): infer_counts = {} try: runners = [] for model_name in model_names: runners.append( - AsyncGrpcRunner(self, - "localhost:8001", - model_name, - delay_ms=delay_ms)) + AsyncGrpcRunner( + self, "localhost:8001", model_name, delay_ms=delay_ms + ) + ) for r in runners: r.start() for r in runners: @@ -191,7 +200,7 @@ def test_single_model(self): def test_cross_model_prioritization_limited_resource(self): # Sends requests to two models, one operating at # priority of 1 and other at 2 respectively. - # The availabe resource counts doesn't allow models + # The available resource counts doesn't allow models # to execute simultaneously. model_names = ["custom_zero_1_float32", "custom_zero_1_float32_v2"] @@ -199,32 +208,36 @@ def test_cross_model_prioritization_limited_resource(self): # TODO: Validate the priority and resource counts are set correctly infer_counts = self.stress_models(model_names) - infer_ratio = infer_counts[model_names[0]] / float( - infer_counts[model_names[1]]) + infer_ratio = infer_counts[model_names[0]] / float(infer_counts[model_names[1]]) self.assertGreater( - infer_ratio, 1.80, + infer_ratio, + 1.80, "Got infer ratio across models {}, expected closer to 2".format( - infer_ratio)) + infer_ratio + ), + ) def test_cross_model_prioritization_plenty_resource(self): # Sends requests to two models, one operating at # priority of 1 and other at 2 respectively. - # The availabe resource counts wll allow both models - # to run simulataneously. + # The available resource counts wll allow both models + # to run simultaneously. model_names = ["custom_zero_1_float32", "custom_zero_1_float32_v2"] # TODO: Validate the priority and resource counts are set correctly infer_counts = self.stress_models(model_names) - infer_diff = abs(infer_counts[model_names[0]] - - infer_counts[model_names[1]]) + infer_diff = abs(infer_counts[model_names[0]] - infer_counts[model_names[1]]) self.assertGreater( - 10, infer_diff, - "Got infer difference between models {}, expected closer to 0". 
- format(infer_diff)) + 10, + infer_diff, + "Got infer difference between models {}, expected closer to 0".format( + infer_diff + ), + ) def test_single_model_dynamic_batching(self): # Send all the inference requests with a delay to a model @@ -242,18 +255,25 @@ def test_single_model_dynamic_batching(self): batch_stats = stats.model_stats[0].batch_stats self.assertEqual( - len(batch_stats), 1, - "expected single batch-size, got {}".format(len(batch_stats))) + len(batch_stats), + 1, + "expected single batch-size, got {}".format(len(batch_stats)), + ) for batch_stat in batch_stats: self.assertEqual( - batch_stat.batch_size, 4, - "unexpected batch-size {}".format(batch_stat.batch_size)) + batch_stat.batch_size, + 4, + "unexpected batch-size {}".format(batch_stat.batch_size), + ) # Get count from one of the stats self.assertEqual( - batch_stat.compute_infer.count, _inference_count / 4, - "expected model-execution-count {} for batch size {}, got {}". - format(_inference_count / 4, 4, batch_stat.compute_infer.count)) + batch_stat.compute_infer.count, + _inference_count / 4, + "expected model-execution-count {} for batch size {}, got {}".format( + _inference_count / 4, 4, batch_stat.compute_infer.count + ), + ) def test_single_model_sequence_batching(self): # Send one sequence and check for correct accumulator @@ -265,19 +285,26 @@ def test_single_model_sequence_batching(self): model_name = "custom_sequence_int32" self.assertNotIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) self.check_sequence( - 'custom', + "custom", model_name, np.int32, 5, (4000, None), # (flag_str, value, (ls_ms, gt_ms), (pre_delay, post_delay)) - (("start", 1, None, None), (None, 2, None, None), - (None, 3, None, None), (None, 4, None, None), - (None, 5, None, None), (None, 6, None, None), - (None, 7, None, None), (None, 8, None, None), - ("end", 9, None, None)), + ( + ("start", 1, None, None), + (None, 2, None, None), + (None, 3, None, None), + (None, 4, None, None), + (None, 5, None, None), + (None, 6, None, None), + (None, 7, None, None), + (None, 8, None, None), + ("end", 9, None, None), + ), 45, - 'grpc') + "grpc", + ) self.check_deferred_exception() self.check_status(model_name, {1: 9}, 9, 9) @@ -285,5 +312,5 @@ def test_single_model_sequence_batching(self): self.assertTrue(False, "unexpected error {}".format(ex)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_rate_limiter/test.sh b/qa/L0_rate_limiter/test.sh old mode 100644 new mode 100755 index 0de1553908..334af99e4c --- a/qa/L0_rate_limiter/test.sh +++ b/qa/L0_rate_limiter/test.sh @@ -278,7 +278,7 @@ kill $SERVER_PID wait $SERVER_PID ## -## Tests with mulitple instances of the same model +## Tests with multiple instances of the same model ## # Replace the second model with a second instance with same resource requirements and priority. # TODO: Currently there is no way to check which instance got to run inferences hence we only diff --git a/qa/L0_register/test.sh b/qa/L0_register/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_repoagent_checksum/identity_test.py b/qa/L0_repoagent_checksum/identity_test.py old mode 100644 new mode 100755 index ad9f268967..4db55e0d45 --- a/qa/L0_repoagent_checksum/identity_test.py +++ b/qa/L0_repoagent_checksum/identity_test.py @@ -27,40 +27,43 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
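The AsyncGrpcRunner above throttles itself with a condition variable: the request loop blocks once the configured number of requests is in flight, and each gRPC callback wakes it up. A stripped-down sketch of that pattern, with a placeholder limit and no actual client calls:

import threading


class InflightLimiter:
    def __init__(self, max_inflight):
        self._max = max_inflight
        self._inflight = 0
        self._cv = threading.Condition()

    def acquire(self, timeout=10):
        # Block until there is room for another outstanding request.
        with self._cv:
            ok = self._cv.wait_for(lambda: self._inflight < self._max, timeout)
            if not ok:
                raise TimeoutError("no response within {}s".format(timeout))
            self._inflight += 1

    def release(self):
        # Called from the async callback when a response (or error) arrives.
        with self._cv:
            self._inflight -= 1
            self._cv.notify()


limiter = InflightLimiter(max_inflight=4)
limiter.acquire()   # before client.async_infer(...)
limiter.release()   # inside the completion callback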
import argparse -import numpy as np import sys + +import numpy as np import tritongrpcclient as grpcclient import tritonhttpclient as httpclient from tritonclientutils import np_to_triton_dtype FLAGS = None -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-u', - '--url', - type=str, - required=False, - help='Inference server URL.') parser.add_argument( - '-i', - '--protocol', + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-u", "--url", type=str, required=False, help="Inference server URL." + ) + parser.add_argument( + "-i", + "--protocol", type=str, required=False, - default='http', - help='Protocol ("http"/"grpc") used to ' + - 'communicate with inference service. Default is "http".') + default="http", + help='Protocol ("http"/"grpc") used to ' + + 'communicate with inference service. Default is "http".', + ) FLAGS = parser.parse_args() if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"): - print("unexpected protocol \"{}\", expects \"http\" or \"grpc\"".format( - FLAGS.protocol)) + print( + 'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol) + ) exit(1) client_util = httpclient if FLAGS.protocol == "http" else grpcclient @@ -69,23 +72,23 @@ FLAGS.url = "localhost:8000" if FLAGS.protocol == "http" else "localhost:8001" # Reuse a single client for all sync tests - with client_util.InferenceServerClient(FLAGS.url, - verbose=FLAGS.verbose) as client: + with client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose) as client: for model_name, np_dtype, shape in ( - # yapf: disable + # yapf: disable ("identity_int32", np.int32, [0]), - ("identity_int32", np.int32, [7])): + ("identity_int32", np.int32, [7]) + ): # yapf: enable if np_dtype != object: input_data = (16384 * np.random.randn(*shape)).astype(np_dtype) else: - in0 = (16384 * np.ones(shape, dtype='int')) - in0n = np.array([str(x) for x in in0.reshape(in0.size)], - dtype=object) + in0 = 16384 * np.ones(shape, dtype="int") + in0n = np.array([str(x) for x in in0.reshape(in0.size)], dtype=object) input_data = in0n.reshape(in0.shape) inputs = [ - client_util.InferInput("INPUT0", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + client_util.InferInput( + "INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) @@ -102,6 +105,9 @@ output_data = np.char.decode(output_data) if not np.array_equal(output_data, input_data): - print("error: expected output {} to match input {}".format( - output_data, input_data)) + print( + "error: expected output {} to match input {}".format( + output_data, input_data + ) + ) sys.exit(1) diff --git a/qa/L0_response_cache/test.sh b/qa/L0_response_cache/test.sh index 8ec610b065..434195b693 100755 --- a/qa/L0_response_cache/test.sh +++ b/qa/L0_response_cache/test.sh @@ -71,8 +71,8 @@ function stop_redis() { } function set_redis_auth() { - # NOTE: Per-user auth [Access Control List (ACL)] is only supported in - # Redis >= 6.0 and is more comprehensive in what can be configured. + # NOTE: Per-user auth [Access Control List (ACL)] is only supported in + # Redis >= 6.0 and is more comprehensive in what can be configured. 
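A rough Python equivalent of the auth helpers above, shelling out to redis-cli to toggle the server-wide password the comment mentions. The environment variable names mirror the script; the unset step and the fallback values are assumptions:

import os
import subprocess

host = os.environ.get("TRITON_REDIS_HOST", "localhost")
port = os.environ.get("TRITON_REDIS_PORT", "6379")
password = os.environ.get("REDIS_PW", "redis-pass")  # placeholder default


def set_redis_auth():
    # Mirrors: redis-cli -h HOST -p PORT config set requirepass PW
    subprocess.run(
        ["redis-cli", "-h", host, "-p", port,
         "config", "set", "requirepass", password],
        check=True,
    )


def unset_redis_auth():
    # Once auth is enabled, later commands must authenticate first.
    subprocess.run(
        ["redis-cli", "-h", host, "-p", port, "-a", password,
         "config", "set", "requirepass", ""],
        check=True,
    )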
# For simplicity and wider range of Redis version support, use # server-wide password via "requirepass" for now. redis-cli -h "${TRITON_REDIS_HOST}" -p "${TRITON_REDIS_PORT}" config set requirepass "${REDIS_PW}" @@ -88,7 +88,7 @@ function unset_redis_auth() { # UNIT TESTS set +e -## Unit tests currently run for both Local and Redis cache implementaitons +## Unit tests currently run for both Local and Redis cache implementations ## by default. However, we could break out the unit tests for each ## into separate runs gtest filters if needed in the future: ## - `${UNIT_TEST} --gtest_filter=*Local*` @@ -130,7 +130,7 @@ function check_server_expected_failure { else # Check that server fails with the correct error message set +e - grep -i "${EXPECTED_MESSAGE}" ${SERVER_LOG} + grep -i "${EXPECTED_MESSAGE}" ${SERVER_LOG} if [ $? -ne 0 ]; then echo -e "\n***\n*** Failed: Expected [${EXPECTED_MESSAGE}] error message in output\n***" cat $SERVER_LOG diff --git a/qa/L0_sagemaker/sagemaker_multi_model_test.py b/qa/L0_sagemaker/sagemaker_multi_model_test.py old mode 100644 new mode 100755 index 06cd48edd7..b2052f6751 --- a/qa/L0_sagemaker/sagemaker_multi_model_test.py +++ b/qa/L0_sagemaker/sagemaker_multi_model_test.py @@ -29,58 +29,80 @@ sys.path.append("../common") +import json import os +import sys import time import unittest + import numpy as np +import requests import test_util as tu import tritonclient.http as httpclient -import json -import os -import requests -import sys - class SageMakerMultiModelTest(tu.TestResultCollector): - def setUp(self): - SAGEMAKER_BIND_TO_PORT = os.getenv("SAGEMAKER_BIND_TO_PORT", "8080") - self.url_mme_ = "http://localhost:{}/models".format( - SAGEMAKER_BIND_TO_PORT) + self.url_mme_ = "http://localhost:{}/models".format(SAGEMAKER_BIND_TO_PORT) # model_1 setup self.model1_name = "sm_mme_model_1" self.model1_url = "/opt/ml/models/123456789abcdefghi/model" - self.model1_input_data_ = [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - ] + self.model1_input_data_ = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] self.model1_expected_output0_data_ = [ - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + 0, + 2, + 4, + 6, + 8, + 10, + 12, + 14, + 16, + 18, + 20, + 22, + 24, + 26, + 28, + 30, ] self.model1_expected_output1_data_ = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, ] self.model1_expected_result_ = { - "model_name": - "sm_mme_model_1", - "model_version": - "1", + "model_name": "sm_mme_model_1", + "model_version": "1", "outputs": [ { "name": "OUTPUT0", "datatype": "INT32", "shape": [1, 16], - "data": self.model1_expected_output0_data_ + "data": self.model1_expected_output0_data_, }, { "name": "OUTPUT1", "datatype": "INT32", "shape": [1, 16], - "data": self.model1_expected_output1_data_ + "data": self.model1_expected_output1_data_, }, ], } @@ -97,42 +119,45 @@ def setUp(self): self.model3_url = "/opt/ml/models/123456789ensemble/model" def test_sm_0_environment_variables_set(self): - self.assertEqual(os.getenv("SAGEMAKER_MULTI_MODEL"), "true", - "Variable SAGEMAKER_MULTI_MODEL must be set to true") + self.assertEqual( + os.getenv("SAGEMAKER_MULTI_MODEL"), + "true", + "Variable SAGEMAKER_MULTI_MODEL must be set to true", + ) def test_sm_1_model_load(self): # Load model_1 request_body = {"model_name": self.model1_name, "url": self.model1_url} headers = {"Content-Type": "application/json"} - r = requests.post(self.url_mme_, - data=json.dumps(request_body), - 
headers=headers) + r = requests.post(self.url_mme_, data=json.dumps(request_body), headers=headers) time.sleep(5) # wait for model to load self.assertEqual( - r.status_code, 200, - "Expected status code 200, received {}".format(r.status_code)) + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) # Load the same model again, expect a 409 request_body = {"model_name": self.model1_name, "url": self.model1_url} headers = {"Content-Type": "application/json"} - r = requests.post(self.url_mme_, - data=json.dumps(request_body), - headers=headers) + r = requests.post(self.url_mme_, data=json.dumps(request_body), headers=headers) time.sleep(5) # wait for model to load self.assertEqual( - r.status_code, 409, - "Expected status code 409, received {}".format(r.status_code)) + r.status_code, + 409, + "Expected status code 409, received {}".format(r.status_code), + ) # Load model_2 request_body = {"model_name": self.model2_name, "url": self.model2_url} headers = {"Content-Type": "application/json"} - r = requests.post(self.url_mme_, - data=json.dumps(request_body), - headers=headers) + r = requests.post(self.url_mme_, data=json.dumps(request_body), headers=headers) time.sleep(5) # wait for model to load self.assertEqual( - r.status_code, 200, - "Expected status code 200, received {}".format(r.status_code)) + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) def test_sm_2_model_list(self): r = requests.get(self.url_mme_) @@ -141,11 +166,11 @@ def test_sm_2_model_list(self): "models": [ { "modelName": self.model1_name, - "modelUrl": self.model1_url.rstrip("/model") + "modelUrl": self.model1_url.rstrip("/model"), }, { "modelName": self.model2_name, - "modelUrl": self.model2_url.rstrip("/model") + "modelUrl": self.model2_url.rstrip("/model"), }, ] } @@ -153,11 +178,11 @@ def test_sm_2_model_list(self): "models": [ { "modelName": self.model2_name, - "modelUrl": self.model2_url.rstrip("/model") + "modelUrl": self.model2_url.rstrip("/model"), }, { "modelName": self.model1_name, - "modelUrl": self.model1_url.rstrip("/model") + "modelUrl": self.model1_url.rstrip("/model"), }, ] } @@ -167,7 +192,8 @@ def test_sm_2_model_list(self): r.json(), [expected_response_1, expected_response_2], "Expected one of {}, received: {}".format( - [expected_response_1, expected_response_2], r.json()), + [expected_response_1, expected_response_2], r.json() + ), ) def test_sm_3_model_get(self): @@ -176,12 +202,13 @@ def test_sm_3_model_get(self): time.sleep(3) expected_response = { "modelName": self.model1_name, - "modelUrl": self.model1_url.rstrip("/model") + "modelUrl": self.model1_url.rstrip("/model"), } self.assertEqual( - r.json(), expected_response, - "Expected response: {}, received: {}".format( - expected_response, r.json())) + r.json(), + expected_response, + "Expected response: {}, received: {}".format(expected_response, r.json()), + ) def test_sm_4_model_invoke(self): # Invoke model_1 @@ -196,12 +223,11 @@ def test_sm_4_model_invoke(self): inputs[0].set_data_from_numpy(input_data, binary_data=False) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) request_body, _ = 
httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + inputs, outputs=outputs + ) headers = {"Content-Type": "application/json"} invoke_url = "{}/{}/invoke".format(self.url_mme_, self.model1_name) @@ -212,47 +238,56 @@ def test_sm_4_model_invoke(self): self.model1_expected_result_, r.json(), "Expected response : {}, received: {}".format( - self.model1_expected_result_, r.json()), + self.model1_expected_result_, r.json() + ), ) # Invoke model_2 inputs = [] outputs = [] - inputs.append(httpclient.InferInput( - "INPUT0", - [1, 8], - "FP32", - )) + inputs.append( + httpclient.InferInput( + "INPUT0", + [1, 8], + "FP32", + ) + ) input_data = np.array(self.model2_input_data_, dtype=np.float32) input_data = np.expand_dims(input_data, axis=0) inputs[0].set_data_from_numpy(input_data, binary_data=True) - outputs.append( - httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) invoke_url = "{}/{}/invoke".format(self.url_mme_, self.model2_name) headers = { - "Content-Type": - "application/vnd.sagemaker-triton.binary+json;json-header-size={}" - .format(header_length) + "Content-Type": "application/vnd.sagemaker-triton.binary+json;json-header-size={}".format( + header_length + ) } r = requests.post(invoke_url, data=request_body, headers=headers) - header_length_prefix = "application/vnd.sagemaker-triton.binary+json;json-header-size=" - header_length_str = r.headers["Content-Type"][len(header_length_prefix - ):] + header_length_prefix = ( + "application/vnd.sagemaker-triton.binary+json;json-header-size=" + ) + header_length_str = r.headers["Content-Type"][len(header_length_prefix) :] result = httpclient.InferenceServerClient.parse_response_body( - r._content, header_length=int(header_length_str)) + r._content, header_length=int(header_length_str) + ) # Get the inference header size so we can locate the output binary data output_data = result.as_numpy("OUTPUT0") for i in range(8): - self.assertEqual(output_data[0][i], input_data[0][i], - "Tensor Value Mismatch") + self.assertEqual( + output_data[0][i], input_data[0][i], "Tensor Value Mismatch" + ) def test_sm_5_model_unload(self): # Unload model_1 @@ -260,39 +295,45 @@ def test_sm_5_model_unload(self): r = requests.delete(unload_url) time.sleep(3) self.assertEqual( - r.status_code, 200, - "Expected status code 200, received {}".format(r.status_code)) + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) # Unload model_2 unload_url = "{}/{}".format(self.url_mme_, self.model2_name) r = requests.delete(unload_url) time.sleep(3) self.assertEqual( - r.status_code, 200, - "Expected status code 200, received {}".format(r.status_code)) + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) # Unload a non-loaded model, expect a 404 unload_url = "{}/sm_non_loaded_model".format(self.url_mme_) r = requests.delete(unload_url) time.sleep(3) self.assertEqual( - r.status_code, 404, - "Expected status code 404, received {}".format(r.status_code)) + r.status_code, + 404, + "Expected status code 404, received {}".format(r.status_code), + ) def test_sm_6_ensemble_model(self): # Load ensemble model request_body = {"model_name": self.model3_name, 
"url": self.model3_url} headers = { "Content-Type": "application/json", - "X-Amzn-SageMaker-Target-Model": f"{self.model3_name}" + "X-Amzn-SageMaker-Target-Model": f"{self.model3_name}", } - r = requests.post(self.url_mme_, - data=json.dumps(request_body), - headers=headers) + r = requests.post(self.url_mme_, data=json.dumps(request_body), headers=headers) time.sleep(5) # wait for model to load self.assertEqual( - r.status_code, 200, - "Expected status code 200, received {}".format(r.status_code)) + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) # Invoke ensemble model inputs = [] @@ -306,12 +347,11 @@ def test_sm_6_ensemble_model(self): inputs[0].set_data_from_numpy(input_data, binary_data=False) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) request_body, _ = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + inputs, outputs=outputs + ) headers = {"Content-Type": "application/json"} invoke_url = "{}/{}/invoke".format(self.url_mme_, self.model3_name) @@ -319,16 +359,20 @@ def test_sm_6_ensemble_model(self): print(f"response: {r.text}") r.raise_for_status() self.assertEqual( - r.status_code, 200, - "Expected status code 200, received {}".format(r.status_code)) + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) # Unload ensemble model unload_url = "{}/{}".format(self.url_mme_, self.model3_name) r = requests.delete(unload_url, headers=headers) time.sleep(5) self.assertEqual( - r.status_code, 200, - "Expected status code 200, received {}".format(r.status_code)) + r.status_code, + 200, + "Expected status code 200, received {}".format(r.status_code), + ) if __name__ == "__main__": diff --git a/qa/L0_sagemaker/sagemaker_test.py b/qa/L0_sagemaker/sagemaker_test.py old mode 100644 new mode 100755 index 3989e4aa93..6e76a9f0fd --- a/qa/L0_sagemaker/sagemaker_test.py +++ b/qa/L0_sagemaker/sagemaker_test.py @@ -29,80 +29,95 @@ sys.path.append("../common") +import json import os +import sys import unittest + import numpy as np +import requests import test_util as tu import tritonclient.http as httpclient -import json -import os -import requests -import sys - class SageMakerTest(tu.TestResultCollector): - def setUp(self): - SAGEMAKER_BIND_TO_PORT = os.getenv('SAGEMAKER_BIND_TO_PORT', '8080') - self.url_ = "http://localhost:{}/invocations".format( - SAGEMAKER_BIND_TO_PORT) - self.input_data_ = [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - ] + SAGEMAKER_BIND_TO_PORT = os.getenv("SAGEMAKER_BIND_TO_PORT", "8080") + self.url_ = "http://localhost:{}/invocations".format(SAGEMAKER_BIND_TO_PORT) + self.input_data_ = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] self.expected_output0_data_ = [ - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 - ] - self.expected_output1_data_ = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + 0, + 2, + 4, + 6, + 8, + 10, + 12, + 14, + 16, + 18, + 20, + 22, + 24, + 26, + 28, + 30, ] + self.expected_output1_data_ = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] self.expected_result_ = { - "model_name": - "sm_model", - "model_version": - "1", - "outputs": [{ - "name": "OUTPUT0", - "datatype": 
"INT32", - "shape": [1, 16], - "data": self.expected_output0_data_ - }, { - "name": "OUTPUT1", - "datatype": "INT32", - "shape": [1, 16], - "data": self.expected_output1_data_ - }] + "model_name": "sm_model", + "model_version": "1", + "outputs": [ + { + "name": "OUTPUT0", + "datatype": "INT32", + "shape": [1, 16], + "data": self.expected_output0_data_, + }, + { + "name": "OUTPUT1", + "datatype": "INT32", + "shape": [1, 16], + "data": self.expected_output1_data_, + }, + ], } def test_direct_inference(self): request = { - "inputs": [{ - "name": "INPUT0", - "datatype": "INT32", - "shape": [1, 16], - "data": self.input_data_ - }, { - "name": "INPUT1", - "datatype": "INT32", - "shape": [1, 16], - "data": self.input_data_ - }] + "inputs": [ + { + "name": "INPUT0", + "datatype": "INT32", + "shape": [1, 16], + "data": self.input_data_, + }, + { + "name": "INPUT1", + "datatype": "INT32", + "shape": [1, 16], + "data": self.input_data_, + }, + ] } - headers = {'Content-Type': 'application/json'} + headers = {"Content-Type": "application/json"} r = requests.post(self.url_, data=json.dumps(request), headers=headers) r.raise_for_status() self.assertEqual( - self.expected_result_, r.json(), + self.expected_result_, + r.json(), "Expected response body: {}; got: {}".format( - self.expected_result_, r.json())) + self.expected_result_, r.json() + ), + ) def test_inference_client_generated_request(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -110,27 +125,29 @@ def test_inference_client_generated_request(self): inputs[0].set_data_from_numpy(input_data, binary_data=False) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) request_body, _ = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + inputs, outputs=outputs + ) - headers = {'Content-Type': 'application/json'} + headers = {"Content-Type": "application/json"} r = requests.post(self.url_, data=request_body, headers=headers) r.raise_for_status() self.assertEqual( - self.expected_result_, r.json(), + self.expected_result_, + r.json(), "Expected response body: {}; got: {}".format( - self.expected_result_, r.json())) + self.expected_result_, r.json() + ), + ) def test_inference_client_generated_request_binary(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -138,31 +155,36 @@ def test_inference_client_generated_request_binary(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - 
outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/vnd.sagemaker-triton.binary+json;json-header-size={}' - .format(header_length) + "Content-Type": "application/vnd.sagemaker-triton.binary+json;json-header-size={}".format( + header_length + ) } r = requests.post(self.url_, data=request_body, headers=headers) r.raise_for_status() self.assertEqual( - self.expected_result_, r.json(), + self.expected_result_, + r.json(), "Expected response body: {}; got: {}".format( - self.expected_result_, r.json())) + self.expected_result_, r.json() + ), + ) def test_inference_client_generated_response(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -170,22 +192,20 @@ def test_inference_client_generated_response(self): inputs[0].set_data_from_numpy(input_data, binary_data=False) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) request_body, _ = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + inputs, outputs=outputs + ) - headers = {'Content-Type': 'application/json'} + headers = {"Content-Type": "application/json"} r = requests.post(self.url_, data=request_body, headers=headers) r.raise_for_status() - result = httpclient.InferenceServerClient.parse_response_body( - r._content) + result = httpclient.InferenceServerClient.parse_response_body(r._content) - output0_data = result.as_numpy('OUTPUT0') - output1_data = result.as_numpy('OUTPUT1') + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") for i in range(16): self.assertEqual(output0_data[0][i], self.expected_output0_data_[i]) self.assertEqual(output1_data[0][i], self.expected_output1_data_[i]) @@ -193,8 +213,8 @@ def test_inference_client_generated_response(self): def test_inference_client_generated_response_binary(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -202,25 +222,26 @@ def test_inference_client_generated_response_binary(self): inputs[0].set_data_from_numpy(input_data, binary_data=False) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', 
binary_data=True)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) request_body, _ = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + inputs, outputs=outputs + ) - headers = {'Content-Type': 'application/json'} + headers = {"Content-Type": "application/json"} r = requests.post(self.url_, data=request_body, headers=headers) r.raise_for_status() - header_length_prefix = "application/vnd.sagemaker-triton.binary+json;json-header-size=" - header_length_str = r.headers['Content-Type'][len(header_length_prefix - ):] + header_length_prefix = ( + "application/vnd.sagemaker-triton.binary+json;json-header-size=" + ) + header_length_str = r.headers["Content-Type"][len(header_length_prefix) :] result = httpclient.InferenceServerClient.parse_response_body( - r._content, header_length=int(header_length_str)) + r._content, header_length=int(header_length_str) + ) - output0_data = result.as_numpy('OUTPUT0') - output1_data = result.as_numpy('OUTPUT1') + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") for i in range(16): self.assertEqual(output0_data[0][i], self.expected_output0_data_[i]) self.assertEqual(output1_data[0][i], self.expected_output1_data_[i]) @@ -228,8 +249,8 @@ def test_inference_client_generated_response_binary(self): def test_malformed_binary_header(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -237,29 +258,34 @@ def test_malformed_binary_header(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'additional-string/application/vnd.sagemaker-triton.binary+json;json-header-size={}' - .format(header_length) + "Content-Type": "additional-string/application/vnd.sagemaker-triton.binary+json;json-header-size={}".format( + header_length + ) } r = requests.post(self.url_, data=request_body, headers=headers) self.assertEqual( - 400, r.status_code, + 400, + r.status_code, "Expected error code {} returned for the request; got: {}".format( - 400, r.status_code)) + 400, r.status_code + ), + ) def test_malformed_binary_header_not_number(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the 
data input_data = np.array(self.input_data_, dtype=np.int32) @@ -267,29 +293,34 @@ def test_malformed_binary_header_not_number(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/vnd.sagemaker-triton.binary+json;json-header-size=additional-string{}' - .format(header_length) + "Content-Type": "application/vnd.sagemaker-triton.binary+json;json-header-size=additional-string{}".format( + header_length + ) } r = requests.post(self.url_, data=request_body, headers=headers) self.assertEqual( - 400, r.status_code, + 400, + r.status_code, "Expected error code {} returned for the request; got: {}".format( - 400, r.status_code)) + 400, r.status_code + ), + ) def test_malformed_binary_header_negative_number(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -297,28 +328,32 @@ def test_malformed_binary_header_negative_number(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/vnd.sagemaker-triton.binary+json;json-header-size=-123' + "Content-Type": "application/vnd.sagemaker-triton.binary+json;json-header-size=-123" } r = requests.post(self.url_, data=request_body, headers=headers) self.assertEqual( - 400, r.status_code, + 400, + r.status_code, "Expected error code {} returned for the request; got: {}".format( - 400, r.status_code)) + 400, r.status_code + ), + ) def test_malformed_binary_header_large_number(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -326,23 +361,27 @@ def test_malformed_binary_header_large_number(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) 
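            # Sending INPUT0 as binary data forces the binary+json request format,
            # where the Content-Type is expected to advertise the JSON header length, e.g.
            #   application/vnd.sagemaker-triton.binary+json;json-header-size=<actual length>
            # This test deliberately reports a size (12345) larger than the real JSON
            # header, so the server should reject the request with HTTP 400.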
inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/vnd.sagemaker-triton.binary+json;json-header-size=12345' + "Content-Type": "application/vnd.sagemaker-triton.binary+json;json-header-size=12345" } r = requests.post(self.url_, data=request_body, headers=headers) self.assertEqual( - 400, r.status_code, + 400, + r.status_code, "Expected error code {} returned for the request; got: {}".format( - 400, r.status_code)) + 400, r.status_code + ), + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_savedmodel_shape/saved_model_shape_test.py b/qa/L0_savedmodel_shape/saved_model_shape_test.py old mode 100644 new mode 100755 index 5c754ad600..b5ae13a680 --- a/qa/L0_savedmodel_shape/saved_model_shape_test.py +++ b/qa/L0_savedmodel_shape/saved_model_shape_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,192 +31,198 @@ sys.path.append("../common") import unittest -import numpy as np + import infer_util as iu +import numpy as np import test_util as tu np_dtype_string = np.dtype(object) class SavedModelShapeTest(tu.TestResultCollector): - - def _full_exact(self, input_dtype, output0_dtype, output1_dtype, - output0_raw, output1_raw, swap): - - def _infer_exact_helper(tester, - pf, - tensor_shape, - batch_size, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=True, - output1_raw=True, - model_version=None, - swap=False, - outputs=("OUTPUT0", "OUTPUT1"), - use_http=True, - use_grpc=True, - skip_request_id_check=False, - use_streaming=True, - correlation_id=0): + def _full_exact( + self, input_dtype, output0_dtype, output1_dtype, output0_raw, output1_raw, swap + ): + def _infer_exact_helper( + tester, + pf, + tensor_shape, + batch_size, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=True, + output1_raw=True, + model_version=None, + swap=False, + outputs=("OUTPUT0", "OUTPUT1"), + use_http=True, + use_grpc=True, + skip_request_id_check=False, + use_streaming=True, + correlation_id=0, + ): for bs in (1, batch_size): # model that does not support batching if bs == 1: - iu.infer_exact(tester, - "savedmodel_nobatch", - tensor_shape, - bs, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - model_version=model_version, - swap=swap, - outputs=outputs, - use_http=use_http, - use_grpc=use_grpc, - skip_request_id_check=skip_request_id_check, - use_streaming=use_streaming, - correlation_id=correlation_id) + iu.infer_exact( + tester, + "savedmodel_nobatch", + tensor_shape, + bs, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + 
model_version=model_version, + swap=swap, + outputs=outputs, + use_http=use_http, + use_grpc=use_grpc, + skip_request_id_check=skip_request_id_check, + use_streaming=use_streaming, + correlation_id=correlation_id, + ) # model that supports batching - iu.infer_exact(tester, - "savedmodel", (bs,) + tensor_shape, - bs, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - model_version=model_version, - swap=swap, - outputs=outputs, - use_http=use_http, - use_grpc=use_grpc, - skip_request_id_check=skip_request_id_check, - use_streaming=use_streaming, - correlation_id=correlation_id) + iu.infer_exact( + tester, + "savedmodel", + (bs,) + tensor_shape, + bs, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + model_version=model_version, + swap=swap, + outputs=outputs, + use_http=use_http, + use_grpc=use_grpc, + skip_request_id_check=skip_request_id_check, + use_streaming=use_streaming, + correlation_id=correlation_id, + ) input_size = 16 - if tu.validate_for_tf_model(input_dtype, output0_dtype, output1_dtype, - (input_size,), (input_size,), - (input_size,)): - _infer_exact_helper(self, - "savedmodel", (input_size,), - 8, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - swap=swap) + if tu.validate_for_tf_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size,), + (input_size,), + (input_size,), + ): + _infer_exact_helper( + self, + "savedmodel", + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) def test_raw_bbb(self): - self._full_exact(np.int8, - np.int8, - np.int8, - output0_raw=True, - output1_raw=True, - swap=True) + self._full_exact( + np.int8, np.int8, np.int8, output0_raw=True, output1_raw=True, swap=True + ) def test_raw_sss(self): - self._full_exact(np.int16, - np.int16, - np.int16, - output0_raw=True, - output1_raw=True, - swap=True) + self._full_exact( + np.int16, np.int16, np.int16, output0_raw=True, output1_raw=True, swap=True + ) def test_raw_iii(self): - self._full_exact(np.int32, - np.int32, - np.int32, - output0_raw=True, - output1_raw=True, - swap=True) + self._full_exact( + np.int32, np.int32, np.int32, output0_raw=True, output1_raw=True, swap=True + ) def test_raw_lll(self): - self._full_exact(np.int64, - np.int64, - np.int64, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.int64, np.int64, np.int64, output0_raw=True, output1_raw=True, swap=False + ) def test_raw_hhh(self): - self._full_exact(np.float16, - np.float16, - np.float16, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.float16, + np.float16, + np.float16, + output0_raw=True, + output1_raw=True, + swap=False, + ) def test_raw_fff(self): - self._full_exact(np.float32, - np.float32, - np.float32, - output0_raw=True, - output1_raw=True, - swap=True) + self._full_exact( + np.float32, + np.float32, + np.float32, + output0_raw=True, + output1_raw=True, + swap=True, + ) def test_raw_hff(self): - self._full_exact(np.float16, - np.float32, - np.float32, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.float16, + np.float32, + np.float32, + output0_raw=True, + output1_raw=True, + swap=False, + ) def test_raw_bii(self): - self._full_exact(np.int8, - np.int32, - np.int32, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.int8, np.int32, np.int32, 
output0_raw=True, output1_raw=True, swap=False + ) def test_raw_ibb(self): - self._full_exact(np.int32, - np.int8, - np.int8, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.int32, np.int8, np.int8, output0_raw=True, output1_raw=True, swap=False + ) def test_raw_ibs(self): - self._full_exact(np.int32, - np.int8, - np.int16, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.int32, np.int8, np.int16, output0_raw=True, output1_raw=True, swap=False + ) def test_raw_iff(self): - self._full_exact(np.int32, - np.float32, - np.float32, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.int32, + np.float32, + np.float32, + output0_raw=True, + output1_raw=True, + swap=False, + ) def test_raw_fii(self): - self._full_exact(np.float32, - np.int32, - np.int32, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.float32, + np.int32, + np.int32, + output0_raw=True, + output1_raw=True, + swap=False, + ) def test_raw_ihs(self): - self._full_exact(np.int32, - np.float16, - np.int16, - output0_raw=True, - output1_raw=True, - swap=False) + self._full_exact( + np.int32, + np.float16, + np.int16, + output0_raw=True, + output1_raw=True, + swap=False, + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_savedmodel_shape/test.sh b/qa/L0_savedmodel_shape/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_secure_grpc/test.sh b/qa/L0_secure_grpc/test.sh old mode 100644 new mode 100755 index e1f9c8dd0b..63c9b104a6 --- a/qa/L0_secure_grpc/test.sh +++ b/qa/L0_secure_grpc/test.sh @@ -56,23 +56,23 @@ rm -fr *.log *.log.* # Generate valid CA openssl genrsa -passout pass:1234 -des3 -out ca.key 4096 -openssl req -passin pass:1234 -new -x509 -days 365 -key ca.key -out ca.crt -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Test/CN=Root CA" +openssl req -passin pass:1234 -new -x509 -days 365 -key ca.key -out ca.crt -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Test/CN=Root CA" # Generate valid Server Key/Cert openssl genrsa -passout pass:1234 -des3 -out server.key 4096 -openssl req -passin pass:1234 -new -key server.key -out server.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Server/CN=localhost" -openssl x509 -req -passin pass:1234 -days 365 -in server.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out server.crt +openssl req -passin pass:1234 -new -key server.key -out server.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Server/CN=localhost" +openssl x509 -req -passin pass:1234 -days 365 -in server.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out server.crt # Remove passphrase from the Server Key -openssl rsa -passin pass:1234 -in server.key -out server.key +openssl rsa -passin pass:1234 -in server.key -out server.key # Generate valid Client Key/Cert openssl genrsa -passout pass:1234 -des3 -out client.key 4096 -openssl req -passin pass:1234 -new -key client.key -out client.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Client/CN=localhost" -openssl x509 -passin pass:1234 -req -days 365 -in client.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out client.crt +openssl req -passin pass:1234 -new -key client.key -out client.csr -subj "/C=SP/ST=Spain/L=Valdepenias/O=Test/OU=Client/CN=localhost" +openssl x509 -passin pass:1234 -req -days 365 -in client.csr -CA ca.crt -CAkey ca.key -set_serial 01 -out client.crt # Remove passphrase from Client Key -openssl rsa -passin pass:1234 -in client.key -out client.key +openssl rsa -passin pass:1234 -in client.key -out 

client.key # Create mutated client key (Make first char of each like capital) cp client.key client2.key && sed -i "s/\b\(.\)/\u\1/g" client2.key diff --git a/qa/L0_sequence_batcher/sequence_batcher_test.py b/qa/L0_sequence_batcher/sequence_batcher_test.py old mode 100644 new mode 100755 index c2ccd0111e..11b659b05a --- a/qa/L0_sequence_batcher/sequence_batcher_test.py +++ b/qa/L0_sequence_batcher/sequence_batcher_test.py @@ -2997,4 +2997,4 @@ def test_send_request_after_timeout(self): if __name__ == '__main__': - unittest.main() + unittest.main() \ No newline at end of file diff --git a/qa/L0_sequence_batcher/test.sh b/qa/L0_sequence_batcher/test.sh index 208221f8ef..3dabfaba7a 100755 --- a/qa/L0_sequence_batcher/test.sh +++ b/qa/L0_sequence_batcher/test.sh @@ -241,7 +241,7 @@ for BACKEND in $BACKENDS; do MODELS="$MODELS ../custom_models/custom_sequence_int32" else DTYPES=$(get_datatype $BACKEND) - + for DTYPE in $DTYPES; do MODELS="$MODELS $DATADIR/$FIXED_MODEL_REPOSITORY/${BACKEND}_sequence_${DTYPE}" done @@ -256,7 +256,7 @@ for BACKEND in $BACKENDS; do MODELS="$MODELS ${TMP//onnx/python}" else MODELS="$MODELS $DATADIR/qa_ensemble_model_repository/$FIXED_MODEL_REPOSITORY/*_${BACKEND}_sequence_${DTYPE}" - fi + fi fi done fi @@ -743,7 +743,7 @@ done # Test request timeout with sequence batcher # only run the test outside shared memory setting as -# shared memory feature is irrelevant +# shared memory feature is irrelevant if [ "$TEST_SYSTEM_SHARED_MEMORY" -ne 1 ] && [ "$TEST_CUDA_SHARED_MEMORY" -ne 1 ]; then export NO_BATCHING=0 export MODEL_INSTANCES=1 diff --git a/qa/L0_sequence_corrid_batcher/sequence_corrid_batcher_test.py b/qa/L0_sequence_corrid_batcher/sequence_corrid_batcher_test.py old mode 100644 new mode 100755 index dee5502c78..15f16da352 --- a/qa/L0_sequence_corrid_batcher/sequence_corrid_batcher_test.py +++ b/qa/L0_sequence_corrid_batcher/sequence_corrid_batcher_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,24 +31,22 @@ sys.path.append("../common") import os -import time import threading +import time import unittest + import numpy as np -import test_util as tu import sequence_util as su +import test_util as tu -_test_system_shared_memory = bool( - int(os.environ.get('TEST_SYSTEM_SHARED_MEMORY', 0))) -_test_cuda_shared_memory = bool( - int(os.environ.get('TEST_CUDA_SHARED_MEMORY', 0))) +_test_system_shared_memory = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0))) +_test_cuda_shared_memory = bool(int(os.environ.get("TEST_CUDA_SHARED_MEMORY", 0))) -_no_batching = (int(os.environ['NO_BATCHING']) == 1) -_model_instances = int(os.environ['MODEL_INSTANCES']) +_no_batching = int(os.environ["NO_BATCHING"]) == 1 +_model_instances = int(os.environ["MODEL_INSTANCES"]) if _no_batching: - _trials = ("savedmodel_nobatch", "graphdef_nobatch", "plan_nobatch", - "onnx_nobatch") + _trials = ("savedmodel_nobatch", "graphdef_nobatch", "plan_nobatch", "onnx_nobatch") else: _trials = ("savedmodel", "graphdef", "plan", "onnx") @@ -55,23 +55,20 @@ class SequenceCorrIDBatcherTest(su.SequenceBatcherTestUtil): - def get_datatype(self, trial): return np.int32 - def get_expected_result(self, - expected_result, - corrid, - value, - trial, - flag_str=None): + def get_expected_result(self, expected_result, corrid, value, trial, flag_str=None): # Adjust the expected_result for models that - # couldn't implement the full accumulator. See + # could not implement the full accumulator. See # qa/common/gen_qa_dyna_sequence_models.py for more # information. - if ((("nobatch" not in trial) and ("custom" not in trial)) or \ - ("graphdef" in trial) or ("plan" in trial) or \ - ("onnx" in trial)) or ("libtorch" in trial): + if ( + (("nobatch" not in trial) and ("custom" not in trial)) + or ("graphdef" in trial) + or ("plan" in trial) + or ("onnx" in trial) + ) or ("libtorch" in trial): expected_result = value if flag_str is not None: if "start" in flag_str: @@ -88,14 +85,16 @@ def test_skip_batch(self): for trial in _trials: self.clear_deferred_exceptions() dtype = self.get_datatype(trial) - precreated_shm0_handles = self.precreate_register_regions((1, 3), - dtype, 0) + precreated_shm0_handles = self.precreate_register_regions((1, 3), dtype, 0) precreated_shm1_handles = self.precreate_register_regions( - (11, 12, 13, 14), dtype, 1) + (11, 12, 13, 14), dtype, 1 + ) precreated_shm2_handles = self.precreate_register_regions( - (111, 113), dtype, 2) + (111, 113), dtype, 2 + ) precreated_shm3_handles = self.precreate_register_regions( - (1111, 1112, 1113, 1114), dtype, 3) + (1111, 1112, 1113, 1114), dtype, 3 + ) try: model_name = tu.get_dyna_sequence_model_name(trial, dtype) @@ -104,12 +103,11 @@ def test_skip_batch(self): # Need scheduler to wait for queue to contain all # inferences for both sequences. 
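                # The test harness is expected to start the server with
                # TRITONSERVER_DELAY_SCHEDULER=12 (the total number of requests
                # across the four sequences below) so the sequence batcher holds
                # scheduling until everything is queued, and with
                # TRITONSERVER_BACKLOG_DELAY_SCHEDULER=0 so no backlog delay is
                # applied; the assertions below verify that setup.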
self.assertIn("TRITONSERVER_DELAY_SCHEDULER", os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 12) + self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", os.environ) self.assertEqual( - int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 12) - self.assertIn("TRITONSERVER_BACKLOG_DELAY_SCHEDULER", - os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0) + int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0 + ) corrids = [1001, 1002, 1003, 1004] threads = [] @@ -124,12 +122,14 @@ def test_skip_batch(self): (None, None), # (flag_str, value, pre_delay_ms) (("start", 1, None), ("end", 3, None)), - self.get_expected_result(4 + corrids[0], corrids[0], - 3, trial, "end"), - precreated_shm0_handles), - kwargs={ - 'sequence_name': "{}".format(self._testMethodName) - })) + self.get_expected_result( + 4 + corrids[0], corrids[0], 3, trial, "end" + ), + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads.append( threading.Thread( target=self.check_sequence_async, @@ -140,15 +140,20 @@ def test_skip_batch(self): corrids[1], (None, None), # (flag_str, value, pre_delay_ms) - (("start", 11, None), (None, 12, None), - (None, 13, None), ("end", 14, None)), - self.get_expected_result(50 + corrids[1], - corrids[1], 14, trial, - "end"), - precreated_shm1_handles), - kwargs={ - 'sequence_name': "{}".format(self._testMethodName) - })) + ( + ("start", 11, None), + (None, 12, None), + (None, 13, None), + ("end", 14, None), + ), + self.get_expected_result( + 50 + corrids[1], corrids[1], 14, trial, "end" + ), + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads.append( threading.Thread( target=self.check_sequence_async, @@ -160,13 +165,14 @@ def test_skip_batch(self): (None, None), # (flag_str, value, pre_delay_ms) (("start", 111, None), ("end", 113, None)), - self.get_expected_result(224 + corrids[2], - corrids[2], 113, trial, - "end"), - precreated_shm2_handles), - kwargs={ - 'sequence_name': "{}".format(self._testMethodName) - })) + self.get_expected_result( + 224 + corrids[2], corrids[2], 113, trial, "end" + ), + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads.append( threading.Thread( target=self.check_sequence_async, @@ -177,15 +183,20 @@ def test_skip_batch(self): corrids[3], (None, None), # (flag_str, value, pre_delay_ms) - (("start", 1111, None), (None, 1112, None), - (None, 1113, None), ("end", 1114, None)), - self.get_expected_result(4450 + corrids[3], - corrids[3], 1114, trial, - "end"), - precreated_shm3_handles), - kwargs={ - 'sequence_name': "{}".format(self._testMethodName) - })) + ( + ("start", 1111, None), + (None, 1112, None), + (None, 1113, None), + ("end", 1114, None), + ), + self.get_expected_result( + 4450 + corrids[3], corrids[3], 1114, trial, "end" + ), + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads[1].start() threads[3].start() @@ -211,5 +222,5 @@ def test_skip_batch(self): self.cleanup_shm_regions(precreated_shm3_handles) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_sequence_stress/sequence_stress.py b/qa/L0_sequence_stress/sequence_stress.py old mode 100644 new mode 100755 index 26d7f4bbfa..039cf793a2 --- a/qa/L0_sequence_stress/sequence_stress.py +++ b/qa/L0_sequence_stress/sequence_stress.py @@ -1,4 +1,6 @@ -# Copyright (c) 2019-2020, 
NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,14 +31,14 @@ sys.path.append("../common") import argparse -from builtins import range -from builtins import str -import time import threading +import time import traceback +from builtins import range, str +from functools import partial + import numpy as np import test_util as tu -from functools import partial import tritongrpcclient as grpcclient from tritonclientutils import np_to_triton_dtype @@ -56,7 +58,6 @@ class UserData: - def __init__(self): self._completed_requests = queue.Queue() @@ -71,21 +72,27 @@ class TimeoutException(Exception): pass -def check_sequence_async(client_metadata, - trial, - model_name, - input_dtype, - steps, - timeout_ms=DEFAULT_TIMEOUT_MS, - sequence_name=""): +def check_sequence_async( + client_metadata, + trial, + model_name, + input_dtype, + steps, + timeout_ms=DEFAULT_TIMEOUT_MS, + sequence_name="", +): """Perform sequence of inferences using async run. The 'steps' holds a list of tuples, one for each inference with format: (flag_str, value, expected_result, delay_ms) """ - if (("savedmodel" in trial) or ("graphdef" in trial) or - ("custom" in trial) or ("plan" in trial)): + if ( + ("savedmodel" in trial) + or ("graphdef" in trial) + or ("custom" in trial) + or ("plan" in trial) + ): tensor_shape = ( 1, 1, @@ -108,27 +115,29 @@ def check_sequence_async(client_metadata, seq_start = False seq_end = False if flag_str is not None: - seq_start = ("start" in flag_str) - seq_end = ("end" in flag_str) + seq_start = "start" in flag_str + seq_end = "end" in flag_str if input_dtype == np.object_: in0 = np.full(tensor_shape, value, dtype=np.int32) - in0n = np.array([str(x) for x in in0.reshape(in0.size)], - dtype=object) + in0n = np.array([str(x) for x in in0.reshape(in0.size)], dtype=object) in0 = in0n.reshape(tensor_shape) else: in0 = np.full(tensor_shape, value, dtype=input_dtype) inputs = [ - grpcclient.InferInput("INPUT", tensor_shape, - np_to_triton_dtype(input_dtype)), + grpcclient.InferInput( + "INPUT", tensor_shape, np_to_triton_dtype(input_dtype) + ), ] inputs[0].set_data_from_numpy(in0) - triton_client.async_stream_infer(model_name, - inputs, - sequence_id=sequence_id, - sequence_start=seq_start, - sequence_end=seq_end) + triton_client.async_stream_infer( + model_name, + inputs, + sequence_id=sequence_id, + sequence_start=seq_start, + sequence_end=seq_end, + ) sent_count += 1 if delay_ms is not None: @@ -147,23 +156,21 @@ def check_sequence_async(client_metadata, if timeout_ms != None: now_ms = int(round(time.time() * 1000)) if (now_ms - seq_start_ms) > timeout_ms: - raise TimeoutException( - "Timeout expired for {}".format(sequence_name)) + raise TimeoutException("Timeout expired for {}".format(sequence_name)) result = results.as_numpy("OUTPUT")[0][0] if FLAGS.verbose: - print("{} {}: + {} = {}".format(sequence_name, sequence_id, value, - result)) + print("{} {}: + {} = {}".format(sequence_name, sequence_id, value, result)) if expected is not None: if input_dtype == np.object_: - assert int( - result - ) == expected, "{}: expected result {}, got {}".format( - sequence_name, expected, int(result)) + assert int(result) == expected, "{}: expected result {}, got {}".format( + sequence_name, expected, int(result) + ) else: assert result == expected, "{}: expected result {}, 
got {}".format( - sequence_name, expected, result) + sequence_name, expected, result + ) triton_client.stop_stream() @@ -176,12 +183,12 @@ def get_datatype(trial): return np.int32 -def sequence_valid(client_metadata, rng, trial, model_name, dtype, len_mean, - len_stddev, sequence_name): +def sequence_valid( + client_metadata, rng, trial, model_name, dtype, len_mean, len_stddev, sequence_name +): # Create a variable length sequence with "start" and "end" flags. seqlen = max(1, int(rng.normal(len_mean, len_stddev))) - print("{} {}: valid seqlen = {}".format(sequence_name, client_metadata[1], - seqlen)) + print("{} {}: valid seqlen = {}".format(sequence_name, client_metadata[1], seqlen)) values = rng.randint(0, 1024 * 1024, size=seqlen, dtype=dtype) @@ -200,31 +207,34 @@ def sequence_valid(client_metadata, rng, trial, model_name, dtype, len_mean, expected_result += val # (flag_str, value, expected_result, delay_ms) - steps.append((flags, val, expected_result, delay_ms),) + steps.append( + (flags, val, expected_result, delay_ms), + ) - check_sequence_async(client_metadata, - trial, - model_name, - dtype, - steps, - sequence_name=sequence_name) + check_sequence_async( + client_metadata, trial, model_name, dtype, steps, sequence_name=sequence_name + ) -def sequence_valid_valid(client_metadata, rng, trial, model_name, dtype, - len_mean, len_stddev, sequence_name): +def sequence_valid_valid( + client_metadata, rng, trial, model_name, dtype, len_mean, len_stddev, sequence_name +): # Create two variable length sequences with "start" and "end" # flags, where both sequences use the same correlation ID and are # sent back-to-back. seqlen = [ max(1, int(rng.normal(len_mean, len_stddev))), - max(1, int(rng.normal(len_mean, len_stddev))) + max(1, int(rng.normal(len_mean, len_stddev))), ] - print("{} {}: valid-valid seqlen[0] = {}, seqlen[1] = {}".format( - sequence_name, client_metadata[1], seqlen[0], seqlen[1])) + print( + "{} {}: valid-valid seqlen[0] = {}, seqlen[1] = {}".format( + sequence_name, client_metadata[1], seqlen[0], seqlen[1] + ) + ) values = [ rng.randint(0, 1024 * 1024, size=seqlen[0], dtype=dtype), - rng.randint(0, 1024 * 1024, size=seqlen[1], dtype=dtype) + rng.randint(0, 1024 * 1024, size=seqlen[1], dtype=dtype), ] for p in [0, 1]: @@ -243,31 +253,34 @@ def sequence_valid_valid(client_metadata, rng, trial, model_name, dtype, expected_result += val # (flag_str, value, expected_result, delay_ms) - steps.append((flags, val, expected_result, delay_ms),) + steps.append( + (flags, val, expected_result, delay_ms), + ) - check_sequence_async(client_metadata, - trial, - model_name, - dtype, - steps, - sequence_name=sequence_name) + check_sequence_async( + client_metadata, trial, model_name, dtype, steps, sequence_name=sequence_name + ) -def sequence_valid_no_end(client_metadata, rng, trial, model_name, dtype, - len_mean, len_stddev, sequence_name): +def sequence_valid_no_end( + client_metadata, rng, trial, model_name, dtype, len_mean, len_stddev, sequence_name +): # Create two variable length sequences, the first with "start" and # "end" flags and the second with no "end" flag, where both # sequences use the same correlation ID and are sent back-to-back. 
seqlen = [ max(1, int(rng.normal(len_mean, len_stddev))), - max(1, int(rng.normal(len_mean, len_stddev))) + max(1, int(rng.normal(len_mean, len_stddev))), ] - print("{} {}: valid-no-end seqlen[0] = {}, seqlen[1] = {}".format( - sequence_name, client_metadata[1], seqlen[0], seqlen[1])) + print( + "{} {}: valid-no-end seqlen[0] = {}, seqlen[1] = {}".format( + sequence_name, client_metadata[1], seqlen[0], seqlen[1] + ) + ) values = [ rng.randint(0, 1024 * 1024, size=seqlen[0], dtype=dtype), - rng.randint(0, 1024 * 1024, size=seqlen[1], dtype=dtype) + rng.randint(0, 1024 * 1024, size=seqlen[1], dtype=dtype), ] for p in [0, 1]: @@ -286,23 +299,22 @@ def sequence_valid_no_end(client_metadata, rng, trial, model_name, dtype, expected_result += val # (flag_str, value, expected_result, delay_ms) - steps.append((flags, val, expected_result, delay_ms),) + steps.append( + (flags, val, expected_result, delay_ms), + ) - check_sequence_async(client_metadata, - trial, - model_name, - dtype, - steps, - sequence_name=sequence_name) + check_sequence_async( + client_metadata, trial, model_name, dtype, steps, sequence_name=sequence_name + ) -def sequence_no_start(client_metadata, rng, trial, model_name, dtype, - sequence_name): +def sequence_no_start(client_metadata, rng, trial, model_name, dtype, sequence_name): # Create a sequence without a "start" flag. Sequence should get an # error from the server. seqlen = 1 - print("{} {}: no-start seqlen = {}".format(sequence_name, - client_metadata[1], seqlen)) + print( + "{} {}: no-start seqlen = {}".format(sequence_name, client_metadata[1], seqlen) + ) values = rng.randint(0, 1024 * 1024, size=seqlen, dtype=dtype) @@ -314,29 +326,33 @@ def sequence_no_start(client_metadata, rng, trial, model_name, dtype, delay_ms = None # (flag_str, value, expected_result, delay_ms) - steps.append((flags, val, None, delay_ms),) + steps.append( + (flags, val, None, delay_ms), + ) try: - check_sequence_async(client_metadata, - trial, - model_name, - dtype, - steps, - sequence_name=sequence_name) + check_sequence_async( + client_metadata, + trial, + model_name, + dtype, + steps, + sequence_name=sequence_name, + ) assert False, "expected inference failure from missing START flag" except Exception as ex: if "must specify the START flag" not in ex.message(): raise -def sequence_no_end(client_metadata, rng, trial, model_name, dtype, len_mean, - len_stddev, sequence_name): +def sequence_no_end( + client_metadata, rng, trial, model_name, dtype, len_mean, len_stddev, sequence_name +): # Create a variable length sequence with "start" flag but that # never ends. The sequence should be aborted by the server and its # slot reused for another sequence. 
seqlen = max(1, int(rng.normal(len_mean, len_stddev))) - print("{} {}: no-end seqlen = {}".format(sequence_name, client_metadata[1], - seqlen)) + print("{} {}: no-end seqlen = {}".format(sequence_name, client_metadata[1], seqlen)) values = rng.randint(0, 1024 * 1024, size=seqlen, dtype=dtype) @@ -353,18 +369,16 @@ def sequence_no_end(client_metadata, rng, trial, model_name, dtype, len_mean, expected_result += val # (flag_str, value, expected_result, delay_ms) - steps.append((flags, val, expected_result, delay_ms),) + steps.append( + (flags, val, expected_result, delay_ms), + ) - check_sequence_async(client_metadata, - trial, - model_name, - dtype, - steps, - sequence_name=sequence_name) + check_sequence_async( + client_metadata, trial, model_name, dtype, steps, sequence_name=sequence_name + ) -def stress_thread(name, seed, pass_cnt, correlation_id_base, trial, model_name, - dtype): +def stress_thread(name, seed, pass_cnt, correlation_id_base, trial, model_name, dtype): # Thread responsible for generating sequences of inference # requests. global _thread_exceptions @@ -390,9 +404,13 @@ def stress_thread(name, seed, pass_cnt, correlation_id_base, trial, model_name, for c in range(common_cnt + rare_cnt): client_metadata_list.append( - (grpcclient.InferenceServerClient("localhost:8001", - verbose=FLAGS.verbose), - correlation_id_base + c)) + ( + grpcclient.InferenceServerClient( + "localhost:8001", verbose=FLAGS.verbose + ), + correlation_id_base + c, + ) + ) last_choices.append(None) rare_idx = 0 @@ -408,34 +426,40 @@ def stress_thread(name, seed, pass_cnt, correlation_id_base, trial, model_name, # exercise the idle sequence path of the sequence # scheduler if choice < 0.33: - sequence_no_end(client_metadata_list[client_idx], - rng, - trial, - model_name, - dtype, - SEQUENCE_LENGTH_MEAN, - SEQUENCE_LENGTH_STDEV, - sequence_name=name) + sequence_no_end( + client_metadata_list[client_idx], + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) last_choices[client_idx] = "no-end" elif choice < 0.66: - sequence_valid_no_end(client_metadata_list[client_idx], - rng, - trial, - model_name, - dtype, - SEQUENCE_LENGTH_MEAN, - SEQUENCE_LENGTH_STDEV, - sequence_name=name) + sequence_valid_no_end( + client_metadata_list[client_idx], + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) last_choices[client_idx] = "valid-no-end" else: - sequence_valid_valid(client_metadata_list[client_idx], - rng, - trial, - model_name, - dtype, - SEQUENCE_LENGTH_MEAN, - SEQUENCE_LENGTH_STDEV, - sequence_name=name) + sequence_valid_valid( + client_metadata_list[client_idx], + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) last_choices[client_idx] = "valid-valid" rare_idx = (rare_idx + 1) % rare_cnt @@ -451,54 +475,67 @@ def stress_thread(name, seed, pass_cnt, correlation_id_base, trial, model_name, # just assume that the no-start is a continuation of # the no-end sequence instead of being a sequence # missing start flag. 
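Each stress thread owns a disjoint block of correlation IDs so sequences launched from different threads can never collide on the same server-side slot. A minimal sketch of that partitioning, with a placeholder block size and client count standing in for CORRELATION_ID_BLOCK_SIZE and common_cnt + rare_cnt:

    CORRELATION_ID_BLOCK_SIZE = 1024 * 1024  # placeholder for the test's constant

    def correlation_ids_for_thread(thread_idx, clients_per_thread):
        # Mirrors correlation_id_base = 1 + (idx * CORRELATION_ID_BLOCK_SIZE)
        # followed by correlation_id_base + c for each client the thread owns.
        base = 1 + thread_idx * CORRELATION_ID_BLOCK_SIZE
        return [base + c for c in range(clients_per_thread)]

    # thread 0 uses IDs starting at 1; thread 1 starts one full block later
    print(correlation_ids_for_thread(1, 4))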
- if ((last_choice != "no-end") and - (last_choice != "valid-no-end") and (choice < 0.01)): - sequence_no_start(client_metadata, - rng, - trial, - model_name, - dtype, - sequence_name=name) + if ( + (last_choice != "no-end") + and (last_choice != "valid-no-end") + and (choice < 0.01) + ): + sequence_no_start( + client_metadata, + rng, + trial, + model_name, + dtype, + sequence_name=name, + ) last_choices[client_idx] = "no-start" elif choice < 0.05: - sequence_no_end(client_metadata, - rng, - trial, - model_name, - dtype, - SEQUENCE_LENGTH_MEAN, - SEQUENCE_LENGTH_STDEV, - sequence_name=name) + sequence_no_end( + client_metadata, + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) last_choices[client_idx] = "no-end" elif choice < 0.10: - sequence_valid_no_end(client_metadata, - rng, - trial, - model_name, - dtype, - SEQUENCE_LENGTH_MEAN, - SEQUENCE_LENGTH_STDEV, - sequence_name=name) + sequence_valid_no_end( + client_metadata, + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) last_choices[client_idx] = "valid-no-end" elif choice < 0.15: - sequence_valid_valid(client_metadata, - rng, - trial, - model_name, - dtype, - SEQUENCE_LENGTH_MEAN, - SEQUENCE_LENGTH_STDEV, - sequence_name=name) + sequence_valid_valid( + client_metadata, + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) last_choices[client_idx] = "valid-valid" else: - sequence_valid(client_metadata, - rng, - trial, - model_name, - dtype, - SEQUENCE_LENGTH_MEAN, - SEQUENCE_LENGTH_STDEV, - sequence_name=name) + sequence_valid( + client_metadata, + rng, + trial, + model_name, + dtype, + SEQUENCE_LENGTH_MEAN, + SEQUENCE_LENGTH_STDEV, + sequence_name=name, + ) last_choices[client_idx] = "valid" except Exception as ex: @@ -519,38 +556,40 @@ def stress_thread(name, seed, pass_cnt, correlation_id_base, trial, model_name, def check_status(model_name): - client = grpcclient.InferenceServerClient("localhost:8001", - verbose=FLAGS.verbose) + client = grpcclient.InferenceServerClient("localhost:8001", verbose=FLAGS.verbose) stats = client.get_inference_statistics(model_name) print(stats) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('-v', - '--verbose', - action="store_true", - required=False, - default=False, - help='Enable verbose output') - parser.add_argument('-r', - '--random-seed', - type=int, - required=False, - help='Random seed.') - parser.add_argument('-t', - '--concurrency', - type=int, - required=False, - default=8, - help='Request concurrency. Default is 8.') parser.add_argument( - '-i', - '--iterations', + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-r", "--random-seed", type=int, required=False, help="Random seed." + ) + parser.add_argument( + "-t", + "--concurrency", + type=int, + required=False, + default=8, + help="Request concurrency. Default is 8.", + ) + parser.add_argument( + "-i", + "--iterations", type=int, required=False, default=200, - help='Number of iterations of stress test to run. Default is 200.') + help="Number of iterations of stress test to run. Default is 200.", + ) FLAGS = parser.parse_args() # Initialize the random seed. 
For reproducibility each thread @@ -584,10 +623,19 @@ def check_status(model_name): correlation_id_base = 1 + (idx * CORRELATION_ID_BLOCK_SIZE) threads.append( - threading.Thread(target=stress_thread, - args=(thread_name, seed, FLAGS.iterations, - correlation_id_base, trial, model_name, - dtype))) + threading.Thread( + target=stress_thread, + args=( + thread_name, + seed, + FLAGS.iterations, + correlation_id_base, + trial, + model_name, + dtype, + ), + ) + ) for t in threads: t.start() diff --git a/qa/L0_server_status/server_status_test.py b/qa/L0_server_status/server_status_test.py old mode 100644 new mode 100755 index 93c94588df..7ab04708f0 --- a/qa/L0_server_status/server_status_test.py +++ b/qa/L0_server_status/server_status_test.py @@ -1,4 +1,6 @@ -# Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,10 +30,11 @@ sys.path.append("../common") -import numpy as np import os import unittest + import infer_util as iu +import numpy as np import test_util as tu import tritongrpcclient as grpcclient import tritonhttpclient as httpclient @@ -39,24 +42,29 @@ class ServerMetadataTest(tu.TestResultCollector): - def test_basic(self): try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: model_name = "graphdef_int32_int8_int8" extensions = [ - 'classification', 'sequence', 'model_repository', - 'schedule_policy', 'model_configuration', - 'system_shared_memory', 'cuda_shared_memory', - 'binary_tensor_data', 'statistics' + "classification", + "sequence", + "model_repository", + "schedule_policy", + "model_configuration", + "system_shared_memory", + "cuda_shared_memory", + "binary_tensor_data", + "statistics", ] if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) @@ -64,16 +72,18 @@ def test_basic(self): model_metadata = triton_client.get_model_metadata(model_name) if pair[1] == "http": - self.assertEqual(os.environ["TRITON_SERVER_VERSION"], - server_metadata['version']) - self.assertEqual("triton", server_metadata['name']) + self.assertEqual( + os.environ["TRITON_SERVER_VERSION"], server_metadata["version"] + ) + self.assertEqual("triton", server_metadata["name"]) for ext in extensions: - self.assertIn(ext, server_metadata['extensions']) + self.assertIn(ext, server_metadata["extensions"]) - self.assertEqual(model_name, model_metadata['name']) + self.assertEqual(model_name, model_metadata["name"]) else: - self.assertEqual(os.environ["TRITON_SERVER_VERSION"], - server_metadata.version) + self.assertEqual( + os.environ["TRITON_SERVER_VERSION"], server_metadata.version + ) self.assertEqual("triton", server_metadata.name) for ext in extensions: self.assertIn(ext, server_metadata.extensions) @@ -84,91 +94,96 @@ def test_basic(self): def test_unknown_model(self): try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: model_name = "foo" if pair[1] == "http": 
triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) server_metadata = triton_client.get_server_metadata() if pair[1] == "http": - self.assertEqual(os.environ["TRITON_SERVER_VERSION"], - server_metadata['version']) - self.assertEqual("triton", server_metadata['name']) + self.assertEqual( + os.environ["TRITON_SERVER_VERSION"], server_metadata["version"] + ) + self.assertEqual("triton", server_metadata["name"]) else: - self.assertEqual(os.environ["TRITON_SERVER_VERSION"], - server_metadata.version) + self.assertEqual( + os.environ["TRITON_SERVER_VERSION"], server_metadata.version + ) self.assertEqual("triton", server_metadata.name) model_metadata = triton_client.get_model_metadata(model_name) self.assertTrue(False, "expected unknown model failure") except InferenceServerException as ex: - self.assertTrue(ex.message().startswith( - "Request for unknown model: 'foo' is not found")) + self.assertTrue( + ex.message().startswith("Request for unknown model: 'foo' is not found") + ) def test_unknown_model_version(self): try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: model_name = "graphdef_int32_int8_int8" if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) model_metadata = triton_client.get_model_metadata( - model_name, model_version="99") + model_name, model_version="99" + ) self.assertTrue(False, "expected unknown model version failure") except InferenceServerException as ex: - self.assertTrue(ex.message().startswith( - "Request for unknown model: 'graphdef_int32_int8_int8' version 99 is not found" - )) + self.assertTrue( + ex.message().startswith( + "Request for unknown model: 'graphdef_int32_int8_int8' version 99 is not found" + ) + ) def test_model_latest_infer(self): input_size = 16 tensor_shape = (1, input_size) - platform_name = { - 'graphdef': 'tensorflow_graphdef', - 'onnx': 'onnxruntime_onnx' - } + platform_name = {"graphdef": "tensorflow_graphdef", "onnx": "onnxruntime_onnx"} # There are 3 versions of *_int32_int32_int32 and all # should be available. - for platform in ('graphdef', 'onnx'): + for platform in ("graphdef", "onnx"): model_name = platform + "_int32_int32_int32" # Initially there should be no version stats.. 
try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) - model_metadata = triton_client.get_model_metadata( - model_name) + model_metadata = triton_client.get_model_metadata(model_name) # verify all versions are reported when no model version is specified if pair[1] == "http": - self.assertEqual(model_name, model_metadata['name']) - self.assertEqual(len(model_metadata['versions']), 3) + self.assertEqual(model_name, model_metadata["name"]) + self.assertEqual(len(model_metadata["versions"]), 3) for v in (1, 2, 3): - self.assertIn(str(v), model_metadata['versions']) + self.assertIn(str(v), model_metadata["versions"]) else: self.assertEqual(model_name, model_metadata.name) self.assertEqual(len(model_metadata.versions), 3) @@ -177,9 +192,9 @@ def test_model_latest_infer(self): # verify contents of model metadata if pair[1] == "http": - model_platform = model_metadata['platform'] - model_inputs = model_metadata['inputs'] - model_outputs = model_metadata['outputs'] + model_platform = model_metadata["platform"] + model_inputs = model_metadata["inputs"] + model_outputs = model_metadata["outputs"] else: model_platform = model_metadata.platform model_inputs = model_metadata.inputs @@ -191,9 +206,9 @@ def test_model_latest_infer(self): for model_input in model_inputs: if pair[1] == "http": - input_dtype = model_input['datatype'] - input_shape = model_input['shape'] - input_name = model_input['name'] + input_dtype = model_input["datatype"] + input_shape = model_input["shape"] + input_name = model_input["name"] else: input_dtype = model_input.datatype input_shape = model_input.shape @@ -204,9 +219,9 @@ def test_model_latest_infer(self): for model_output in model_outputs: if pair[1] == "http": - output_dtype = model_output['datatype'] - output_shape = model_output['shape'] - output_name = model_output['name'] + output_dtype = model_output["datatype"] + output_shape = model_output["shape"] + output_name = model_output["name"] else: output_dtype = model_output.datatype output_shape = model_output.shape @@ -219,67 +234,79 @@ def test_model_latest_infer(self): self.assertTrue(False, "unexpected error {}".format(ex)) # Infer using latest version (which is 3)... 
- iu.infer_exact(self, - platform, - tensor_shape, - 1, - np.int32, - np.int32, - np.int32, - model_version=None, - swap=True) + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.int32, + np.int32, + np.int32, + model_version=None, + swap=True, + ) try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) for v in (1, 2, 3): self.assertTrue( - triton_client.is_model_ready(model_name, - model_version=str(v))) + triton_client.is_model_ready( + model_name, model_version=str(v) + ) + ) # Only version 3 should have infer stats - infer_stats = triton_client.get_inference_statistics( - model_name) + infer_stats = triton_client.get_inference_statistics(model_name) if pair[1] == "http": - stats = infer_stats['model_stats'] + stats = infer_stats["model_stats"] else: stats = infer_stats.model_stats self.assertEqual( - len(stats), 3, - "expected 3 infer stats for model " + model_name) + len(stats), 3, "expected 3 infer stats for model " + model_name + ) for s in stats: if pair[1] == "http": - v = s['version'] - stat = s['inference_stats'] + v = s["version"] + stat = s["inference_stats"] else: v = s.version stat = s.inference_stats if v == "3": if pair[1] == "http": - self.assertTrue(stat['success']['count'], 3) + self.assertTrue(stat["success"]["count"], 3) else: self.assertTrue(stat.success.count, 3) else: if pair[1] == "http": self.assertEqual( - stat['success']['count'], 0, + stat["success"]["count"], + 0, "unexpected infer success counts for version " - + str(v) + " of model " + model_name) + + str(v) + + " of model " + + model_name, + ) else: self.assertEqual( - stat.success.count, 0, + stat.success.count, + 0, "unexpected infer success counts for version " - + str(v) + " of model " + model_name) + + str(v) + + " of model " + + model_name, + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) @@ -289,136 +316,150 @@ def test_model_specific_infer(self): # There are 3 versions of *_float32_float32_float32 but only # versions 1 and 3 should be available. - for platform in ('graphdef', 'onnx', 'plan'): + for platform in ("graphdef", "onnx", "plan"): tensor_shape = (1, input_size) model_name = platform + "_float32_float32_float32" # Initially there should be no version status... 
try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) self.assertTrue( - triton_client.is_model_ready(model_name, - model_version="1")) + triton_client.is_model_ready(model_name, model_version="1") + ) self.assertFalse( - triton_client.is_model_ready(model_name, - model_version="2")) + triton_client.is_model_ready(model_name, model_version="2") + ) self.assertTrue( - triton_client.is_model_ready(model_name, - model_version="3")) + triton_client.is_model_ready(model_name, model_version="3") + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) # Infer using version 1... - iu.infer_exact(self, - platform, - tensor_shape, - 1, - np.float32, - np.float32, - np.float32, - model_version=1, - swap=False) + iu.infer_exact( + self, + platform, + tensor_shape, + 1, + np.float32, + np.float32, + np.float32, + model_version=1, + swap=False, + ) try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) self.assertTrue( - triton_client.is_model_ready(model_name, - model_version="1")) + triton_client.is_model_ready(model_name, model_version="1") + ) self.assertFalse( - triton_client.is_model_ready(model_name, - model_version="2")) + triton_client.is_model_ready(model_name, model_version="2") + ) self.assertTrue( - triton_client.is_model_ready(model_name, - model_version="3")) + triton_client.is_model_ready(model_name, model_version="3") + ) # Only version 1 should have infer stats infer_stats = triton_client.get_inference_statistics( - model_name, model_version='1') + model_name, model_version="1" + ) if pair[1] == "http": self.assertEqual( - len(infer_stats['model_stats']), 1, + len(infer_stats["model_stats"]), + 1, "expected 1 infer stats for version 1" - " of model " + model_name) - stats = infer_stats['model_stats'][0]['inference_stats'] - self.assertTrue(stats['success']['count'], 3) + " of model " + model_name, + ) + stats = infer_stats["model_stats"][0]["inference_stats"] + self.assertTrue(stats["success"]["count"], 3) else: self.assertEqual( - len(infer_stats.model_stats), 1, + len(infer_stats.model_stats), + 1, "expected 1 infer stats for version 1" - " of model " + model_name) + " of model " + model_name, + ) stats = infer_stats.model_stats[0].inference_stats self.assertTrue(stats.success.count, 3) infer_stats = triton_client.get_inference_statistics( - model_name, model_version='3') + model_name, model_version="3" + ) if pair[1] == "http": - stats = infer_stats['model_stats'][0]['inference_stats'] + stats = infer_stats["model_stats"][0]["inference_stats"] self.assertEqual( - stats['success']['count'], 0, + stats["success"]["count"], + 0, "unexpected infer stats for version 3" - " 
of model " + model_name) + " of model " + model_name, + ) else: stats = infer_stats.model_stats[0].inference_stats self.assertEqual( - stats.success.count, 0, + stats.success.count, + 0, "unexpected infer stats for version 3" - " of model " + model_name) + " of model " + model_name, + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) class ModelMetadataTest(tu.TestResultCollector): - ''' + """ These tests must be run after the ServerMetadataTest. See test.sh file for correct test running. - ''' + """ def test_model_versions_deleted(self): # Originally There were 3 versions of *_int32_int32_int32 and # version 3 was executed once. Version 2 and 3 models were # deleted from the model repository so now only expect version 1 to # be ready and show stats. - for platform in ('graphdef', 'onnx'): + for platform in ("graphdef", "onnx"): model_name = platform + "_int32_int32_int32" try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) - model_metadata = triton_client.get_model_metadata( - model_name) + model_metadata = triton_client.get_model_metadata(model_name) if pair[1] == "http": - self.assertEqual(model_name, model_metadata['name']) - self.assertEqual(len(model_metadata['versions']), 1) - self.assertEqual("1", model_metadata['versions'][0]) + self.assertEqual(model_name, model_metadata["name"]) + self.assertEqual(len(model_metadata["versions"]), 1) + self.assertEqual("1", model_metadata["versions"][0]) else: self.assertEqual(model_name, model_metadata.name) self.assertEqual(len(model_metadata.versions), 1) @@ -429,30 +470,41 @@ def test_model_versions_deleted(self): if v == 1: self.assertTrue( triton_client.is_model_ready( - model_name, model_version=str(v))) + model_name, model_version=str(v) + ) + ) infer_stats = triton_client.get_inference_statistics( - model_name, model_version=str(v)) + model_name, model_version=str(v) + ) if pair[1] == "http": self.assertEqual( - len(infer_stats['model_stats']), 1, - "expected 1 infer stats for version " + - str(v) + " of model " + model_name) - stats = infer_stats['model_stats'][0][ - 'inference_stats'] - self.assertEqual(stats['success']['count'], 0) + len(infer_stats["model_stats"]), + 1, + "expected 1 infer stats for version " + + str(v) + + " of model " + + model_name, + ) + stats = infer_stats["model_stats"][0]["inference_stats"] + self.assertEqual(stats["success"]["count"], 0) else: self.assertEqual( - len(infer_stats.model_stats), 1, - "expected 1 infer stats for version " + - str(v) + " of model " + model_name) - stats = infer_stats.model_stats[ - 0].inference_stats + len(infer_stats.model_stats), + 1, + "expected 1 infer stats for version " + + str(v) + + " of model " + + model_name, + ) + stats = infer_stats.model_stats[0].inference_stats self.assertEqual(stats.success.count, 0) else: self.assertFalse( triton_client.is_model_ready( - model_name, model_version=str(v))) + model_name, model_version=str(v) + ) + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) @@ -461,40 +513,46 @@ def 
test_model_versions_added(self): # Originally There was version 1 of *_float16_float32_float32. # Version 7 was added so now expect just version 7 to be ready # and provide infer stats. - for platform in ('graphdef',): + for platform in ("graphdef",): model_name = platform + "_float16_float32_float32" try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) - model_metadata = triton_client.get_model_metadata( - model_name) + model_metadata = triton_client.get_model_metadata(model_name) if pair[1] == "http": self.assertEqual( - model_name, model_metadata['name'], - "expected status for model " + model_name) + model_name, + model_metadata["name"], + "expected status for model " + model_name, + ) self.assertEqual( - len(model_metadata['versions']), 1, - "expected status for 1 versions for model " + - model_name) - self.assertEqual("7", model_metadata['versions'][0]) + len(model_metadata["versions"]), + 1, + "expected status for 1 versions for model " + model_name, + ) + self.assertEqual("7", model_metadata["versions"][0]) else: self.assertEqual( - model_name, model_metadata.name, - "expected status for model " + model_name) + model_name, + model_metadata.name, + "expected status for model " + model_name, + ) self.assertEqual( - len(model_metadata.versions), 1, - "expected status for 1 versions for model " + - model_name) + len(model_metadata.versions), + 1, + "expected status for 1 versions for model " + model_name, + ) self.assertEqual("7", model_metadata.versions[0]) # Only version 7 should be ready and show infer stat. 
@@ -502,39 +560,52 @@ def test_model_versions_added(self): if v == 7: self.assertTrue( triton_client.is_model_ready( - model_name, model_version=str(v))) + model_name, model_version=str(v) + ) + ) infer_stats = triton_client.get_inference_statistics( - model_name, model_version=str(v)) + model_name, model_version=str(v) + ) if pair[1] == "http": - stats = infer_stats['model_stats'][0][ - 'inference_stats'] + stats = infer_stats["model_stats"][0]["inference_stats"] self.assertEqual( - stats['success']['count'], 0, - "unexpected infer stats for version " + - str(v) + " of model " + model_name) + stats["success"]["count"], + 0, + "unexpected infer stats for version " + + str(v) + + " of model " + + model_name, + ) else: - stats = infer_stats.model_stats[ - 0].inference_stats + stats = infer_stats.model_stats[0].inference_stats self.assertEqual( - stats.success.count, 0, - "unexpected infer stats for version " + - str(v) + " of model " + model_name) + stats.success.count, + 0, + "unexpected infer stats for version " + + str(v) + + " of model " + + model_name, + ) else: self.assertFalse( triton_client.is_model_ready( - model_name, model_version=str(v))) + model_name, model_version=str(v) + ) + ) try: infer_stats = triton_client.get_inference_statistics( - model_name, model_version=str(v)) + model_name, model_version=str(v) + ) self.assertTrue( False, - "unexpected infer stats for the model that is not ready" + "unexpected infer stats for the model that is not ready", ) except InferenceServerException as ex: self.assertIn( "requested model version is not available for model", - str(ex)) + str(ex), + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) @@ -544,27 +615,27 @@ def test_infer_stats_no_model_version(self): # version 3 was executed once. Version 2 and 3 models were # deleted from the model repository so now only expect version 1 to # be ready and show infer stats. 
- for platform in ('graphdef', 'onnx'): + for platform in ("graphdef", "onnx"): model_name = platform + "_int32_int32_int32" try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) - model_metadata = triton_client.get_model_metadata( - model_name) + model_metadata = triton_client.get_model_metadata(model_name) if pair[1] == "http": - self.assertEqual(model_name, model_metadata['name']) - self.assertEqual(len(model_metadata['versions']), 1) - self.assertEqual("1", model_metadata['versions'][0]) + self.assertEqual(model_name, model_metadata["name"]) + self.assertEqual(len(model_metadata["versions"]), 1) + self.assertEqual("1", model_metadata["versions"][0]) else: self.assertEqual(model_name, model_metadata.name) self.assertEqual(len(model_metadata.versions), 1) @@ -575,44 +646,55 @@ def test_infer_stats_no_model_version(self): if v == 1: self.assertTrue( triton_client.is_model_ready( - model_name, model_version=str(v))) + model_name, model_version=str(v) + ) + ) else: self.assertFalse( triton_client.is_model_ready( - model_name, model_version=str(v))) + model_name, model_version=str(v) + ) + ) - infer_stats = triton_client.get_inference_statistics( - model_name) + infer_stats = triton_client.get_inference_statistics(model_name) if pair[1] == "http": - stats = infer_stats['model_stats'] + stats = infer_stats["model_stats"] else: stats = infer_stats.model_stats self.assertEqual( - len(stats), 1, - "expected 1 infer stats for model " + model_name) + len(stats), 1, "expected 1 infer stats for model " + model_name + ) if pair[1] == "http": - version = stats[0]['version'] - stat = stats[0]['inference_stats'] + version = stats[0]["version"] + stat = stats[0]["inference_stats"] else: version = stats[0].version stat = stats[0].inference_stats if version != "1": self.assertTrue( - False, - "expected version 1 for infer stat, got " + version) + False, "expected version 1 for infer stat, got " + version + ) else: if pair[1] == "http": self.assertEqual( - stat['success']['count'], 0, - "unexpected infer stats for version " + - str(version) + " of model " + model_name) + stat["success"]["count"], + 0, + "unexpected infer stats for version " + + str(version) + + " of model " + + model_name, + ) else: self.assertEqual( - stat.success.count, 0, - "unexpected infer stats for version " + - str(version) + " of model " + model_name) + stat.success.count, + 0, + "unexpected infer stats for version " + + str(version) + + " of model " + + model_name, + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) @@ -620,14 +702,15 @@ def test_infer_stats_no_model_version(self): def test_infer_stats_no_model(self): # Test get_inference_statistics when no model/model_version is passed. 
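Throughout these metadata and statistics tests the same value is read two ways: as a dict key when the client is HTTP (JSON responses) and as an attribute when it is gRPC (protobuf responses). A hedged helper, not part of the patch, that illustrates the duality the assertions keep branching on:

    def field(obj, name, protocol):
        # HTTP clients return JSON-decoded dicts; gRPC clients return protobuf
        # messages, so the same logical field is reached two different ways.
        return obj[name] if protocol == "http" else getattr(obj, name)

    # e.g. with stats = triton_client.get_inference_statistics(model_name):
    #   field(field(stats, "model_stats", protocol)[0], "version", protocol)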
try: - for pair in [("localhost:8000", "http"), - ("localhost:8001", "grpc")]: + for pair in [("localhost:8000", "http"), ("localhost:8001", "grpc")]: if pair[1] == "http": triton_client = httpclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) else: triton_client = grpcclient.InferenceServerClient( - url=pair[0], verbose=True) + url=pair[0], verbose=True + ) self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) @@ -635,17 +718,18 @@ def test_infer_stats_no_model(self): # Returns infer stats for ALL models + ready versions infer_stats = triton_client.get_inference_statistics() if pair[1] == "http": - stats = infer_stats['model_stats'] + stats = infer_stats["model_stats"] else: stats = infer_stats.model_stats self.assertEqual( - len(stats), 219, - "expected 219 infer stats for all ready versions of all model" + len(stats), + 219, + "expected 219 infer stats for all ready versions of all model", ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_shared_memory/shared_memory_test.py b/qa/L0_shared_memory/shared_memory_test.py old mode 100644 new mode 100755 index d4207064bf..6350dc2abe --- a/qa/L0_shared_memory/shared_memory_test.py +++ b/qa/L0_shared_memory/shared_memory_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,9 +30,10 @@ sys.path.append("../common") -import numpy as np -import unittest import os +import unittest + +import numpy as np import test_util as tu import tritonclient.grpc as grpcclient import tritonclient.http as httpclient @@ -39,12 +42,12 @@ class SharedMemoryTest(tu.TestResultCollector): - def test_invalid_create_shm(self): # Raises error since tried to create invalid system shared memory region try: shm_op0_handle = shm.create_shared_memory_region( - "dummy_data", "/dummy_data", -1) + "dummy_data", "/dummy_data", -1 + ) shm.destroy_shared_memory_region(shm_op0_handle) except Exception as ex: self.assertTrue(str(ex) == "unable to initialize the size") @@ -55,12 +58,11 @@ def test_valid_create_set_register(self): triton_client = httpclient.InferenceServerClient(_url, verbose=True) else: triton_client = grpcclient.InferenceServerClient(_url, verbose=True) - shm_op0_handle = shm.create_shared_memory_region( - "dummy_data", "/dummy_data", 8) - shm.set_shared_memory_region(shm_op0_handle, - [np.array([1, 2], dtype=np.float32)]) - triton_client.register_system_shared_memory("dummy_data", "/dummy_data", - 8) + shm_op0_handle = shm.create_shared_memory_region("dummy_data", "/dummy_data", 8) + shm.set_shared_memory_region( + shm_op0_handle, [np.array([1, 2], dtype=np.float32)] + ) + triton_client.register_system_shared_memory("dummy_data", "/dummy_data", 8) shm_status = triton_client.get_system_shared_memory_status() if _protocol == "http": self.assertTrue(len(shm_status) == 1) @@ -74,8 +76,7 @@ def test_unregister_before_register(self): triton_client = httpclient.InferenceServerClient(_url, verbose=True) else: triton_client = grpcclient.InferenceServerClient(_url, verbose=True) - shm_op0_handle = shm.create_shared_memory_region( - "dummy_data", "/dummy_data", 8) + 
shm_op0_handle = shm.create_shared_memory_region("dummy_data", "/dummy_data", 8) triton_client.unregister_system_shared_memory("dummy_data") shm_status = triton_client.get_system_shared_memory_status() if _protocol == "http": @@ -90,10 +91,8 @@ def test_unregister_after_register(self): triton_client = httpclient.InferenceServerClient(_url, verbose=True) else: triton_client = grpcclient.InferenceServerClient(_url, verbose=True) - shm_op0_handle = shm.create_shared_memory_region( - "dummy_data", "/dummy_data", 8) - triton_client.register_system_shared_memory("dummy_data", "/dummy_data", - 8) + shm_op0_handle = shm.create_shared_memory_region("dummy_data", "/dummy_data", 8) + triton_client.register_system_shared_memory("dummy_data", "/dummy_data", 8) triton_client.unregister_system_shared_memory("dummy_data") shm_status = triton_client.get_system_shared_memory_status() if _protocol == "http": @@ -108,17 +107,14 @@ def test_reregister_after_register(self): triton_client = httpclient.InferenceServerClient(_url, verbose=True) else: triton_client = grpcclient.InferenceServerClient(_url, verbose=True) - shm_op0_handle = shm.create_shared_memory_region( - "dummy_data", "/dummy_data", 8) - triton_client.register_system_shared_memory("dummy_data", "/dummy_data", - 8) + shm_op0_handle = shm.create_shared_memory_region("dummy_data", "/dummy_data", 8) + triton_client.register_system_shared_memory("dummy_data", "/dummy_data", 8) try: - triton_client.register_system_shared_memory("dummy_data", - "/dummy_data", 8) + triton_client.register_system_shared_memory("dummy_data", "/dummy_data", 8) except Exception as ex: self.assertTrue( - "shared memory region 'dummy_data' already in manager" in str( - ex)) + "shared memory region 'dummy_data' already in manager" in str(ex) + ) shm_status = triton_client.get_system_shared_memory_status() if _protocol == "http": self.assertTrue(len(shm_status) == 1) @@ -128,13 +124,17 @@ def test_reregister_after_register(self): def _configure_sever(self): shm_ip0_handle = shm.create_shared_memory_region( - "input0_data", "/input0_data", 64) + "input0_data", "/input0_data", 64 + ) shm_ip1_handle = shm.create_shared_memory_region( - "input1_data", "/input1_data", 64) + "input1_data", "/input1_data", 64 + ) shm_op0_handle = shm.create_shared_memory_region( - "output0_data", "/output0_data", 64) + "output0_data", "/output0_data", 64 + ) shm_op1_handle = shm.create_shared_memory_region( - "output1_data", "/output1_data", 64) + "output1_data", "/output1_data", 64 + ) input0_data = np.arange(start=0, stop=16, dtype=np.int32) input1_data = np.ones(shape=16, dtype=np.int32) shm.set_shared_memory_region(shm_ip0_handle, [input0_data]) @@ -143,28 +143,26 @@ def _configure_sever(self): triton_client = httpclient.InferenceServerClient(_url, verbose=True) else: triton_client = grpcclient.InferenceServerClient(_url, verbose=True) - triton_client.register_system_shared_memory("input0_data", - "/input0_data", 64) - triton_client.register_system_shared_memory("input1_data", - "/input1_data", 64) - triton_client.register_system_shared_memory("output0_data", - "/output0_data", 64) - triton_client.register_system_shared_memory("output1_data", - "/output1_data", 64) + triton_client.register_system_shared_memory("input0_data", "/input0_data", 64) + triton_client.register_system_shared_memory("input1_data", "/input1_data", 64) + triton_client.register_system_shared_memory("output0_data", "/output0_data", 64) + triton_client.register_system_shared_memory("output1_data", "/output1_data", 64) return 
[shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle] def _cleanup_server(self, shm_handles): for shm_handle in shm_handles: shm.destroy_shared_memory_region(shm_handle) - def _basic_inference(self, - shm_ip0_handle, - shm_ip1_handle, - shm_op0_handle, - shm_op1_handle, - error_msg, - big_shm_name="", - big_shm_size=64): + def _basic_inference( + self, + shm_ip0_handle, + shm_ip1_handle, + shm_op0_handle, + shm_op1_handle, + error_msg, + big_shm_name="", + big_shm_size=64, + ): input0_data = np.arange(start=0, stop=16, dtype=np.int32) input1_data = np.ones(shape=16, dtype=np.int32) inputs = [] @@ -173,16 +171,16 @@ def _basic_inference(self, triton_client = httpclient.InferenceServerClient(_url, verbose=True) inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=True)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) + httpclient.InferRequestedOutput("OUTPUT1", binary_data=False) + ) else: triton_client = grpcclient.InferenceServerClient(_url, verbose=True) inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32")) inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32")) - outputs.append(grpcclient.InferRequestedOutput('OUTPUT0')) - outputs.append(grpcclient.InferRequestedOutput('OUTPUT1')) + outputs.append(grpcclient.InferRequestedOutput("OUTPUT0")) + outputs.append(grpcclient.InferRequestedOutput("OUTPUT1")) inputs[0].set_shared_memory("input0_data", 64) @@ -197,23 +195,24 @@ def _basic_inference(self, outputs[1].set_shared_memory("output1_data", 64) try: - results = triton_client.infer("simple", - inputs, - model_version="", - outputs=outputs) - output = results.get_output('OUTPUT0') + results = triton_client.infer( + "simple", inputs, model_version="", outputs=outputs + ) + output = results.get_output("OUTPUT0") if _protocol == "http": - output_datatype = output['datatype'] - output_shape = output['shape'] + output_datatype = output["datatype"] + output_shape = output["shape"] else: output_datatype = output.datatype output_shape = output.shape output_dtype = utils.triton_to_np_dtype(output_datatype) - output_data = shm.get_contents_as_numpy(shm_op0_handle, - output_dtype, output_shape) + output_data = shm.get_contents_as_numpy( + shm_op0_handle, output_dtype, output_shape + ) self.assertTrue( (output_data[0] == (input0_data + input1_data)).all(), - "Model output does not match expected output") + "Model output does not match expected output", + ) except Exception as ex: error_msg.append(str(ex)) @@ -221,8 +220,9 @@ def test_unregister_after_inference(self): # Unregister after inference error_msg = [] shm_handles = self._configure_sever() - self._basic_inference(shm_handles[0], shm_handles[1], shm_handles[2], - shm_handles[3], error_msg) + self._basic_inference( + shm_handles[0], shm_handles[1], shm_handles[2], shm_handles[3], error_msg + ) if len(error_msg) > 0: raise Exception(str(error_msg)) if _protocol == "http": @@ -245,14 +245,15 @@ def test_register_after_inference(self): triton_client = httpclient.InferenceServerClient(_url, verbose=True) else: triton_client = grpcclient.InferenceServerClient(_url, verbose=True) - self._basic_inference(shm_handles[0], shm_handles[1], shm_handles[2], - shm_handles[3], error_msg) + self._basic_inference( + shm_handles[0], shm_handles[1], shm_handles[2], 
shm_handles[3], error_msg + ) if len(error_msg) > 0: raise Exception(str(error_msg)) shm_ip2_handle = shm.create_shared_memory_region( - "input2_data", "/input2_data", 64) - triton_client.register_system_shared_memory("input2_data", - "/input2_data", 64) + "input2_data", "/input2_data", 64 + ) + triton_client.register_system_shared_memory("input2_data", "/input2_data", 64) shm_status = triton_client.get_system_shared_memory_status() if _protocol == "http": self.assertTrue(len(shm_status) == 5) @@ -266,19 +267,27 @@ def test_too_big_shm(self): error_msg = [] shm_handles = self._configure_sever() shm_ip2_handle = shm.create_shared_memory_region( - "input2_data", "/input2_data", 128) + "input2_data", "/input2_data", 128 + ) if _protocol == "http": triton_client = httpclient.InferenceServerClient(_url, verbose=True) else: triton_client = grpcclient.InferenceServerClient(_url, verbose=True) - triton_client.register_system_shared_memory("input2_data", - "/input2_data", 128) - self._basic_inference(shm_handles[0], shm_ip2_handle, shm_handles[2], - shm_handles[3], error_msg, "input2_data", 128) + triton_client.register_system_shared_memory("input2_data", "/input2_data", 128) + self._basic_inference( + shm_handles[0], + shm_ip2_handle, + shm_handles[2], + shm_handles[3], + error_msg, + "input2_data", + 128, + ) if len(error_msg) > 0: self.assertTrue( "unexpected total byte size 128 for input 'INPUT1', expecting 64" - in error_msg[-1]) + in error_msg[-1] + ) shm_handles.append(shm_ip2_handle) self._cleanup_server(shm_handles) @@ -287,8 +296,9 @@ def test_mixed_raw_shm(self): error_msg = [] shm_handles = self._configure_sever() input1_data = np.ones(shape=16, dtype=np.int32) - self._basic_inference(shm_handles[0], [input1_data], shm_handles[2], - shm_handles[3], error_msg) + self._basic_inference( + shm_handles[0], [input1_data], shm_handles[2], shm_handles[3], error_msg + ) if len(error_msg) > 0: raise Exception(error_msg[-1]) self._cleanup_server(shm_handles) @@ -314,8 +324,8 @@ def test_unregisterall(self): self._cleanup_server(shm_handles) -if __name__ == '__main__': - _protocol = os.environ.get('CLIENT_TYPE', "http") +if __name__ == "__main__": + _protocol = os.environ.get("CLIENT_TYPE", "http") if _protocol == "http": _url = "localhost:8000" else: diff --git a/qa/L0_shared_memory/test.sh b/qa/L0_shared_memory/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_simple_ensemble/ensemble_test.py b/qa/L0_simple_ensemble/ensemble_test.py old mode 100644 new mode 100755 index 514cef59b6..0b064c13e8 --- a/qa/L0_simple_ensemble/ensemble_test.py +++ b/qa/L0_simple_ensemble/ensemble_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -30,72 +32,77 @@ sys.path.append("../clients") import logging - import unittest -import numpy as np + import infer_util as iu +import numpy as np import test_util as tu import tritonhttpclient class EnsembleTest(tu.TestResultCollector): - def _get_infer_count_per_version(self, model_name): - triton_client = tritonhttpclient.InferenceServerClient("localhost:8000", - verbose=True) + triton_client = tritonhttpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) stats = triton_client.get_inference_statistics(model_name) self.assertEqual(len(stats["model_stats"]), 2) infer_count = [0, 0] for model_stat in stats["model_stats"]: - self.assertEqual(model_stat["name"], model_name, - "expected stats for model " + model_name) - model_version = model_stat['version'] + self.assertEqual( + model_stat["name"], model_name, "expected stats for model " + model_name + ) + model_version = model_stat["version"] if model_version == "1": - infer_count[0] = model_stat["inference_stats"]["success"][ - "count"] + infer_count[0] = model_stat["inference_stats"]["success"]["count"] elif model_version == "2": - infer_count[1] = model_stat["inference_stats"]["success"][ - "count"] + infer_count[1] = model_stat["inference_stats"]["success"]["count"] else: self.assertTrue( - False, "unexpected version {} for model {}".format( - model_version, model_name)) + False, + "unexpected version {} for model {}".format( + model_version, model_name + ), + ) return infer_count def test_ensemble_add_sub(self): for bs in (1, 8): - iu.infer_exact(self, "ensemble_add_sub", (bs, 16), bs, np.int32, - np.int32, np.int32) + iu.infer_exact( + self, "ensemble_add_sub", (bs, 16), bs, np.int32, np.int32, np.int32 + ) infer_count = self._get_infer_count_per_version("simple") # The two 'simple' versions should have the same infer count - if (infer_count[0] != infer_count[1]): + if infer_count[0] != infer_count[1]: self.assertTrue( - False, - "unexpeced different infer count for different 'simple' versions" + False, "unexpeced different infer count for different 'simple' versions" ) def test_ensemble_add_sub_one_output(self): for bs in (1, 8): - iu.infer_exact(self, - "ensemble_add_sub", (bs, 16), - bs, - np.int32, - np.int32, - np.int32, - outputs=("OUTPUT0",)) + iu.infer_exact( + self, + "ensemble_add_sub", + (bs, 16), + bs, + np.int32, + np.int32, + np.int32, + outputs=("OUTPUT0",), + ) infer_count = self._get_infer_count_per_version("simple") # Only 'simple' version 2 should have non-zero infer count # as it is in charge of producing OUTPUT0 - if (infer_count[0] != 0): + if infer_count[0] != 0: self.assertTrue( - False, "unexpeced non-zero infer count for 'simple' version 1") - elif (infer_count[1] == 0): - self.assertTrue( - False, "unexpeced zero infer count for 'simple' version 2") + False, "unexpeced non-zero infer count for 'simple' version 1" + ) + elif infer_count[1] == 0: + self.assertTrue(False, "unexpeced zero infer count for 'simple' version 2") -if __name__ == '__main__': +if __name__ == "__main__": logging.basicConfig(stream=sys.stderr) unittest.main() diff --git a/qa/L0_simple_nodejs_client/test.sh b/qa/L0_simple_nodejs_client/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_socket/test.sh b/qa/L0_socket/test.sh old mode 100644 new mode 100755 index 08852624d7..228eec3f2a --- a/qa/L0_socket/test.sh +++ b/qa/L0_socket/test.sh @@ -138,7 +138,7 @@ for address in default 
explicit; do kill $SERVER_PID wait $SERVER_PID - # error if http/grpc port overlaps with grpc/http explicit port + # error if http/grpc port overlaps with grpc/http explicit port if [ "$p" == "http" ]; then SERVER_ARGS="--model-repository=$DATADIR $SAME_EXPLICIT_ADDRESS --http-port 8003 --grpc-port 8003" run_server_nowait diff --git a/qa/L0_storage_S3_local/mock_s3_service.py b/qa/L0_storage_S3_local/mock_s3_service.py old mode 100644 new mode 100755 index b146cd8f3f..956aac0e66 --- a/qa/L0_storage_S3_local/mock_s3_service.py +++ b/qa/L0_storage_S3_local/mock_s3_service.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -24,13 +26,12 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import time import threading -from http.server import HTTPServer, BaseHTTPRequestHandler - +import time +from http.server import BaseHTTPRequestHandler, HTTPServer -class MockS3Service(): +class MockS3Service: __address = "localhost" __port = 8080 @@ -49,8 +50,10 @@ def __CheckHttp2Ads(self): v = self.headers["connection"].lower() if "upgrade" in v or "http2" in v: test_results["http2_ads"] = True - if "upgrade" in self.headers and "h2c" in self.headers[ - "upgrade"].lower(): + if ( + "upgrade" in self.headers + and "h2c" in self.headers["upgrade"].lower() + ): test_results["http2_ads"] = True if "http2-settings" in self.headers: test_results["http2_ads"] = True @@ -64,14 +67,15 @@ def do_HEAD(self): def do_GET(self): self.__CheckHttp2Ads() test_results["get_count"] += 1 - self.send_error(404, "Thank you for using the mock s3 service!", - "Your bucket is not found here!") + self.send_error( + 404, + "Thank you for using the mock s3 service!", + "Your bucket is not found here!", + ) self.__test_results = test_results - self.__server = HTTPServer((self.__address, self.__port), - RequestValidator) - self.__service_thread = threading.Thread( - target=self.__server.serve_forever) + self.__server = HTTPServer((self.__address, self.__port), RequestValidator) + self.__service_thread = threading.Thread(target=self.__server.serve_forever) def __enter__(self): self.__service_thread.start() @@ -82,12 +86,14 @@ def __exit__(self, exc_type, exc_val, exc_tb): self.__service_thread.join() def TestPassed(self): - return self.__test_results["head_count"] > 0 and self.__test_results[ - "get_count"] > 0 and not self.__test_results["http2_ads"] + return ( + self.__test_results["head_count"] > 0 + and self.__test_results["get_count"] > 0 + and not self.__test_results["http2_ads"] + ) if __name__ == "__main__": - # Initialize mock service mock_s3_service = MockS3Service() diff --git a/qa/L0_storage_azure/test.sh b/qa/L0_storage_azure/test.sh index 9f67b1f272..9345671c84 100755 --- a/qa/L0_storage_azure/test.sh +++ b/qa/L0_storage_azure/test.sh @@ -218,7 +218,7 @@ for FW in ${AUTOCOMPLETE_BACKENDS}; do for model in ${FW}_float32_float32_float32 ${FW}_object_object_object; do cp -r /data/inferenceserver/${REPO_VERSION}/qa_model_repository/${model} models/ # Config files specify things expected by unit test like label_filename - # and max_batch_size for comparing results, so remove some key fields + # and max_batch_size for comparing results, so remove some key fields # for autocomplete to fill that won't break the unit test. 
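The MockS3Service above wires an HTTPServer into __enter__/__exit__ so a test can scope the fake endpoint (localhost:8080) to a with block and then check what traffic it saw. A hedged usage sketch; the import path and the elided body, where tritonserver would be pointed at the mock bucket, are assumptions:

    from mock_s3_service import MockS3Service  # assuming the module above is importable

    mock_s3 = MockS3Service()
    with mock_s3:
        # launch tritonserver here with an s3:// model repository resolving to
        # localhost:8080 and let it fail to find the (intentionally missing) bucket
        ...
    # both HEAD and GET must have been observed, with no HTTP/2 advertisement
    assert mock_s3.TestPassed()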
sed -i '/platform:/d' models/${model}/config.pbtxt sed -i '/data_type:/d' models/${model}/config.pbtxt diff --git a/qa/L0_storage_swiftstack/infer_test.py b/qa/L0_storage_swiftstack/infer_test.py old mode 100644 new mode 100755 index 5e1b3704ff..f8a65a01a4 --- a/qa/L0_storage_swiftstack/infer_test.py +++ b/qa/L0_storage_swiftstack/infer_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,133 +31,177 @@ sys.path.append("../common") import unittest -import numpy as np + import infer_util as iu +import numpy as np import test_util as tu class InferTest(tu.TestResultCollector): - - def _full_exact(self, input_dtype, output0_dtype, output1_dtype, - output0_raw, output1_raw, swap): - - def _infer_exact_helper(tester, - pf, - tensor_shape, - batch_size, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=True, - output1_raw=True, - model_version=None, - swap=False, - outputs=("OUTPUT0", "OUTPUT1"), - use_http=True, - use_grpc=True, - skip_request_id_check=False, - use_streaming=True, - correlation_id=0): + def _full_exact( + self, input_dtype, output0_dtype, output1_dtype, output0_raw, output1_raw, swap + ): + def _infer_exact_helper( + tester, + pf, + tensor_shape, + batch_size, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=True, + output1_raw=True, + model_version=None, + swap=False, + outputs=("OUTPUT0", "OUTPUT1"), + use_http=True, + use_grpc=True, + skip_request_id_check=False, + use_streaming=True, + correlation_id=0, + ): for bs in (1, batch_size): - iu.infer_exact(tester, - pf, (bs,) + tensor_shape, - bs, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - model_version=model_version, - swap=swap, - outputs=outputs, - use_http=use_http, - use_grpc=use_grpc, - skip_request_id_check=skip_request_id_check, - use_streaming=use_streaming, - correlation_id=correlation_id) + iu.infer_exact( + tester, + pf, + (bs,) + tensor_shape, + bs, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + model_version=model_version, + swap=swap, + outputs=outputs, + use_http=use_http, + use_grpc=use_grpc, + skip_request_id_check=skip_request_id_check, + use_streaming=use_streaming, + correlation_id=correlation_id, + ) input_size = 16 - if tu.validate_for_tf_model(input_dtype, output0_dtype, output1_dtype, - (input_size,), (input_size,), - (input_size,)): + if tu.validate_for_tf_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size,), + (input_size,), + (input_size,), + ): for pf in ["graphdef", "savedmodel"]: - _infer_exact_helper(self, - pf, (input_size,), - 8, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - swap=swap) - - if tu.validate_for_trt_model(input_dtype, output0_dtype, output1_dtype, - (input_size, 1, 1), (input_size, 1, 1), - (input_size, 1, 1)): + _infer_exact_helper( + self, + pf, + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + if tu.validate_for_trt_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size, 1, 1), + (input_size, 1, 1), + (input_size, 1, 1), + ): if input_dtype == np.int8: 
- _infer_exact_helper(self, - 'plan', (input_size, 1, 1), - 8, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - swap=swap) + _infer_exact_helper( + self, + "plan", + (input_size, 1, 1), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) else: - _infer_exact_helper(self, - 'plan', (input_size,), - 8, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - swap=swap) - - if tu.validate_for_onnx_model(input_dtype, output0_dtype, output1_dtype, - (input_size,), (input_size,), - (input_size,)): - _infer_exact_helper(self, - 'onnx', (input_size,), - 8, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - swap=swap) - - if tu.validate_for_libtorch_model(input_dtype, output0_dtype, - output1_dtype, (input_size,), - (input_size,), (input_size,)): - _infer_exact_helper(self, - 'libtorch', (input_size,), - 8, - input_dtype, - output0_dtype, - output1_dtype, - output0_raw=output0_raw, - output1_raw=output1_raw, - swap=swap) + _infer_exact_helper( + self, + "plan", + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + if tu.validate_for_onnx_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size,), + (input_size,), + (input_size,), + ): + _infer_exact_helper( + self, + "onnx", + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) + + if tu.validate_for_libtorch_model( + input_dtype, + output0_dtype, + output1_dtype, + (input_size,), + (input_size,), + (input_size,), + ): + _infer_exact_helper( + self, + "libtorch", + (input_size,), + 8, + input_dtype, + output0_dtype, + output1_dtype, + output0_raw=output0_raw, + output1_raw=output1_raw, + swap=swap, + ) def test_raw_fff(self): - self._full_exact(np.float32, - np.float32, - np.float32, - output0_raw=True, - output1_raw=True, - swap=True) + self._full_exact( + np.float32, + np.float32, + np.float32, + output0_raw=True, + output1_raw=True, + swap=True, + ) def test_class_fff(self): - self._full_exact(np.float32, - np.float32, - np.float32, - output0_raw=False, - output1_raw=False, - swap=True) + self._full_exact( + np.float32, + np.float32, + np.float32, + output0_raw=False, + output1_raw=False, + swap=True, + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_string_io/string_client_test.py b/qa/L0_string_io/string_client_test.py old mode 100644 new mode 100755 index aabcd7f111..16112ac70c --- a/qa/L0_string_io/string_client_test.py +++ b/qa/L0_string_io/string_client_test.py @@ -27,25 +27,25 @@ import sys -sys.path.append('../common') +sys.path.append("../common") -import numpy as np +import unittest from builtins import range -import tritonclient.http as tritonhttpclient + +import numpy as np +import test_util as tu import tritonclient.grpc as tritongrpcclient +import tritonclient.http as tritonhttpclient import tritonclient.utils as tritonutils -import unittest -import test_util as tu class ClientStringTest(tu.TestResultCollector): - def _test_infer_unicode(self, model_name, client, input_): # Send inference request to the inference server. Get results for # both output tensors. 
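The string I/O tests below exercise BYTES tensors through both client libraries. As a hedged, HTTP-only sketch of how such an input is built and sent (the model name "simple_string" is a placeholder; the tensor names INPUT0/OUTPUT0 follow the test):

    import numpy as np
    import tritonclient.http as tritonhttpclient

    client = tritonhttpclient.InferenceServerClient("localhost:8000")
    # BYTES tensors travel as numpy object arrays whose elements are bytes/str
    in0 = np.array([[b"hello"], [b"world"]], dtype=np.object_)
    inp = tritonhttpclient.InferInput("INPUT0", list(in0.shape), "BYTES")
    inp.set_data_from_numpy(in0, binary_data=True)
    out = tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True)
    result = client.infer(model_name="simple_string", inputs=[inp], outputs=[out])
    print(result.as_numpy("OUTPUT0"))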
inputs = [] outputs = [] - inputs.append(client[1].InferInput('INPUT0', input_.shape, "BYTES")) + inputs.append(client[1].InferInput("INPUT0", input_.shape, "BYTES")) if client[1] == tritonhttpclient: inputs[0].set_data_from_numpy(input_, client[3]) @@ -53,31 +53,26 @@ def _test_infer_unicode(self, model_name, client, input_): inputs[0].set_data_from_numpy(input_) if client[1] == tritonhttpclient: - outputs.append(client[1].InferRequestedOutput( - 'OUTPUT0', binary_data=client[2])) + outputs.append( + client[1].InferRequestedOutput("OUTPUT0", binary_data=client[2]) + ) else: - outputs.append(client[1].InferRequestedOutput('OUTPUT0')) + outputs.append(client[1].InferRequestedOutput("OUTPUT0")) - results = client[0].infer(model_name=model_name, - inputs=inputs, - outputs=outputs) + results = client[0].infer(model_name=model_name, inputs=inputs, outputs=outputs) - out0 = results.as_numpy('OUTPUT0') + out0 = results.as_numpy("OUTPUT0") # We expect there to be 1 results (with batch-size 1). Verify # that all 8 result elements are the same as the input. self.assertTrue(np.array_equal(input_, out0)) return out0 - def _test_infer_non_unicode(self, - model_name, - client, - input_, - binary_data=True): + def _test_infer_non_unicode(self, model_name, client, input_, binary_data=True): # Send inference request to the inference server. Get results for # both output tensors. inputs = [] outputs = [] - inputs.append(client[1].InferInput('INPUT0', input_.shape, "BYTES")) + inputs.append(client[1].InferInput("INPUT0", input_.shape, "BYTES")) if client[1] == tritonhttpclient: inputs[0].set_data_from_numpy(input_, client[3]) @@ -85,57 +80,58 @@ def _test_infer_non_unicode(self, inputs[0].set_data_from_numpy(input_) if client[1] == tritonhttpclient: - outputs.append(client[1].InferRequestedOutput( - 'OUTPUT0', binary_data=client[2])) + outputs.append( + client[1].InferRequestedOutput("OUTPUT0", binary_data=client[2]) + ) else: - outputs.append(client[1].InferRequestedOutput('OUTPUT0')) + outputs.append(client[1].InferRequestedOutput("OUTPUT0")) - results = client[0].infer(model_name=model_name, - inputs=inputs, - outputs=outputs) + results = client[0].infer(model_name=model_name, inputs=inputs, outputs=outputs) - out0 = results.as_numpy('OUTPUT0') + out0 = results.as_numpy("OUTPUT0") # We expect there to be 1 results (with batch-size 1). Verify # that all 8 result elements are the same as the input. if client[2]: self.assertTrue(np.array_equal(input_.astype(np.bytes_), out0)) else: self.assertTrue( - np.array_equal(input_.astype(np.bytes_), - out0.astype(np.bytes_))) + np.array_equal(input_.astype(np.bytes_), out0.astype(np.bytes_)) + ) return out0 - def _test_unicode_bytes_dtype(self, client, model_name, dtype='|S78'): + def _test_unicode_bytes_dtype(self, client, model_name, dtype="|S78"): # Create the data for the input tensor. Initialize the tensor to 8 # byte strings. 
(dtype of np.bytes_) # Sample string that should no longer cause failure - in0 = np.array([ - [ - b'\nF\n\'\n\x01a\x12"\x1a \n\x1e\xfa\x03\x94\x01\x0f\xd7\x02\xf1\x05\xdf\x01\x82\x03\xb5\x05\xc1\x07\xba\x06\xff\x06\xc7\x07L\xf5\x03\xe2\x07\xa9\x03\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04\xdf\\\xcb\xbf' - ], + in0 = np.array( [ - b'\n:\n\x1a\n\x01a\x12\x15\x1a\x13\n\x11*\xe3\x05\xc5\x06\xda\x07\xcb\x06~\xb1\x05\xb3\x01\xa9\x02\x15\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\xbb[\n\xbf' + [ + b"\nF\n'\n\x01a\x12\"\x1a \n\x1e\xfa\x03\x94\x01\x0f\xd7\x02\xf1\x05\xdf\x01\x82\x03\xb5\x05\xc1\x07\xba\x06\xff\x06\xc7\x07L\xf5\x03\xe2\x07\xa9\x03\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04\xdf\\\xcb\xbf" + ], + [ + b"\n:\n\x1a\n\x01a\x12\x15\x1a\x13\n\x11*\xe3\x05\xc5\x06\xda\x07\xcb\x06~\xb1\x05\xb3\x01\xa9\x02\x15\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\xbb[\n\xbf" + ], + [ + b"\nL\n-\n\x01a\x12(\x1a&\n$\x87\x07\xce\x01\xe7\x06\xee\x04\xe1\x03\xf1\x03\xd7\x07\xbe\x02\xb8\x05\xe0\x05\xe4\x01\x88\x06\xb6\x03\xb9\x05\x83\x06\xf8\x04\xe2\x04\xf4\x06\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04\xbc\x99+@" + ], + [ + b"\n2\n\x12\n\x01a\x12\r\x1a\x0b\n\t\x99\x02\xde\x04\x9f\x04\xc5\x053\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\x12\x07\x83\xbe" + ], + [ + b"\nJ\n\r\n\x01b\x12\x08\x1a\x06\n\x04\x9b\x94\xad\x04\n\r\n\x01c\x12\x08\x12\x06\n\x04\xc3\x8a\x08\xbf\n*\n\x01a\x12%\x1a#\n!\x9c\x02\xb2\x02\xcd\x02\x9d\x07\x8d\x01\xb6\x05a\xf1\x01\xf0\x05\xdb\x02\xac\x04\xbd\x05\xe0\x04\xd2\x06\xaf\x02\xa8\x01\x8b\x04" + ], + [ + b"\n3\n\x13\n\x01a\x12\x0e\x1a\x0c\n\n<\xe2\x05\x8a\x01\xb3\x07?\xfd\x01\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\x1b\x931\xbf\x00\x00" + ], + [ + b"\n&\n\x07\n\x01a\x12\x02\x1a\x00\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04{\xbc\x0e>\x00\x00\x00" + ], + [ + b"\nF\n'\n\x01a\x12\"\x1a \n\x1e\x97\x01\x93\x02\x9e\x01\xac\x06\xff\x01\xd8\x05\xe1\x07\xd8\x04g]\x9a\x05\xff\x06\xde\x07\x8f\x04\x97\x04\xda\x03\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x9a\xb7I\n\r\n\x01c\x12\x08\x12\x06\n\x04\xfb\x87\x83\xbf" + ], ], - [ - b'\nL\n-\n\x01a\x12(\x1a&\n$\x87\x07\xce\x01\xe7\x06\xee\x04\xe1\x03\xf1\x03\xd7\x07\xbe\x02\xb8\x05\xe0\x05\xe4\x01\x88\x06\xb6\x03\xb9\x05\x83\x06\xf8\x04\xe2\x04\xf4\x06\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04\xbc\x99+@' - ], - [ - b'\n2\n\x12\n\x01a\x12\r\x1a\x0b\n\t\x99\x02\xde\x04\x9f\x04\xc5\x053\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\x12\x07\x83\xbe' - ], - [ - b'\nJ\n\r\n\x01b\x12\x08\x1a\x06\n\x04\x9b\x94\xad\x04\n\r\n\x01c\x12\x08\x12\x06\n\x04\xc3\x8a\x08\xbf\n*\n\x01a\x12%\x1a#\n!\x9c\x02\xb2\x02\xcd\x02\x9d\x07\x8d\x01\xb6\x05a\xf1\x01\xf0\x05\xdb\x02\xac\x04\xbd\x05\xe0\x04\xd2\x06\xaf\x02\xa8\x01\x8b\x04' - ], - [ - b'\n3\n\x13\n\x01a\x12\x0e\x1a\x0c\n\n<\xe2\x05\x8a\x01\xb3\x07?\xfd\x01\n\r\n\x01b\x12\x08\x1a\x06\n\x04\xf6\xa2\xc5\x01\n\r\n\x01c\x12\x08\x12\x06\n\x04\x1b\x931\xbf\x00\x00' - ], - [ - b'\n&\n\x07\n\x01a\x12\x02\x1a\x00\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x89\xcc=\n\r\n\x01c\x12\x08\x12\x06\n\x04{\xbc\x0e>\x00\x00\x00' - ], - [ - b'\nF\n\'\n\x01a\x12"\x1a 
\n\x1e\x97\x01\x93\x02\x9e\x01\xac\x06\xff\x01\xd8\x05\xe1\x07\xd8\x04g]\x9a\x05\xff\x06\xde\x07\x8f\x04\x97\x04\xda\x03\n\x0c\n\x01b\x12\x07\x1a\x05\n\x03\x9a\xb7I\n\r\n\x01c\x12\x08\x12\x06\n\x04\xfb\x87\x83\xbf' - ] - ], - dtype=dtype).flatten() + dtype=dtype, + ).flatten() self._test_infer_unicode(model_name, client, in0) def _test_str_dtype(self, client, model_name, dtype=np.object_): @@ -151,25 +147,39 @@ def _test_bytes(self, model_name): # This clients will fail for binary_data=False when the binary input # is not UTF-8 encodable. They should work for other cases however. binary_false_clients = [ - (tritonhttpclient.InferenceServerClient("localhost:8000", - verbose=True), - tritonhttpclient, True, False), - (tritonhttpclient.InferenceServerClient("localhost:8000", - verbose=True), - tritonhttpclient, False, False), - (tritonhttpclient.InferenceServerClient("localhost:8000", - verbose=True), - tritonhttpclient, False, True), + ( + tritonhttpclient.InferenceServerClient("localhost:8000", verbose=True), + tritonhttpclient, + True, + False, + ), + ( + tritonhttpclient.InferenceServerClient("localhost:8000", verbose=True), + tritonhttpclient, + False, + False, + ), + ( + tritonhttpclient.InferenceServerClient("localhost:8000", verbose=True), + tritonhttpclient, + False, + True, + ), ] # These clients work for every data type other_clients = [ - (tritongrpcclient.InferenceServerClient("localhost:8001", - verbose=True), - tritongrpcclient, False), - (tritonhttpclient.InferenceServerClient("localhost:8000", - verbose=True), - tritonhttpclient, True, True), + ( + tritongrpcclient.InferenceServerClient("localhost:8001", verbose=True), + tritongrpcclient, + False, + ), + ( + tritonhttpclient.InferenceServerClient("localhost:8000", verbose=True), + tritonhttpclient, + True, + True, + ), ] for client in other_clients + binary_false_clients: @@ -194,5 +204,5 @@ def test_tf_unicode_bytes(self): self._test_bytes("string_identity") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_tf_gpu_io/tf_gpu_io_test.py b/qa/L0_tf_gpu_io/tf_gpu_io_test.py old mode 100644 new mode 100755 index 23cdb5252f..fd3550e434 --- a/qa/L0_tf_gpu_io/tf_gpu_io_test.py +++ b/qa/L0_tf_gpu_io/tf_gpu_io_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,30 +31,35 @@ sys.path.append("../common") import unittest -import numpy as np + import infer_util as iu +import numpy as np import test_util as tu TENSOR_SIZE = 16384 class TfGpuIoTest(tu.TestResultCollector): - - def _test_helper(self, - model_name, - shape, - override_input_names=[], - override_output_names=[], - batching_enabled=False): + def _test_helper( + self, + model_name, + shape, + override_input_names=[], + override_output_names=[], + batching_enabled=False, + ): try: bs = 1 if batching_enabled: - shape = [[ - bs, - ] + shape] + shape = [ + [ + bs, + ] + + shape + ] iu.infer_zero( self, - 'graphdef', + "graphdef", bs, np.float32, shape, @@ -66,26 +73,33 @@ def _test_helper(self, self.assertTrue(False, "unexpected error {}".format(ex)) def test_sig_tag0(self): - self._test_helper("sig_tag0", [16], - override_input_names=["INPUT"], - override_output_names=["OUTPUT"]) + self._test_helper( + "sig_tag0", + [16], + override_input_names=["INPUT"], + override_output_names=["OUTPUT"], + ) def test_graphdef_zero_1_float32_def(self): - self._test_helper("graphdef_zero_1_float32_def", [TENSOR_SIZE], - batching_enabled=True) + self._test_helper( + "graphdef_zero_1_float32_def", [TENSOR_SIZE], batching_enabled=True + ) def test_graphdef_zero_1_float32_gpu(self): - self._test_helper("graphdef_zero_1_float32_gpu", [TENSOR_SIZE], - batching_enabled=True) + self._test_helper( + "graphdef_zero_1_float32_gpu", [TENSOR_SIZE], batching_enabled=True + ) def test_savedmodel_zero_1_float32_def(self): - self._test_helper("savedmodel_zero_1_float32_def", [TENSOR_SIZE], - batching_enabled=True) + self._test_helper( + "savedmodel_zero_1_float32_def", [TENSOR_SIZE], batching_enabled=True + ) def test_savedmodel_zero_1_float32_gpu(self): - self._test_helper("savedmodel_zero_1_float32_gpu", [TENSOR_SIZE], - batching_enabled=True) + self._test_helper( + "savedmodel_zero_1_float32_gpu", [TENSOR_SIZE], batching_enabled=True + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_tf_parameters/test.sh b/qa/L0_tf_parameters/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_tf_parameters/tf_parameter_test.py b/qa/L0_tf_parameters/tf_parameter_test.py old mode 100644 new mode 100755 index 4cdd8aa045..f1a4621d93 --- a/qa/L0_tf_parameters/tf_parameter_test.py +++ b/qa/L0_tf_parameters/tf_parameter_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -25,53 +27,55 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import sys -sys.path.append('../common') +sys.path.append("../common") + +import unittest + +import numpy as np import test_util as tu import tritonclient.http as tritonhttpclient import tritonclient.utils -import numpy as np -import unittest class TFParameterTest(tu.TestResultCollector): - def setUp(self): - self._client = tritonhttpclient.InferenceServerClient("localhost:8000", - verbose=True) + self._client = tritonhttpclient.InferenceServerClient( + "localhost:8000", verbose=True + ) def _infer_helper(self): # The model has a single variable which is added to the input. Since the # variable is initialized to zero the input and output must match. 
- model_name = 'graphdef_variable' + model_name = "graphdef_variable" input = np.array([10], dtype=np.int32) inputs = [] - inputs.append(tritonhttpclient.InferInput('INPUT', input.shape, - 'INT32')) + inputs.append(tritonhttpclient.InferInput("INPUT", input.shape, "INT32")) inputs[-1].set_data_from_numpy(input) outputs = [] - outputs.append(tritonhttpclient.InferRequestedOutput('OUTPUT')) + outputs.append(tritonhttpclient.InferRequestedOutput("OUTPUT")) - results = self._client.infer(model_name=model_name, - inputs=inputs, - outputs=outputs) - output = results.as_numpy('OUTPUT') + results = self._client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) + output = results.as_numpy("OUTPUT") np.testing.assert_array_equal(output, input) def test_tf_variable(self): self._infer_helper() def test_tf_variable_error(self): - with self.assertRaises( - tritonclient.utils.InferenceServerException) as e: + with self.assertRaises(tritonclient.utils.InferenceServerException) as e: self._infer_helper() self.assertIn( - "FAILED_PRECONDITION: Could not find variable VARIABLE. This " + - "could mean that the variable has been deleted. In TF1, it can " + - "also mean the variable is uninitialized.", e.exception.message()) + "FAILED_PRECONDITION: Could not find variable VARIABLE. This " + + "could mean that the variable has been deleted. In TF1, it can " + + "also mean the variable is uninitialized.", + e.exception.message(), + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_tf_tag_sigdef/tf_tag_sigdef_test.py b/qa/L0_tf_tag_sigdef/tf_tag_sigdef_test.py old mode 100644 new mode 100755 index f4dcc5bdba..b4a11ac04e --- a/qa/L0_tf_tag_sigdef/tf_tag_sigdef_test.py +++ b/qa/L0_tf_tag_sigdef/tf_tag_sigdef_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -29,6 +31,7 @@ sys.path.append("../common") import unittest + import numpy as np import test_util as tu import tritonhttpclient as httpclient @@ -49,16 +52,14 @@ def _test_helper(self, modelVersion, tag, sig_def): # for details multiplier = modelVersion + 1 output_name = "OUTPUT" - triton_client = httpclient.InferenceServerClient("localhost:8000", - verbose=True) + triton_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT', shape, "FP32")) + inputs.append(httpclient.InferInput("INPUT", shape, "FP32")) input_data = np.ones(shape=shape).astype(np.float32) inputs[0].set_data_from_numpy(input_data, binary_data=True) - outputs.append( - httpclient.InferRequestedOutput(output_name, binary_data=True)) + outputs.append(httpclient.InferRequestedOutput(output_name, binary_data=True)) results = triton_client.infer(model_name, inputs, outputs=outputs) output_data = results.as_numpy(output_name) test_output = input_data * multiplier @@ -77,5 +78,5 @@ def test_tag_sig_def(self): self._test_helper(3, self.test_tag, self.test_sig_def) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_tf_unknown_rank/test.sh b/qa/L0_tf_unknown_rank/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_tf_unknown_rank/tf_unknown_rank_test.py b/qa/L0_tf_unknown_rank/tf_unknown_rank_test.py old mode 100644 new mode 100755 index 66297d671d..add6b32c13 --- a/qa/L0_tf_unknown_rank/tf_unknown_rank_test.py +++ b/qa/L0_tf_unknown_rank/tf_unknown_rank_test.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -29,6 +31,7 @@ sys.path.append("../common") import unittest + import numpy as np import test_util as tu import tritonhttpclient @@ -40,18 +43,19 @@ class UnknownRankTest(tu.TestResultCollector): def infer_unknown(self, model_name, tensor_shape): print("About to run the test") input_data = np.random.random_sample(tensor_shape).astype(np.float32) - client = tritonhttpclient.InferenceServerClient('localhost:8000') + client = tritonhttpclient.InferenceServerClient("localhost:8000") inputs = [ - tritonhttpclient.InferInput("INPUT", input_data.shape, - np_to_triton_dtype(input_data.dtype)) + tritonhttpclient.InferInput( + "INPUT", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) ] inputs[0].set_data_from_numpy(input_data) results = client.infer(model_name, inputs) - self.assertTrue(np.array_equal(results.as_numpy('OUTPUT'), input_data)) + self.assertTrue(np.array_equal(results.as_numpy("OUTPUT"), input_data)) def test_success(self): model_name = "unknown_rank_success" - tensor_shape = (1) + tensor_shape = 1 try: self.infer_unknown(model_name, tensor_shape) except InferenceServerException as ex: @@ -63,15 +67,16 @@ def test_wrong_input(self): try: self.infer_unknown(model_name, tensor_shape) self.fail( - "Found success when expected failure with model given " \ + "Found success when expected failure with model given " "wrong input tensor [1,2] for input [-1,1]." ) except InferenceServerException as ex: self.assertIn( - "unexpected shape for input \'INPUT\' for model " \ - "\'unknown_rank_wrong_output\'. 
Expected [1], got [1,2]", - ex.message()) + "unexpected shape for input 'INPUT' for model " + "'unknown_rank_wrong_output'. Expected [1], got [1,2]", + ex.message(), + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_tftrt_optimization/tftrt_optimization_test.py b/qa/L0_tftrt_optimization/tftrt_optimization_test.py old mode 100644 new mode 100755 index 9129d8d87d..9e59677317 --- a/qa/L0_tftrt_optimization/tftrt_optimization_test.py +++ b/qa/L0_tftrt_optimization/tftrt_optimization_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,47 +31,45 @@ sys.path.append("../common") import unittest + import numpy as np import test_util as tu import tritonhttpclient as httpclient class TFTRTOptimizationTest(tu.TestResultCollector): - def setUp(self): - self.input0_ = np.arange(start=0, stop=16, - dtype=np.float32).reshape(1, 16) + self.input0_ = np.arange(start=0, stop=16, dtype=np.float32).reshape(1, 16) self.input1_ = np.ones(shape=16, dtype=np.float32).reshape(1, 16) self.expected_output0_ = self.input0_ + self.input1_ self.expected_output1_ = self.input0_ - self.input1_ def _addsub_infer(self, model_name): - triton_client = httpclient.InferenceServerClient("localhost:8000", - verbose=True) + triton_client = httpclient.InferenceServerClient("localhost:8000", verbose=True) inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "FP32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "FP32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "FP32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "FP32")) # Initialize the data inputs[0].set_data_from_numpy(self.input0_, binary_data=True) inputs[1].set_data_from_numpy(self.input1_, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=True)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=True)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=True)) results = triton_client.infer(model_name, inputs, outputs=outputs) - output0_data = results.as_numpy('OUTPUT0') - output1_data = results.as_numpy('OUTPUT1') + output0_data = results.as_numpy("OUTPUT0") + output1_data = results.as_numpy("OUTPUT1") - self.assertTrue(np.array_equal(self.expected_output0_, output0_data), - "incorrect sum") - self.assertTrue(np.array_equal(self.expected_output1_, output1_data), - "incorrect difference") + self.assertTrue( + np.array_equal(self.expected_output0_, output0_data), "incorrect sum" + ) + self.assertTrue( + np.array_equal(self.expected_output1_, output1_data), "incorrect difference" + ) def test_graphdef(self): self._addsub_infer("graphdef_float32_float32_float32_trt") @@ -80,5 +80,5 @@ def test_savedmodel(self): self._addsub_infer("savedmodel_float32_float32_float32_param") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_trace/test.sh b/qa/L0_trace/test.sh index cf81a1a1ec..aeaa96e367 100755 --- a/qa/L0_trace/test.sh +++ b/qa/L0_trace/test.sh @@ -722,4 +722,4 @@ set -e kill $SERVER_PID wait $SERVER_PID -exit $RET +exit $RET \ No newline at end of file diff --git a/qa/L0_trace/trace_endpoint_test.py b/qa/L0_trace/trace_endpoint_test.py old mode 100644 new mode 100755 index 8f9c482656..a962da5d4c --- 
a/qa/L0_trace/trace_endpoint_test.py +++ b/qa/L0_trace/trace_endpoint_test.py @@ -30,18 +30,18 @@ sys.path.append("../common") +import json import sys import unittest -import tritonclient.http as httpclient + +import test_util as tu import tritonclient.grpc as grpcclient -import json +import tritonclient.http as httpclient from google.protobuf import json_format -import test_util as tu # Similar set up as dynamic batcher tests class TraceEndpointTest(tu.TestResultCollector): - def tearDown(self): # Clear all trace settings to initial state. # Note that the tearDown function uses HTTP client so the pass/fail @@ -53,13 +53,13 @@ def tearDown(self): "trace_level": None, "trace_rate": None, "trace_count": None, - "log_frequency": None + "log_frequency": None, } triton_client = httpclient.InferenceServerClient("localhost:8000") - triton_client.update_trace_settings(model_name="simple", - settings=clear_settings) - triton_client.update_trace_settings(model_name=None, - settings=clear_settings) + triton_client.update_trace_settings( + model_name="simple", settings=clear_settings + ) + triton_client.update_trace_settings(model_name=None, settings=clear_settings) def check_server_initial_state(self): # Helper function to make sure the trace setting is properly @@ -72,11 +72,12 @@ def check_server_initial_state(self): "trace_level": ["TIMESTAMPS"], "trace_rate": "1", "trace_count": "-1", - "log_frequency": "0" + "log_frequency": "0", } triton_client = httpclient.InferenceServerClient("localhost:8000") - self.assertEqual(initial_settings, - triton_client.get_trace_settings(model_name="simple")) + self.assertEqual( + initial_settings, triton_client.get_trace_settings(model_name="simple") + ) self.assertEqual(initial_settings, triton_client.get_trace_settings()) def test_http_get_settings(self): @@ -87,46 +88,50 @@ def test_http_get_settings(self): "trace_level": ["TIMESTAMPS"], "trace_rate": "1", "trace_count": "-1", - "log_frequency": "0" + "log_frequency": "0", } triton_client = httpclient.InferenceServerClient("localhost:8000") - self.assertEqual(initial_settings, - triton_client.get_trace_settings(model_name="simple"), - "Unexpected initial model trace settings") - self.assertEqual(initial_settings, triton_client.get_trace_settings(), - "Unexpected initial global settings") + self.assertEqual( + initial_settings, + triton_client.get_trace_settings(model_name="simple"), + "Unexpected initial model trace settings", + ) + self.assertEqual( + initial_settings, + triton_client.get_trace_settings(), + "Unexpected initial global settings", + ) def test_grpc_get_settings(self): # Model trace settings will be the same as global trace settings since # no update has been made. 
initial_settings = grpcclient.service_pb2.TraceSettingResponse() json_format.Parse( - json.dumps({ - "settings": { - "trace_file": { - "value": ["global_unittest.log"] - }, - "trace_level": { - "value": ["TIMESTAMPS"] - }, - "trace_rate": { - "value": ["1"] - }, - "trace_count": { - "value": ["-1"] - }, - "log_frequency": { - "value": ["0"] - }, + json.dumps( + { + "settings": { + "trace_file": {"value": ["global_unittest.log"]}, + "trace_level": {"value": ["TIMESTAMPS"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["0"]}, + } } - }), initial_settings) + ), + initial_settings, + ) triton_client = grpcclient.InferenceServerClient("localhost:8001") - self.assertEqual(initial_settings, - triton_client.get_trace_settings(model_name="simple"), - "Unexpected initial model trace settings") - self.assertEqual(initial_settings, triton_client.get_trace_settings(), - "Unexpected initial global settings") + self.assertEqual( + initial_settings, + triton_client.get_trace_settings(model_name="simple"), + "Unexpected initial model trace settings", + ) + self.assertEqual( + initial_settings, + triton_client.get_trace_settings(), + "Unexpected initial global settings", + ) def test_http_update_settings(self): # Update model and global trace settings in order, @@ -139,47 +144,51 @@ def test_http_update_settings(self): "trace_level": ["TIMESTAMPS"], "trace_rate": "1", "trace_count": "-1", - "log_frequency": "0" + "log_frequency": "0", } expected_second_model_settings = { "trace_file": "model.log", "trace_level": ["TIMESTAMPS", "TENSORS"], "trace_rate": "1", "trace_count": "-1", - "log_frequency": "0" + "log_frequency": "0", } expected_global_settings = { "trace_file": "another.log", "trace_level": ["TIMESTAMPS", "TENSORS"], "trace_rate": "1", "trace_count": "-1", - "log_frequency": "0" + "log_frequency": "0", } model_update_settings = {"trace_file": "model.log"} global_update_settings = { "trace_file": "another.log", - "trace_level": ["TIMESTAMPS", "TENSORS"] + "trace_level": ["TIMESTAMPS", "TENSORS"], } triton_client = httpclient.InferenceServerClient("localhost:8000") self.assertEqual( expected_first_model_settings, - triton_client.update_trace_settings(model_name="simple", - settings=model_update_settings), - "Unexpected updated model trace settings") + triton_client.update_trace_settings( + model_name="simple", settings=model_update_settings + ), + "Unexpected updated model trace settings", + ) # Note that 'trace_level' may be mismatch due to the order of # the levels listed, currently we assume the order is the same # for simplicity. But the order shouldn't be enforced and this checking # needs to be improved when this kind of failure is reported self.assertEqual( expected_global_settings, - triton_client.update_trace_settings( - settings=global_update_settings), - "Unexpected updated global settings") - self.assertEqual(expected_second_model_settings, - triton_client.get_trace_settings(model_name="simple"), - "Unexpected model trace settings after global update") + triton_client.update_trace_settings(settings=global_update_settings), + "Unexpected updated global settings", + ) + self.assertEqual( + expected_second_model_settings, + triton_client.get_trace_settings(model_name="simple"), + "Unexpected model trace settings after global update", + ) def test_grpc_update_settings(self): # Update model and global trace settings in order, @@ -187,98 +196,82 @@ def test_grpc_update_settings(self): # the model setting fields that haven't been specified. 
self.check_server_initial_state() - expected_first_model_settings = grpcclient.service_pb2.TraceSettingResponse( - ) + expected_first_model_settings = grpcclient.service_pb2.TraceSettingResponse() json_format.Parse( - json.dumps({ - "settings": { - "trace_file": { - "value": ["model.log"] - }, - "trace_level": { - "value": ["TIMESTAMPS"] - }, - "trace_rate": { - "value": ["1"] - }, - "trace_count": { - "value": ["-1"] - }, - "log_frequency": { - "value": ["0"] - }, + json.dumps( + { + "settings": { + "trace_file": {"value": ["model.log"]}, + "trace_level": {"value": ["TIMESTAMPS"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["0"]}, + } } - }), expected_first_model_settings) - - expected_second_model_settings = grpcclient.service_pb2.TraceSettingResponse( + ), + expected_first_model_settings, ) + + expected_second_model_settings = grpcclient.service_pb2.TraceSettingResponse() json_format.Parse( - json.dumps({ - "settings": { - "trace_file": { - "value": ["model.log"] - }, - "trace_level": { - "value": ["TIMESTAMPS", "TENSORS"] - }, - "trace_rate": { - "value": ["1"] - }, - "trace_count": { - "value": ["-1"] - }, - "log_frequency": { - "value": ["0"] - }, + json.dumps( + { + "settings": { + "trace_file": {"value": ["model.log"]}, + "trace_level": {"value": ["TIMESTAMPS", "TENSORS"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["0"]}, + } } - }), expected_second_model_settings) + ), + expected_second_model_settings, + ) expected_global_settings = grpcclient.service_pb2.TraceSettingResponse() json_format.Parse( - json.dumps({ - "settings": { - "trace_file": { - "value": ["another.log"] - }, - "trace_level": { - "value": ["TIMESTAMPS", "TENSORS"] - }, - "trace_rate": { - "value": ["1"] - }, - "trace_count": { - "value": ["-1"] - }, - "log_frequency": { - "value": ["0"] - }, + json.dumps( + { + "settings": { + "trace_file": {"value": ["another.log"]}, + "trace_level": {"value": ["TIMESTAMPS", "TENSORS"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["0"]}, + } } - }), expected_global_settings) + ), + expected_global_settings, + ) model_update_settings = {"trace_file": "model.log"} global_update_settings = { "trace_file": "another.log", - "trace_level": ["TIMESTAMPS", "TENSORS"] + "trace_level": ["TIMESTAMPS", "TENSORS"], } triton_client = grpcclient.InferenceServerClient("localhost:8001") self.assertEqual( expected_first_model_settings, - triton_client.update_trace_settings(model_name="simple", - settings=model_update_settings), - "Unexpected updated model trace settings") + triton_client.update_trace_settings( + model_name="simple", settings=model_update_settings + ), + "Unexpected updated model trace settings", + ) # Note that 'trace_level' may be mismatch due to the order of # the levels listed, currently we assume the order is the same # for simplicity. 
But the order shouldn't be enforced and this checking # needs to be improved when this kind of failure is reported self.assertEqual( expected_global_settings, - triton_client.update_trace_settings( - settings=global_update_settings), - "Unexpected updated global settings") - self.assertEqual(expected_second_model_settings, - triton_client.get_trace_settings(model_name="simple"), - "Unexpected model trace settings after global update") + triton_client.update_trace_settings(settings=global_update_settings), + "Unexpected updated global settings", + ) + self.assertEqual( + expected_second_model_settings, + triton_client.get_trace_settings(model_name="simple"), + "Unexpected model trace settings after global update", + ) def test_http_clear_settings(self): # Clear global and model trace settings in order, @@ -290,37 +283,33 @@ def test_http_clear_settings(self): # model 'simple' has 'trace_rate' and 'log_frequency' specified # global has 'trace_level', 'trace_count' and 'trace_rate' specified triton_client = httpclient.InferenceServerClient("localhost:8000") - triton_client.update_trace_settings(model_name="simple", - settings={ - "trace_rate": "12", - "log_frequency": "34" - }) - triton_client.update_trace_settings(settings={ - "trace_rate": "56", - "trace_count": "78", - "trace_level": ["OFF"] - }) + triton_client.update_trace_settings( + model_name="simple", settings={"trace_rate": "12", "log_frequency": "34"} + ) + triton_client.update_trace_settings( + settings={"trace_rate": "56", "trace_count": "78", "trace_level": ["OFF"]} + ) expected_global_settings = { "trace_file": "global_unittest.log", "trace_level": ["OFF"], "trace_rate": "1", "trace_count": "-1", - "log_frequency": "0" + "log_frequency": "0", } expected_first_model_settings = { "trace_file": "global_unittest.log", "trace_level": ["OFF"], "trace_rate": "12", "trace_count": "-1", - "log_frequency": "34" + "log_frequency": "34", } expected_second_model_settings = { "trace_file": "global_unittest.log", "trace_level": ["OFF"], "trace_rate": "1", "trace_count": "-1", - "log_frequency": "34" + "log_frequency": "34", } global_clear_settings = {"trace_rate": None, "trace_count": None} model_clear_settings = {"trace_rate": None, "trace_level": None} @@ -329,18 +318,25 @@ def test_http_clear_settings(self): self.assertEqual( expected_global_settings, triton_client.update_trace_settings(settings=global_clear_settings), - "Unexpected cleared global trace settings") - self.assertEqual(expected_first_model_settings, - triton_client.get_trace_settings(model_name="simple"), - "Unexpected model trace settings after global clear") + "Unexpected cleared global trace settings", + ) + self.assertEqual( + expected_first_model_settings, + triton_client.get_trace_settings(model_name="simple"), + "Unexpected model trace settings after global clear", + ) self.assertEqual( expected_second_model_settings, - triton_client.update_trace_settings(model_name="simple", - settings=model_clear_settings), - "Unexpected model trace settings after model clear") - self.assertEqual(expected_global_settings, - triton_client.get_trace_settings(), - "Unexpected global trace settings after model clear") + triton_client.update_trace_settings( + model_name="simple", settings=model_clear_settings + ), + "Unexpected model trace settings after model clear", + ) + self.assertEqual( + expected_global_settings, + triton_client.get_trace_settings(), + "Unexpected global trace settings after model clear", + ) def test_grpc_clear_settings(self): # Clear global and model trace 
settings in order, @@ -352,82 +348,58 @@ def test_grpc_clear_settings(self): # model 'simple' has 'trace_rate' and 'log_frequency' specified # global has 'trace_level', 'trace_count' and 'trace_rate' specified triton_client = grpcclient.InferenceServerClient("localhost:8001") - triton_client.update_trace_settings(model_name="simple", - settings={ - "trace_rate": "12", - "log_frequency": "34" - }) - triton_client.update_trace_settings(settings={ - "trace_rate": "56", - "trace_count": "78", - "trace_level": ["OFF"] - }) + triton_client.update_trace_settings( + model_name="simple", settings={"trace_rate": "12", "log_frequency": "34"} + ) + triton_client.update_trace_settings( + settings={"trace_rate": "56", "trace_count": "78", "trace_level": ["OFF"]} + ) expected_global_settings = grpcclient.service_pb2.TraceSettingResponse() json_format.Parse( - json.dumps({ - "settings": { - "trace_file": { - "value": ["global_unittest.log"] - }, - "trace_level": { - "value": ["OFF"] - }, - "trace_rate": { - "value": ["1"] - }, - "trace_count": { - "value": ["-1"] - }, - "log_frequency": { - "value": ["0"] - }, + json.dumps( + { + "settings": { + "trace_file": {"value": ["global_unittest.log"]}, + "trace_level": {"value": ["OFF"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["0"]}, + } } - }), expected_global_settings) - expected_first_model_settings = grpcclient.service_pb2.TraceSettingResponse( + ), + expected_global_settings, ) + expected_first_model_settings = grpcclient.service_pb2.TraceSettingResponse() json_format.Parse( - json.dumps({ - "settings": { - "trace_file": { - "value": ["global_unittest.log"] - }, - "trace_level": { - "value": ["OFF"] - }, - "trace_rate": { - "value": ["12"] - }, - "trace_count": { - "value": ["-1"] - }, - "log_frequency": { - "value": ["34"] - }, + json.dumps( + { + "settings": { + "trace_file": {"value": ["global_unittest.log"]}, + "trace_level": {"value": ["OFF"]}, + "trace_rate": {"value": ["12"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["34"]}, + } } - }), expected_first_model_settings) - expected_second_model_settings = grpcclient.service_pb2.TraceSettingResponse( + ), + expected_first_model_settings, ) + expected_second_model_settings = grpcclient.service_pb2.TraceSettingResponse() json_format.Parse( - json.dumps({ - "settings": { - "trace_file": { - "value": ["global_unittest.log"] - }, - "trace_level": { - "value": ["OFF"] - }, - "trace_rate": { - "value": ["1"] - }, - "trace_count": { - "value": ["-1"] - }, - "log_frequency": { - "value": ["34"] - }, + json.dumps( + { + "settings": { + "trace_file": {"value": ["global_unittest.log"]}, + "trace_level": {"value": ["OFF"]}, + "trace_rate": {"value": ["1"]}, + "trace_count": {"value": ["-1"]}, + "log_frequency": {"value": ["34"]}, + } } - }), expected_second_model_settings) + ), + expected_second_model_settings, + ) global_clear_settings = {"trace_rate": None, "trace_count": None} model_clear_settings = {"trace_rate": None, "trace_level": None} @@ -436,19 +408,26 @@ def test_grpc_clear_settings(self): self.assertEqual( expected_global_settings, triton_client.update_trace_settings(settings=global_clear_settings), - "Unexpected cleared global trace settings") - self.assertEqual(expected_first_model_settings, - triton_client.get_trace_settings(model_name="simple"), - "Unexpected model trace settings after global clear") + "Unexpected cleared global trace settings", + ) + self.assertEqual( + expected_first_model_settings, + 
triton_client.get_trace_settings(model_name="simple"), + "Unexpected model trace settings after global clear", + ) self.assertEqual( expected_second_model_settings, - triton_client.update_trace_settings(model_name="simple", - settings=model_clear_settings), - "Unexpected model trace settings after model clear") - self.assertEqual(expected_global_settings, - triton_client.get_trace_settings(), - "Unexpected global trace settings after model clear") + triton_client.update_trace_settings( + model_name="simple", settings=model_clear_settings + ), + "Unexpected model trace settings after model clear", + ) + self.assertEqual( + expected_global_settings, + triton_client.get_trace_settings(), + "Unexpected global trace settings after model clear", + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_triton_repo_agent/test.sh b/qa/L0_triton_repo_agent/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_trt_data_dependent_shape/trt_data_dependent_shape_test.py b/qa/L0_trt_data_dependent_shape/trt_data_dependent_shape_test.py old mode 100644 new mode 100755 index 3f2eeeaa40..ee0b675d84 --- a/qa/L0_trt_data_dependent_shape/trt_data_dependent_shape_test.py +++ b/qa/L0_trt_data_dependent_shape/trt_data_dependent_shape_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,16 +31,17 @@ sys.path.append("../common") import unittest + import numpy as np import test_util as tu import tritonclient.http as client class TrtDataDependentShapeTest(tu.TestResultCollector): - def setUp(self): - self.triton_client = client.InferenceServerClient("localhost:8000", - verbose=True) + self.triton_client = client.InferenceServerClient( + "localhost:8000", verbose=True + ) def test_fixed(self): model_name = "plan_nobatch_nonzero_fixed" @@ -46,15 +49,16 @@ def test_fixed(self): expected_output_np = np.nonzero(input_np) inputs = [] - inputs.append(client.InferInput('INPUT', [4, 4], "INT32")) + inputs.append(client.InferInput("INPUT", [4, 4], "INT32")) inputs[-1].set_data_from_numpy(input_np) results = self.triton_client.infer(model_name=model_name, inputs=inputs) # Validate the results by comparing with precomputed values. - output_np = results.as_numpy('OUTPUT') + output_np = results.as_numpy("OUTPUT") self.assertTrue( np.array_equal(output_np, expected_output_np), - "OUTPUT expected: {}, got {}".format(expected_output_np, output_np)) + "OUTPUT expected: {}, got {}".format(expected_output_np, output_np), + ) def test_dynamic(self): model_name = "plan_nobatch_nonzero_dynamic" @@ -65,16 +69,17 @@ def test_dynamic(self): expected_output_np = np.nonzero(input_np) inputs = [] - inputs.append(client.InferInput('INPUT', [20, 16], "INT32")) + inputs.append(client.InferInput("INPUT", [20, 16], "INT32")) inputs[-1].set_data_from_numpy(input_np) results = self.triton_client.infer(model_name=model_name, inputs=inputs) # Validate the results by comparing with precomputed values. 
- output_np = results.as_numpy('OUTPUT') + output_np = results.as_numpy("OUTPUT") self.assertTrue( np.array_equal(output_np, expected_output_np), - "OUTPUT expected: {}, got {}".format(expected_output_np, output_np)) + "OUTPUT expected: {}, got {}".format(expected_output_np, output_np), + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_trt_dla/dla_test.py b/qa/L0_trt_dla/dla_test.py old mode 100644 new mode 100755 index c4fe48a22d..d71d277ac4 --- a/qa/L0_trt_dla/dla_test.py +++ b/qa/L0_trt_dla/dla_test.py @@ -30,22 +30,21 @@ sys.path.append("../common") import unittest + import numpy as np -from PIL import Image import test_util as tu - import tritonclient.http as httpclient +from PIL import Image class InferTest(tu.TestResultCollector): - def _preprocess(self, img, dtype): """ Pre-process an image to meet the size and type requirements specified by the parameters. """ - sample_img = img.convert('RGB') + sample_img = img.convert("RGB") resized_img = sample_img.resize((224, 224), Image.BILINEAR) resized = np.array(resized_img) @@ -57,8 +56,7 @@ def _preprocess(self, img, dtype): def test_resnet50(self): try: - triton_client = httpclient.InferenceServerClient( - url="localhost:8000") + triton_client = httpclient.InferenceServerClient(url="localhost:8000") except Exception as e: print("channel creation failed: " + str(e)) sys.exit(1) @@ -74,22 +72,21 @@ def test_resnet50(self): batched_image_data = image_data for i in range(1, batch_size): batched_image_data = np.concatenate( - (batched_image_data, image_data), axis=0) + (batched_image_data, image_data), axis=0 + ) inputs = [ - httpclient.InferInput('input_tensor_0', [batch_size, 3, 224, 224], - 'INT8') + httpclient.InferInput("input_tensor_0", [batch_size, 3, 224, 224], "INT8") ] inputs[0].set_data_from_numpy(batched_image_data, binary_data=True) outputs = [ - httpclient.InferRequestedOutput('topk_layer_output_index', - binary_data=True) + httpclient.InferRequestedOutput("topk_layer_output_index", binary_data=True) ] results = triton_client.infer(model_name, inputs, outputs=outputs) - output_data = results.as_numpy('topk_layer_output_index') + output_data = results.as_numpy("topk_layer_output_index") print(output_data) # Validate the results by comparing with precomputed values. @@ -99,5 +96,5 @@ def test_resnet50(self): self.assertEqual(output_data[i][0][0], EXPECTED_CLASS_INDEX) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_trt_dla/test.sh b/qa/L0_trt_dla/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_trt_dynamic_shape/test.sh b/qa/L0_trt_dynamic_shape/test.sh index 99ecc7f2b8..43a39dd199 100755 --- a/qa/L0_trt_dynamic_shape/test.sh +++ b/qa/L0_trt_dynamic_shape/test.sh @@ -305,7 +305,7 @@ kill $SERVER_PID wait $SERVER_PID -# Adding test cases for mulitple optimization profiles with static shapes. +# Adding test cases for multiple optimization profiles with static shapes. # Will load only the following profiles with the static shapes: # Profile 7: [1, 33] # Profile 8: [3, 33] diff --git a/qa/L0_trt_dynamic_shape/trt_dynamic_shape_test.py b/qa/L0_trt_dynamic_shape/trt_dynamic_shape_test.py old mode 100644 new mode 100755 index d01bc51ee1..d9f890d9b6 --- a/qa/L0_trt_dynamic_shape/trt_dynamic_shape_test.py +++ b/qa/L0_trt_dynamic_shape/trt_dynamic_shape_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -29,33 +31,48 @@ sys.path.append("../common") import unittest -import numpy as np + import infer_util as iu +import numpy as np import test_util as tu import tritonhttpclient from tritonclientutils import InferenceServerException class TrtDynamicShapeTest(tu.TestResultCollector): - def setUp(self): self.dtype_ = np.float32 - self.model_name_ = 'plan' + self.model_name_ = "plan" def test_load_specific_optimization_profile(self): # Only OP 5 should be available, which only allow batch size 8 tensor_shape = (1,) try: - iu.infer_exact(self, self.model_name_, (1,) + tensor_shape, 1, - self.dtype_, self.dtype_, self.dtype_) + iu.infer_exact( + self, + self.model_name_, + (1,) + tensor_shape, + 1, + self.dtype_, + self.dtype_, + self.dtype_, + ) except InferenceServerException as ex: self.assertTrue( "model expected the shape of dimension 0 to be between 6 and 8 but received 1" - in ex.message()) + in ex.message() + ) try: - iu.infer_exact(self, self.model_name_, (8,) + tensor_shape, 8, - self.dtype_, self.dtype_, self.dtype_) + iu.infer_exact( + self, + self.model_name_, + (8,) + tensor_shape, + 8, + self.dtype_, + self.dtype_, + self.dtype_, + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) @@ -64,37 +81,60 @@ def test_load_default_optimization_profile(self): tensor_shape = (33,) try: - iu.infer_exact(self, self.model_name_, (8,) + tensor_shape, 8, - self.dtype_, self.dtype_, self.dtype_) + iu.infer_exact( + self, + self.model_name_, + (8,) + tensor_shape, + 8, + self.dtype_, + self.dtype_, + self.dtype_, + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) over_tensor_shape = (34,) try: - iu.infer_exact(self, self.model_name_, (8,) + over_tensor_shape, 8, - self.dtype_, self.dtype_, self.dtype_) + iu.infer_exact( + self, + self.model_name_, + (8,) + over_tensor_shape, + 8, + self.dtype_, + self.dtype_, + self.dtype_, + ) except InferenceServerException as ex: self.assertTrue( "model expected the shape of dimension 1 to be between 1 and 33 but received 34" - in ex.message()) + in ex.message() + ) def test_select_optimization_profile(self): # Different profile has different optimized input shape batch_size = 4 tensor_shape = (16,) try: - iu.infer_exact(self, self.model_name_, (batch_size,) + tensor_shape, - batch_size, self.dtype_, self.dtype_, self.dtype_) + iu.infer_exact( + self, + self.model_name_, + (batch_size,) + tensor_shape, + batch_size, + self.dtype_, + self.dtype_, + self.dtype_, + ) except InferenceServerException as ex: self.assertTrue(False, "unexpected error {}".format(ex)) def test_load_wrong_optimization_profile(self): client = tritonhttpclient.InferenceServerClient("localhost:8000") - model_name = tu.get_model_name(self.model_name_, self.dtype_, - self.dtype_, self.dtype_) + model_name = tu.get_model_name( + self.model_name_, self.dtype_, self.dtype_, self.dtype_ + ) model_status = client.is_model_ready(model_name, "1") self.assertFalse(model_status, "expected model to be not ready") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_trt_error_propagation/trt_error_propagation_test.py b/qa/L0_trt_error_propagation/trt_error_propagation_test.py old mode 100644 new mode 100755 index 69c7ecaa28..83527a7533 --- a/qa/L0_trt_error_propagation/trt_error_propagation_test.py +++ b/qa/L0_trt_error_propagation/trt_error_propagation_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + 
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -25,16 +27,15 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import unittest + import tritonclient.grpc as grpcclient from tritonclient.utils import InferenceServerException class TestTrtErrorPropagation(unittest.TestCase): - def setUp(self): # Initialize client - self.__triton = grpcclient.InferenceServerClient("localhost:8001", - verbose=True) + self.__triton = grpcclient.InferenceServerClient("localhost:8001", verbose=True) def test_invalid_trt_model(self): with self.assertRaises(InferenceServerException) as cm: @@ -42,13 +43,18 @@ def test_invalid_trt_model(self): err_msg = str(cm.exception) # All 'expected_msg_parts' should be present in the 'err_msg' in order expected_msg_parts = [ - "load failed for model", "version 1 is at UNAVAILABLE state: ", - "Internal: unable to create TensorRT engine: ", "Error Code ", - "Internal Error " + "load failed for model", + "version 1 is at UNAVAILABLE state: ", + "Internal: unable to create TensorRT engine: ", + "Error Code ", + "Internal Error ", ] for expected_msg_part in expected_msg_parts: - self.assertIn(expected_msg_part, err_msg, - "Cannot find an expected part of error message") + self.assertIn( + expected_msg_part, + err_msg, + "Cannot find an expected part of error message", + ) _, err_msg = err_msg.split(expected_msg_part) def test_invalid_trt_model_autocomplete(self): @@ -57,8 +63,10 @@ def test_invalid_trt_model_autocomplete(self): err_msg = str(cm.exception) self.assertIn( "Internal: unable to load plan file to auto complete config", - err_msg, "Caught an unexpected exception") + err_msg, + "Caught an unexpected exception", + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_trt_plugin/test.sh b/qa/L0_trt_plugin/test.sh old mode 100644 new mode 100755 diff --git a/qa/L0_trt_plugin/trt_plugin_test.py b/qa/L0_trt_plugin/trt_plugin_test.py old mode 100644 new mode 100755 index 8862348f7d..36f87335b6 --- a/qa/L0_trt_plugin/trt_plugin_test.py +++ b/qa/L0_trt_plugin/trt_plugin_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -28,55 +30,52 @@ sys.path.append("../common") +import os import unittest + import numpy as np -import os import test_util as tu - import tritonclient.http as httpclient # By default, find tritonserver on "localhost", but can be overridden # with TRITONSERVER_IPADDR envvar -_tritonserver_ipaddr = os.environ.get('TRITONSERVER_IPADDR', 'localhost') +_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost") class PluginModelTest(tu.TestResultCollector): - def _full_exact(self, model_name, plugin_name, shape): print(f"{_tritonserver_ipaddr}:8000") - triton_client = httpclient.InferenceServerClient( - f"{_tritonserver_ipaddr}:8000") + triton_client = httpclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8000") inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', list(shape), "FP32")) + inputs.append(httpclient.InferInput("INPUT0", list(shape), "FP32")) input0_data = np.ones(shape=shape).astype(np.float32) inputs[0].set_data_from_numpy(input0_data, binary_data=True) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=True)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) - results = triton_client.infer(model_name + '_' + plugin_name, - inputs, - outputs=outputs) + results = triton_client.infer( + model_name + "_" + plugin_name, inputs, outputs=outputs + ) - output0_data = results.as_numpy('OUTPUT0') + output0_data = results.as_numpy("OUTPUT0") # Verify values of Clip, GELU, and Normalize - if plugin_name == 'CustomClipPlugin': + if plugin_name == "CustomClipPlugin": # Clip data to minimum of .1, maximum of .5 test_output = np.clip(input0_data, 0.1, 0.5) self.assertTrue(np.isclose(output0_data, test_output).all()) - elif plugin_name == 'CustomGeluPluginDynamic': + elif plugin_name == "CustomGeluPluginDynamic": # Add bias input0_data += 1 # Calculate Gelu activation - test_output = (input0_data * - 0.5) * (1 + np.tanh((0.797885 * input0_data) + - (0.035677 * (input0_data**3)))) + test_output = (input0_data * 0.5) * ( + 1 + np.tanh((0.797885 * input0_data) + (0.035677 * (input0_data**3))) + ) self.assertTrue(np.isclose(output0_data, test_output).all()) - elif plugin_name == 'Normalize_TRT': + elif plugin_name == "Normalize_TRT": # L2 norm is sqrt(sum([1]*16))) test_output = input0_data / np.sqrt(sum([1] * 16)) self.assertTrue(np.isclose(output0_data, test_output).all()) @@ -85,19 +84,24 @@ def _full_exact(self, model_name, plugin_name, shape): def test_raw_fff_clip(self): for bs in (1, 8): - self._full_exact('plan_float32_float32_float32', 'CustomClipPlugin', - (bs, 16)) + self._full_exact( + "plan_float32_float32_float32", "CustomClipPlugin", (bs, 16) + ) def test_raw_fff_gelu(self): - self._full_exact('plan_nobatch_float32_float32_float32', - 'CustomGeluPluginDynamic', (16, 1, 1)) + self._full_exact( + "plan_nobatch_float32_float32_float32", + "CustomGeluPluginDynamic", + (16, 1, 1), + ) def test_raw_fff_norm(self): # model that supports batching for bs in (1, 8): - self._full_exact('plan_float32_float32_float32', 'Normalize_TRT', - (bs, 16, 16, 16)) + self._full_exact( + "plan_float32_float32_float32", "Normalize_TRT", (bs, 16, 16, 16) + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_trt_reformat_free/trt_reformat_free_test.py b/qa/L0_trt_reformat_free/trt_reformat_free_test.py old mode 100644 new mode 100755 index 4192b878d8..ea36f9c24a --- 
a/qa/L0_trt_reformat_free/trt_reformat_free_test.py +++ b/qa/L0_trt_reformat_free/trt_reformat_free_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,12 +30,13 @@ sys.path.append("../common") -from builtins import range import unittest +from builtins import range + import numpy as np import test_util as tu -import tritonhttpclient import tritonclient.utils.shared_memory as shm +import tritonhttpclient def div_up(a, b): @@ -47,36 +50,36 @@ def reformat(format, tensor_np): factor = 32 else: raise ValueError( - "Unexpected format {} for testing reformat-free input".format( - format)) + "Unexpected format {} for testing reformat-free input".format(format) + ) shape = list(tensor_np.shape) + [factor] shape[-4] = div_up(shape[-4], factor) reformatted_tensor_np = np.empty(shape, tensor_np.dtype) if len(tensor_np.shape) == 3: batch = [(tensor_np, reformatted_tensor_np)] elif len(tensor_np.shape) == 4: - batch = [(tensor_np[idx], reformatted_tensor_np[idx]) - for idx in range(tensor_np.shape[0])] + batch = [ + (tensor_np[idx], reformatted_tensor_np[idx]) + for idx in range(tensor_np.shape[0]) + ] else: raise ValueError( "Unexpected numpy shape {} for testing reformat-free input".format( - tensor_np.shape)) - for (tensor, reformatted_tensor) in batch: + tensor_np.shape + ) + ) + for tensor, reformatted_tensor in batch: for c in range(tensor.shape[0]): for h in range(tensor.shape[1]): for w in range(tensor.shape[2]): - reformatted_tensor[c // - factor][h][w][c % - factor] = tensor[c][h][w] + reformatted_tensor[c // factor][h][w][c % factor] = tensor[c][h][w] return reformatted_tensor_np class TrtReformatFreeTest(tu.TestResultCollector): - def add_reformat_free_data_as_shared_memory(self, name, tensor, tensor_np): byte_size = tensor_np.size * tensor_np.dtype.itemsize - self.shm_handles.append( - shm.create_shared_memory_region(name, name, byte_size)) + self.shm_handles.append(shm.create_shared_memory_region(name, name, byte_size)) # Put data values into shared memory shm.set_shared_memory_region(self.shm_handles[-1], [tensor_np]) # Register shared memory with Triton Server @@ -87,7 +90,8 @@ def add_reformat_free_data_as_shared_memory(self, name, tensor, tensor_np): def setUp(self): self.shm_handles = [] self.triton_client = tritonhttpclient.InferenceServerClient( - "localhost:8000", verbose=True) + "localhost:8000", verbose=True + ) def tearDown(self): self.triton_client.unregister_system_shared_memory() @@ -105,39 +109,42 @@ def test_nobatch_chw2_input(self): # for non-linear format tensor, the data buffer is padded and thus the # data byte size may not match what is calculated from tensor shape inputs = [] - inputs.append(tritonhttpclient.InferInput('INPUT0', [13, 2, 1], "FP16")) - self.add_reformat_free_data_as_shared_memory("input0", inputs[-1], - reformatted_input_np) - inputs.append(tritonhttpclient.InferInput('INPUT1', [13, 2, 1], "FP16")) - self.add_reformat_free_data_as_shared_memory("input1", inputs[-1], - reformatted_input_np) + inputs.append(tritonhttpclient.InferInput("INPUT0", [13, 2, 1], "FP16")) + self.add_reformat_free_data_as_shared_memory( + "input0", inputs[-1], reformatted_input_np + ) + inputs.append(tritonhttpclient.InferInput("INPUT1", [13, 2, 1], "FP16")) + self.add_reformat_free_data_as_shared_memory( + "input1", inputs[-1], reformatted_input_np + ) outputs = [] outputs.append( - 
tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) - results = self.triton_client.infer(model_name=model_name, - inputs=inputs, - outputs=outputs) + results = self.triton_client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) # Validate the results by comparing with precomputed values. - output0_np = results.as_numpy('OUTPUT0') - output1_np = results.as_numpy('OUTPUT1') + output0_np = results.as_numpy("OUTPUT0") + output1_np = results.as_numpy("OUTPUT1") self.assertTrue( np.array_equal(output0_np, expected_output0_np), - "OUTPUT0 expected: {}, got {}".format(expected_output0_np, - output0_np)) + "OUTPUT0 expected: {}, got {}".format(expected_output0_np, output0_np), + ) self.assertTrue( np.array_equal(output1_np, expected_output1_np), - "OUTPUT0 expected: {}, got {}".format(expected_output1_np, - output1_np)) + "OUTPUT0 expected: {}, got {}".format(expected_output1_np, output1_np), + ) def test_chw2_input(self): model_name = "plan_CHW2_LINEAR_float16_float16_float16" for bs in [1, 8]: - input_np = np.arange(26 * bs, dtype=np.float16).reshape( - (bs, 13, 2, 1)) + input_np = np.arange(26 * bs, dtype=np.float16).reshape((bs, 13, 2, 1)) expected_output0_np = input_np + input_np expected_output1_np = input_np - input_np reformatted_input_np = reformat("CHW2", input_np) @@ -147,37 +154,37 @@ def test_chw2_input(self): # and thus the data byte size may not match what is calculated from # tensor shape inputs = [] - inputs.append( - tritonhttpclient.InferInput('INPUT0', [bs, 13, 2, 1], "FP16")) + inputs.append(tritonhttpclient.InferInput("INPUT0", [bs, 13, 2, 1], "FP16")) self.add_reformat_free_data_as_shared_memory( - "input0" + str(bs), inputs[-1], reformatted_input_np) - inputs.append( - tritonhttpclient.InferInput('INPUT1', [bs, 13, 2, 1], "FP16")) + "input0" + str(bs), inputs[-1], reformatted_input_np + ) + inputs.append(tritonhttpclient.InferInput("INPUT1", [bs, 13, 2, 1], "FP16")) self.add_reformat_free_data_as_shared_memory( - "input1" + str(bs), inputs[-1], reformatted_input_np) + "input1" + str(bs), inputs[-1], reformatted_input_np + ) outputs = [] outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT0', - binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT1', - binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) - results = self.triton_client.infer(model_name=model_name, - inputs=inputs, - outputs=outputs) + results = self.triton_client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) # Validate the results by comparing with precomputed values. 
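# The expected values were precomputed above as input_np + input_np and
# input_np - input_np, so an exact np.array_equal comparison is sufficient here.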
- output0_np = results.as_numpy('OUTPUT0') - output1_np = results.as_numpy('OUTPUT1') + output0_np = results.as_numpy("OUTPUT0") + output1_np = results.as_numpy("OUTPUT1") self.assertTrue( np.array_equal(output0_np, expected_output0_np), - "OUTPUT0 expected: {}, got {}".format(expected_output0_np, - output0_np)) + "OUTPUT0 expected: {}, got {}".format(expected_output0_np, output0_np), + ) self.assertTrue( np.array_equal(output1_np, expected_output1_np), - "OUTPUT0 expected: {}, got {}".format(expected_output1_np, - output1_np)) + "OUTPUT0 expected: {}, got {}".format(expected_output1_np, output1_np), + ) def test_nobatch_chw32_input(self): model_name = "plan_nobatch_CHW32_LINEAR_float32_float32_float32" @@ -190,39 +197,42 @@ def test_nobatch_chw32_input(self): # for non-linear format tensor, the data buffer is padded and thus the # data byte size may not match what is calculated from tensor shape inputs = [] - inputs.append(tritonhttpclient.InferInput('INPUT0', [13, 2, 1], "FP32")) - self.add_reformat_free_data_as_shared_memory("input0", inputs[-1], - reformatted_input_np) - inputs.append(tritonhttpclient.InferInput('INPUT1', [13, 2, 1], "FP32")) - self.add_reformat_free_data_as_shared_memory("input1", inputs[-1], - reformatted_input_np) + inputs.append(tritonhttpclient.InferInput("INPUT0", [13, 2, 1], "FP32")) + self.add_reformat_free_data_as_shared_memory( + "input0", inputs[-1], reformatted_input_np + ) + inputs.append(tritonhttpclient.InferInput("INPUT1", [13, 2, 1], "FP32")) + self.add_reformat_free_data_as_shared_memory( + "input1", inputs[-1], reformatted_input_np + ) outputs = [] outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT0', binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT1', binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) - results = self.triton_client.infer(model_name=model_name, - inputs=inputs, - outputs=outputs) + results = self.triton_client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) # Validate the results by comparing with precomputed values. 
- output0_np = results.as_numpy('OUTPUT0') - output1_np = results.as_numpy('OUTPUT1') + output0_np = results.as_numpy("OUTPUT0") + output1_np = results.as_numpy("OUTPUT1") self.assertTrue( np.array_equal(output0_np, expected_output0_np), - "OUTPUT0 expected: {}, got {}".format(expected_output0_np, - output0_np)) + "OUTPUT0 expected: {}, got {}".format(expected_output0_np, output0_np), + ) self.assertTrue( np.array_equal(output1_np, expected_output1_np), - "OUTPUT0 expected: {}, got {}".format(expected_output1_np, - output1_np)) + "OUTPUT0 expected: {}, got {}".format(expected_output1_np, output1_np), + ) def test_chw32_input(self): model_name = "plan_CHW32_LINEAR_float32_float32_float32" for bs in [1, 8]: - input_np = np.arange(26 * bs, dtype=np.float32).reshape( - (bs, 13, 2, 1)) + input_np = np.arange(26 * bs, dtype=np.float32).reshape((bs, 13, 2, 1)) expected_output0_np = input_np + input_np expected_output1_np = input_np - input_np reformatted_input_np = reformat("CHW32", input_np) @@ -232,38 +242,38 @@ def test_chw32_input(self): # and thus the data byte size may not match what is calculated from # tensor shape inputs = [] - inputs.append( - tritonhttpclient.InferInput('INPUT0', [bs, 13, 2, 1], "FP32")) + inputs.append(tritonhttpclient.InferInput("INPUT0", [bs, 13, 2, 1], "FP32")) self.add_reformat_free_data_as_shared_memory( - "input0" + str(bs), inputs[-1], reformatted_input_np) - inputs.append( - tritonhttpclient.InferInput('INPUT1', [bs, 13, 2, 1], "FP32")) + "input0" + str(bs), inputs[-1], reformatted_input_np + ) + inputs.append(tritonhttpclient.InferInput("INPUT1", [bs, 13, 2, 1], "FP32")) self.add_reformat_free_data_as_shared_memory( - "input1" + str(bs), inputs[-1], reformatted_input_np) + "input1" + str(bs), inputs[-1], reformatted_input_np + ) outputs = [] outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT0', - binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT0", binary_data=True) + ) outputs.append( - tritonhttpclient.InferRequestedOutput('OUTPUT1', - binary_data=True)) + tritonhttpclient.InferRequestedOutput("OUTPUT1", binary_data=True) + ) - results = self.triton_client.infer(model_name=model_name, - inputs=inputs, - outputs=outputs) + results = self.triton_client.infer( + model_name=model_name, inputs=inputs, outputs=outputs + ) # Validate the results by comparing with precomputed values. - output0_np = results.as_numpy('OUTPUT0') - output1_np = results.as_numpy('OUTPUT1') + output0_np = results.as_numpy("OUTPUT0") + output1_np = results.as_numpy("OUTPUT1") self.assertTrue( np.array_equal(output0_np, expected_output0_np), - "OUTPUT0 expected: {}, got {}".format(expected_output0_np, - output0_np)) + "OUTPUT0 expected: {}, got {}".format(expected_output0_np, output0_np), + ) self.assertTrue( np.array_equal(output1_np, expected_output1_np), - "OUTPUT0 expected: {}, got {}".format(expected_output1_np, - output1_np)) + "OUTPUT0 expected: {}, got {}".format(expected_output1_np, output1_np), + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_trt_shape_tensors/test.sh b/qa/L0_trt_shape_tensors/test.sh old mode 100644 new mode 100755 index e0f0faa229..eed67d9dcb --- a/qa/L0_trt_shape_tensors/test.sh +++ b/qa/L0_trt_shape_tensors/test.sh @@ -49,7 +49,7 @@ SERVER_ARGS="--model-repository=`pwd`/models" SERVER_LOG="./inference_server.log" source ../common/util.sh -rm -fr *.log +rm -fr *.log rm -fr models && mkdir models cp -r /data/inferenceserver/${REPO_VERSION}/qa_shapetensor_model_repository/* models/. 
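The reformat() helper exercised by the reformat-free tests above packs a linear CHW tensor into TensorRT's vectorized CHW2/CHW32 layouts: channels are grouped into vectors of 2 or 32 and the channel dimension is padded up to a multiple of the vector size, which is why the shared-memory buffer can be larger than the logical tensor shape implies. Below is a minimal NumPy sketch of the same packing for the unbatched case only; the helper name pack_chw_vectorized is illustrative and not taken from the patched test.

import numpy as np

def pack_chw_vectorized(tensor_chw, factor):
    # Pack (C, H, W) into (ceil(C / factor), H, W, factor), zero-padding the
    # channel dimension, as the CHW2 (factor=2) and CHW32 (factor=32) formats do.
    c, h, w = tensor_chw.shape
    packed = np.zeros(((c + factor - 1) // factor, h, w, factor), tensor_chw.dtype)
    for ch in range(c):
        packed[ch // factor, :, :, ch % factor] = tensor_chw[ch]
    return packed

# 13 channels packed with factor 2 become 7 channel-vectors, the last one half padded.
x = np.arange(26, dtype=np.float16).reshape(13, 2, 1)
assert pack_chw_vectorized(x, 2).shape == (7, 2, 1, 2)

The reformat() function in the test additionally handles a leading batch dimension by applying the same per-channel packing to each batch element.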
diff --git a/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py b/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py old mode 100644 new mode 100755 index 14609dbb94..a83795f981 --- a/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py +++ b/qa/L0_trt_shape_tensors/trt_shape_tensor_test.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + # Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,20 +30,19 @@ sys.path.append("../common") -from builtins import range import os -import unittest -import time import threading -import numpy as np +import time +import unittest +from builtins import range + import infer_util as iu -import test_util as tu +import numpy as np import sequence_util as su - +import test_util as tu import tritongrpcclient as grpcclient -TEST_SYSTEM_SHARED_MEMORY = bool( - int(os.environ.get('TEST_SYSTEM_SHARED_MEMORY', 0))) +TEST_SYSTEM_SHARED_MEMORY = bool(int(os.environ.get("TEST_SYSTEM_SHARED_MEMORY", 0))) _model_instances = 1 _max_queue_delay_ms = 10000 @@ -52,7 +53,6 @@ class InferShapeTensorTest(tu.TestResultCollector): - def setUp(self): # The helper client for setup will be GRPC for simplicity. self.triton_client_ = grpcclient.InferenceServerClient("localhost:8001") @@ -75,14 +75,16 @@ def check_deferred_exception(self): if len(_deferred_exceptions) > 0: raise _deferred_exceptions[0] - def check_response(self, - bs, - thresholds, - shape_values, - dummy_input_shapes, - shm_region_names=None, - precreated_shm_regions=None, - shm_suffix=""): + def check_response( + self, + bs, + thresholds, + shape_values, + dummy_input_shapes, + shm_region_names=None, + precreated_shm_regions=None, + shm_suffix="", + ): try: # Add batch size to shape as full shape is expected for i in range(len(dummy_input_shapes)): @@ -93,7 +95,7 @@ def check_response(self, iu.infer_shape_tensor( self, - 'plan', + "plan", np.float32, shape_values, dummy_input_shapes, @@ -101,7 +103,8 @@ def check_response(self, use_streaming=False, shm_suffix=shm_suffix, use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, - batch_size=bs) + batch_size=bs, + ) end_ms = int(round(time.time() * 1000)) @@ -110,13 +113,21 @@ def check_response(self, if lt_ms is not None: self.assertTrue( (end_ms - start_ms) < lt_ms, - "expected less than " + str(lt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected less than " + + str(lt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) if gt_ms is not None: self.assertTrue( (end_ms - start_ms) > gt_ms, - "expected greater than " + str(gt_ms) + - "ms response time, got " + str(end_ms - start_ms) + " ms") + "expected greater than " + + str(gt_ms) + + "ms response time, got " + + str(end_ms - start_ms) + + " ms", + ) except Exception as ex: self.add_deferred_exception(ex) @@ -126,8 +137,9 @@ def check_setup(self, model_name): bconfig = config.dynamic_batching self.assertTrue(2 in bconfig.preferred_batch_size) self.assertTrue(6 in bconfig.preferred_batch_size) - self.assertEqual(bconfig.max_queue_delay_microseconds, - _max_queue_delay_ms * 1000) # 10 secs + self.assertEqual( + bconfig.max_queue_delay_microseconds, _max_queue_delay_ms * 1000 + ) # 10 secs def check_status(self, model_name, batch_exec, exec_cnt, infer_cnt): # There is a time window between when responses are returned and statistics are updated. @@ -135,113 +147,154 @@ def check_status(self, model_name, batch_exec, exec_cnt, infer_cnt): # inference statistics to be ready. 
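# Poll get_inference_statistics() up to num_tries times, sleeping one second
# between attempts, before asserting on the batch and execution counts below.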
num_tries = 10 for i in range(num_tries): - stats = self.triton_client_.get_inference_statistics( - model_name, "1") + stats = self.triton_client_.get_inference_statistics(model_name, "1") self.assertEqual(len(stats.model_stats), 1, "expect 1 model stats") actual_exec_cnt = stats.model_stats[0].execution_count if actual_exec_cnt == exec_cnt: break - print("WARNING: expect {} executions, got {} (attempt {})".format( - exec_cnt, actual_exec_cnt, i)) + print( + "WARNING: expect {} executions, got {} (attempt {})".format( + exec_cnt, actual_exec_cnt, i + ) + ) time.sleep(1) - self.assertEqual(stats.model_stats[0].name, model_name, - "expect model stats for model {}".format(model_name)) self.assertEqual( - stats.model_stats[0].version, "1", - "expect model stats for model {} version 1".format(model_name)) + stats.model_stats[0].name, + model_name, + "expect model stats for model {}".format(model_name), + ) + self.assertEqual( + stats.model_stats[0].version, + "1", + "expect model stats for model {} version 1".format(model_name), + ) if batch_exec is not None: batch_stats = stats.model_stats[0].batch_stats print(batch_stats) self.assertEqual( - len(batch_stats), len(batch_exec), + len(batch_stats), + len(batch_exec), "expected {} different batch-sizes, got {}".format( - len(batch_exec), len(batch_stats))) + len(batch_exec), len(batch_stats) + ), + ) for batch_stat in batch_stats: bs = batch_stat.batch_size bc = batch_stat.compute_infer.count self.assertTrue( - bs in batch_exec, - "did not find expected batch-size {}".format(bs)) + bs in batch_exec, "did not find expected batch-size {}".format(bs) + ) # Get count from one of the stats self.assertEqual( - bc, batch_exec[bs], - "expected model-execution-count {} for batch size {}, got {}" - .format(batch_exec[bs], bs, bc)) + bc, + batch_exec[bs], + "expected model-execution-count {} for batch size {}, got {}".format( + batch_exec[bs], bs, bc + ), + ) actual_exec_cnt = stats.model_stats[0].execution_count self.assertEqual( - actual_exec_cnt, exec_cnt, - "expected model-exec-count {}, got {}".format( - exec_cnt, actual_exec_cnt)) + actual_exec_cnt, + exec_cnt, + "expected model-exec-count {}, got {}".format(exec_cnt, actual_exec_cnt), + ) actual_infer_cnt = stats.model_stats[0].inference_count self.assertEqual( - actual_infer_cnt, infer_cnt, + actual_infer_cnt, + infer_cnt, "expected model-inference-count {}, got {}".format( - infer_cnt, actual_infer_cnt)) + infer_cnt, actual_infer_cnt + ), + ) actual_infer_cnt = stats.model_stats[0].inference_count self.assertEqual( - actual_infer_cnt, infer_cnt, + actual_infer_cnt, + infer_cnt, "expected model-inference-count {}, got {}".format( - infer_cnt, actual_infer_cnt)) + infer_cnt, actual_infer_cnt + ), + ) def test_static_batch(self): iu.infer_shape_tensor( self, - 'plan', - np.float32, [[32, 32]], [[8, 4, 4]], + "plan", + np.float32, + [[32, 32]], + [[8, 4, 4]], use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, - batch_size=8) + batch_size=8, + ) iu.infer_shape_tensor( self, - 'plan', - np.float32, [[4, 4]], [[8, 32, 32]], + "plan", + np.float32, + [[4, 4]], + [[8, 32, 32]], use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, - batch_size=8) + batch_size=8, + ) iu.infer_shape_tensor( self, - 'plan', - np.float32, [[4, 4]], [[8, 4, 4]], + "plan", + np.float32, + [[4, 4]], + [[8, 4, 4]], use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, - batch_size=8) + batch_size=8, + ) def test_nobatch(self): iu.infer_shape_tensor( self, - 'plan_nobatch', - np.float32, [[32, 32]], [[4, 4]], - 
use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY) + "plan_nobatch", + np.float32, + [[32, 32]], + [[4, 4]], + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + ) iu.infer_shape_tensor( self, - 'plan_nobatch', - np.float32, [[4, 4]], [[32, 32]], - use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY) + "plan_nobatch", + np.float32, + [[4, 4]], + [[32, 32]], + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + ) iu.infer_shape_tensor( self, - 'plan_nobatch', - np.float32, [[4, 4]], [[4, 4]], - use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY) + "plan_nobatch", + np.float32, + [[4, 4]], + [[4, 4]], + use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, + ) def test_wrong_shape_values(self): over_shape_values = [[32, 33]] try: iu.infer_shape_tensor( self, - 'plan', + "plan", np.float32, - over_shape_values, [[8, 4, 4]], + over_shape_values, + [[8, 4, 4]], use_system_shared_memory=TEST_SYSTEM_SHARED_MEMORY, - batch_size=8) + batch_size=8, + ) # InferenceServerException will be raised from different namespace, # use dynamic type characteristic to catch both ex except Exception as ex: self.assertTrue( "The shape value at index 2 is expected to be in range from 1 to 32, Got: 33" - in ex.message()) + in ex.message() + ) # Dynamic Batcher tests def test_dynamic_different_shape_values(self): @@ -257,22 +310,27 @@ def test_dynamic_different_shape_values(self): threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(3, (6000, None)), - kwargs={ - 'shape_values': [[2, 2]], - 'dummy_input_shapes': [[16, 16]], - 'shm_suffix': '{}'.format(len(threads)) - })) + threading.Thread( + target=self.check_response, + args=(3, (6000, None)), + kwargs={ + "shape_values": [[2, 2]], + "dummy_input_shapes": [[16, 16]], + "shm_suffix": "{}".format(len(threads)), + }, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(3, (_max_queue_delay_ms * 1.5, - _max_queue_delay_ms)), - kwargs={ - 'shape_values': [[4, 4]], - 'dummy_input_shapes': [[16, 16]], - 'shm_suffix': '{}'.format(len(threads)) - })) + threading.Thread( + target=self.check_response, + args=(3, (_max_queue_delay_ms * 1.5, _max_queue_delay_ms)), + kwargs={ + "shape_values": [[4, 4]], + "dummy_input_shapes": [[16, 16]], + "shm_suffix": "{}".format(len(threads)), + }, + ) + ) threads[0].start() time.sleep(1) threads[1].start() @@ -295,21 +353,27 @@ def test_dynamic_identical_shape_values(self): threads = [] threads.append( - threading.Thread(target=self.check_response, - args=(4, (6000, None)), - kwargs={ - 'shape_values': [[4, 4]], - 'dummy_input_shapes': [[16, 16]], - 'shm_suffix': '{}'.format(len(threads)) - })) + threading.Thread( + target=self.check_response, + args=(4, (6000, None)), + kwargs={ + "shape_values": [[4, 4]], + "dummy_input_shapes": [[16, 16]], + "shm_suffix": "{}".format(len(threads)), + }, + ) + ) threads.append( - threading.Thread(target=self.check_response, - args=(2, (6000, None)), - kwargs={ - 'shape_values': [[4, 4]], - 'dummy_input_shapes': [[16, 16]], - 'shm_suffix': '{}'.format(len(threads)) - })) + threading.Thread( + target=self.check_response, + args=(2, (6000, None)), + kwargs={ + "shape_values": [[4, 4]], + "dummy_input_shapes": [[16, 16]], + "shm_suffix": "{}".format(len(threads)), + }, + ) + ) threads[0].start() time.sleep(1) threads[1].start() @@ -322,7 +386,6 @@ def test_dynamic_identical_shape_values(self): class SequenceBatcherShapeTensorTest(su.SequenceBatcherTestUtil): - def get_expected_result(self, expected_result, value, flag_str=None): # Adjust the 
expected_result for models expected_result = value @@ -345,20 +408,21 @@ def test_sequence_identical_shape_values(self): # Need scheduler to wait for queue to contain all # inferences for both sequences. self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), - 12) - self.assertTrue( - "TRITONSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 12) + self.assertTrue("TRITONSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0) precreated_shm0_handles = self.precreate_register_shape_tensor_regions( - ((2, 1), (4, 2), (8, 3)), dtype, 0) + ((2, 1), (4, 2), (8, 3)), dtype, 0 + ) precreated_shm1_handles = self.precreate_register_shape_tensor_regions( - ((2, 11), (4, 12), (8, 13)), dtype, 1) + ((2, 11), (4, 12), (8, 13)), dtype, 1 + ) precreated_shm2_handles = self.precreate_register_shape_tensor_regions( - ((2, 111), (4, 112), (8, 113)), dtype, 2) + ((2, 111), (4, 112), (8, 113)), dtype, 2 + ) precreated_shm3_handles = self.precreate_register_shape_tensor_regions( - ((2, 1111), (4, 1112), (8, 1113)), dtype, 3) + ((2, 1111), (4, 1112), (8, 1113)), dtype, 3 + ) threads = [] threads.append( threading.Thread( @@ -369,12 +433,17 @@ def test_sequence_identical_shape_values(self): 1001, (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 2, 1, None), (None, 4, 2, None), ("end", 8, - 3, None)), + ( + ("start", 2, 1, None), + (None, 4, 2, None), + ("end", 8, 3, None), + ), self.get_expected_result(6, 3, "end"), - precreated_shm0_handles), - kwargs={'sequence_name': "{}".format(self._testMethodName) - })) + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -384,12 +453,17 @@ def test_sequence_identical_shape_values(self): 1002, (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 2, 11, None), (None, 4, 12, None), - ("end", 8, 13, None)), + ( + ("start", 2, 11, None), + (None, 4, 12, None), + ("end", 8, 13, None), + ), self.get_expected_result(36, 13, "end"), - precreated_shm1_handles), - kwargs={'sequence_name': "{}".format(self._testMethodName) - })) + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -399,12 +473,17 @@ def test_sequence_identical_shape_values(self): 1003, (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 2, 111, None), (None, 4, 112, None), - ("end", 8, 113, None)), + ( + ("start", 2, 111, None), + (None, 4, 112, None), + ("end", 8, 113, None), + ), self.get_expected_result(336, 113, "end"), - precreated_shm2_handles), - kwargs={'sequence_name': "{}".format(self._testMethodName) - })) + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -414,12 +493,17 @@ def test_sequence_identical_shape_values(self): 1004, (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 2, 1111, None), (None, 4, 1112, None), - ("end", 8, 1113, None)), + ( + ("start", 2, 1111, None), + (None, 4, 1112, None), + ("end", 8, 1113, None), + ), 
self.get_expected_result(3336, 1113, "end"), - precreated_shm3_handles), - kwargs={'sequence_name': "{}".format(self._testMethodName) - })) + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) for t in threads: t.start() @@ -447,13 +531,17 @@ def test_sequence_different_shape_values(self): dtype = np.float32 precreated_shm0_handles = self.precreate_register_shape_tensor_regions( - ((1, 1), (1, 2), (1, 3)), dtype, 0) + ((1, 1), (1, 2), (1, 3)), dtype, 0 + ) precreated_shm1_handles = self.precreate_register_shape_tensor_regions( - ((32, 11), (32, 12), (32, 13)), dtype, 1) + ((32, 11), (32, 12), (32, 13)), dtype, 1 + ) precreated_shm2_handles = self.precreate_register_shape_tensor_regions( - ((16, 111), (16, 112), (16, 113)), dtype, 2) + ((16, 111), (16, 112), (16, 113)), dtype, 2 + ) precreated_shm3_handles = self.precreate_register_shape_tensor_regions( - ((1, 1111), (1, 1112), (1, 1113)), dtype, 3) + ((1, 1111), (1, 1112), (1, 1113)), dtype, 3 + ) try: model_name = tu.get_sequence_model_name("plan", dtype) self.check_setup(model_name) @@ -461,12 +549,9 @@ def test_sequence_different_shape_values(self): # Need scheduler to wait for queue to contain all # inferences for both sequences. self.assertTrue("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), - 12) - self.assertTrue( - "TRITONSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ) - self.assertEqual( - int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0) + self.assertEqual(int(os.environ["TRITONSERVER_DELAY_SCHEDULER"]), 12) + self.assertTrue("TRITONSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ) + self.assertEqual(int(os.environ["TRITONSERVER_BACKLOG_DELAY_SCHEDULER"]), 0) threads = [] threads.append( @@ -478,12 +563,17 @@ def test_sequence_different_shape_values(self): 1001, (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 1, 1, None), (None, 1, 2, None), ("end", 1, - 3, None)), + ( + ("start", 1, 1, None), + (None, 1, 2, None), + ("end", 1, 3, None), + ), self.get_expected_result(6, 3, "end"), - precreated_shm0_handles), - kwargs={'sequence_name': "{}".format(self._testMethodName) - })) + precreated_shm0_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -493,12 +583,17 @@ def test_sequence_different_shape_values(self): 1002, (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 32, 11, None), (None, 32, 12, None), - ("end", 32, 13, None)), + ( + ("start", 32, 11, None), + (None, 32, 12, None), + ("end", 32, 13, None), + ), self.get_expected_result(36, 13, "end"), - precreated_shm1_handles), - kwargs={'sequence_name': "{}".format(self._testMethodName) - })) + precreated_shm1_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -508,12 +603,17 @@ def test_sequence_different_shape_values(self): 1003, (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 16, 111, None), (None, 16, 112, None), - ("end", 16, 113, None)), + ( + ("start", 16, 111, None), + (None, 16, 112, None), + ("end", 16, 113, None), + ), self.get_expected_result(336, 113, "end"), - precreated_shm2_handles), - kwargs={'sequence_name': "{}".format(self._testMethodName) - })) + precreated_shm2_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) 
+ ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -523,12 +623,17 @@ def test_sequence_different_shape_values(self): 1004, (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 1, 1111, None), (None, 1, 1112, None), - ("end", 1, 1113, None)), + ( + ("start", 1, 1111, None), + (None, 1, 1112, None), + ("end", 1, 1113, None), + ), self.get_expected_result(3336, 1113, "end"), - precreated_shm3_handles), - kwargs={'sequence_name': "{}".format(self._testMethodName) - })) + precreated_shm3_handles, + ), + kwargs={"sequence_name": "{}".format(self._testMethodName)}, + ) + ) for t in threads: t.start() @@ -549,12 +654,7 @@ def test_sequence_different_shape_values(self): class DynaSequenceBatcherTest(su.SequenceBatcherTestUtil): - - def get_expected_result(self, - expected_result, - corrid, - value, - flag_str=None): + def get_expected_result(self, expected_result, corrid, value, flag_str=None): expected_result = value if flag_str is not None: if "start" in flag_str: @@ -568,20 +668,23 @@ def _multi_sequence_different_shape_impl(self, sleep_secs): dtype = np.float32 precreated_shm0_handles = self.precreate_register_dynaseq_shape_tensor_regions( - ((1, 1), (12, 2), (2, 3)), dtype, 0) + ((1, 1), (12, 2), (2, 3)), dtype, 0 + ) precreated_shm1_handles = self.precreate_register_dynaseq_shape_tensor_regions( - ((3, 11), (4, 12), (5, 13)), dtype, 1) + ((3, 11), (4, 12), (5, 13)), dtype, 1 + ) precreated_shm2_handles = self.precreate_register_dynaseq_shape_tensor_regions( - ((6, 111), (7, 112), (8, 113)), dtype, 2) + ((6, 111), (7, 112), (8, 113)), dtype, 2 + ) precreated_shm3_handles = self.precreate_register_dynaseq_shape_tensor_regions( - ((9, 1111), (10, 1112), (11, 1113)), dtype, 3) + ((9, 1111), (10, 1112), (11, 1113)), dtype, 3 + ) try: model_name = tu.get_dyna_sequence_model_name("plan", dtype) self.check_setup(model_name) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertFalse( - "TRITONSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ) + self.assertFalse("TRITONSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ) corrids = [1001, 1002, 1003, 1004] threads = [] @@ -594,17 +697,22 @@ def _multi_sequence_different_shape_impl(self, sleep_secs): corrids[0], (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 1, 1, None), (None, 12, 2, None), ("end", 2, - 3, None)), - self.get_expected_result(4 + corrids[0], corrids[0], 3, - "end"), - precreated_shm0_handles), + ( + ("start", 1, 1, None), + (None, 12, 2, None), + ("end", 2, 3, None), + ), + self.get_expected_result(4 + corrids[0], corrids[0], 3, "end"), + precreated_shm0_handles, + ), kwargs={ - 'sequence_name': - "{}_{}".format(self._testMethodName, corrids[0]), - 'using_dynamic_batcher': - True - })) + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[0] + ), + "using_dynamic_batcher": True, + }, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -614,17 +722,24 @@ def _multi_sequence_different_shape_impl(self, sleep_secs): corrids[1], (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 3, 11, None), (None, 4, 12, None), - ("end", 5, 13, None)), - self.get_expected_result(36 + corrids[1], corrids[1], - 13, "end"), - precreated_shm1_handles), + ( + ("start", 3, 11, None), + (None, 4, 12, None), + ("end", 5, 13, None), + ), + self.get_expected_result( + 36 + corrids[1], corrids[1], 13, "end" + ), + precreated_shm1_handles, + ), kwargs={ - 'sequence_name': - 
"{}_{}".format(self._testMethodName, corrids[1]), - 'using_dynamic_batcher': - True - })) + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[1] + ), + "using_dynamic_batcher": True, + }, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -634,17 +749,24 @@ def _multi_sequence_different_shape_impl(self, sleep_secs): corrids[2], (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 6, 111, None), (None, 7, 112, None), - ("end", 8, 113, None)), - self.get_expected_result(336 + corrids[2], corrids[2], - 113, "end"), - precreated_shm2_handles), + ( + ("start", 6, 111, None), + (None, 7, 112, None), + ("end", 8, 113, None), + ), + self.get_expected_result( + 336 + corrids[2], corrids[2], 113, "end" + ), + precreated_shm2_handles, + ), kwargs={ - 'sequence_name': - "{}_{}".format(self._testMethodName, corrids[2]), - 'using_dynamic_batcher': - True - })) + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[2] + ), + "using_dynamic_batcher": True, + }, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -654,17 +776,24 @@ def _multi_sequence_different_shape_impl(self, sleep_secs): corrids[3], (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 9, 1111, None), (None, 10, 1112, None), - ("end", 11, 1113, None)), - self.get_expected_result(3336 + corrids[3], corrids[3], - 1113, "end"), - precreated_shm3_handles), + ( + ("start", 9, 1111, None), + (None, 10, 1112, None), + ("end", 11, 1113, None), + ), + self.get_expected_result( + 3336 + corrids[3], corrids[3], 1113, "end" + ), + precreated_shm3_handles, + ), kwargs={ - 'sequence_name': - "{}_{}".format(self._testMethodName, corrids[3]), - 'using_dynamic_batcher': - True - })) + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[3] + ), + "using_dynamic_batcher": True, + }, + ) + ) for t in threads: t.start() @@ -688,21 +817,24 @@ def _multi_sequence_identical_shape_impl(self, sleep_secs): dtype = np.float32 precreated_shm0_handles = self.precreate_register_dynaseq_shape_tensor_regions( - ((2, 1), (4, 2), (8, 3)), dtype, 0) + ((2, 1), (4, 2), (8, 3)), dtype, 0 + ) precreated_shm1_handles = self.precreate_register_dynaseq_shape_tensor_regions( - ((2, 11), (4, 12), (8, 13)), dtype, 1) + ((2, 11), (4, 12), (8, 13)), dtype, 1 + ) precreated_shm2_handles = self.precreate_register_dynaseq_shape_tensor_regions( - ((2, 111), (4, 112), (8, 113)), dtype, 2) + ((2, 111), (4, 112), (8, 113)), dtype, 2 + ) precreated_shm3_handles = self.precreate_register_dynaseq_shape_tensor_regions( - ((2, 1111), (4, 1112), (8, 1113)), dtype, 3) + ((2, 1111), (4, 1112), (8, 1113)), dtype, 3 + ) try: model_name = tu.get_dyna_sequence_model_name("plan", dtype) self.check_setup(model_name) self.assertFalse("TRITONSERVER_DELAY_SCHEDULER" in os.environ) - self.assertFalse( - "TRITONSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ) + self.assertFalse("TRITONSERVER_BACKLOG_DELAY_SCHEDULER" in os.environ) corrids = [1001, 1002, 1003, 1004] threads = [] @@ -715,17 +847,22 @@ def _multi_sequence_identical_shape_impl(self, sleep_secs): corrids[0], (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 2, 1, None), (None, 4, 2, None), ("end", 8, - 3, None)), - self.get_expected_result(4 + corrids[0], corrids[0], 3, - "end"), - precreated_shm0_handles), + ( + ("start", 2, 1, None), + (None, 4, 2, None), + ("end", 8, 3, None), + ), + self.get_expected_result(4 + corrids[0], corrids[0], 3, "end"), + 
precreated_shm0_handles, + ), kwargs={ - 'sequence_name': - "{}_{}".format(self._testMethodName, corrids[0]), - 'using_dynamic_batcher': - True - })) + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[0] + ), + "using_dynamic_batcher": True, + }, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -735,17 +872,24 @@ def _multi_sequence_identical_shape_impl(self, sleep_secs): corrids[1], (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 2, 11, None), (None, 4, 12, None), - ("end", 8, 13, None)), - self.get_expected_result(36 + corrids[1], corrids[1], - 13, "end"), - precreated_shm1_handles), + ( + ("start", 2, 11, None), + (None, 4, 12, None), + ("end", 8, 13, None), + ), + self.get_expected_result( + 36 + corrids[1], corrids[1], 13, "end" + ), + precreated_shm1_handles, + ), kwargs={ - 'sequence_name': - "{}_{}".format(self._testMethodName, corrids[1]), - 'using_dynamic_batcher': - True - })) + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[1] + ), + "using_dynamic_batcher": True, + }, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -755,17 +899,24 @@ def _multi_sequence_identical_shape_impl(self, sleep_secs): corrids[2], (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 2, 111, None), (None, 4, 112, None), - ("end", 8, 113, None)), - self.get_expected_result(336 + corrids[2], corrids[2], - 113, "end"), - precreated_shm2_handles), + ( + ("start", 2, 111, None), + (None, 4, 112, None), + ("end", 8, 113, None), + ), + self.get_expected_result( + 336 + corrids[2], corrids[2], 113, "end" + ), + precreated_shm2_handles, + ), kwargs={ - 'sequence_name': - "{}_{}".format(self._testMethodName, corrids[2]), - 'using_dynamic_batcher': - True - })) + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[2] + ), + "using_dynamic_batcher": True, + }, + ) + ) threads.append( threading.Thread( target=self.check_sequence_shape_tensor_io, @@ -775,17 +926,24 @@ def _multi_sequence_identical_shape_impl(self, sleep_secs): corrids[3], (None, None), # (flag_str, shape_value, value, pre_delay_ms) - (("start", 2, 1111, None), (None, 4, 1112, None), - ("end", 8, 1113, None)), - self.get_expected_result(3336 + corrids[3], corrids[3], - 1113, "end"), - precreated_shm3_handles), + ( + ("start", 2, 1111, None), + (None, 4, 1112, None), + ("end", 8, 1113, None), + ), + self.get_expected_result( + 3336 + corrids[3], corrids[3], 1113, "end" + ), + precreated_shm3_handles, + ), kwargs={ - 'sequence_name': - "{}_{}".format(self._testMethodName, corrids[3]), - 'using_dynamic_batcher': - True - })) + "sequence_name": "{}_{}".format( + self._testMethodName, corrids[3] + ), + "using_dynamic_batcher": True, + }, + ) + ) for t in threads: t.start() @@ -827,5 +985,5 @@ def test_dynaseq_different_shape_values_parallel(self): self._multi_sequence_different_shape_impl(0) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_vertex_ai/test.sh b/qa/L0_vertex_ai/test.sh old mode 100644 new mode 100755 index d334d6c886..3113a66d1f --- a/qa/L0_vertex_ai/test.sh +++ b/qa/L0_vertex_ai/test.sh @@ -106,7 +106,7 @@ function vertex_ai_wait_for_server_ready() { WAIT_RET=1 } -# Helper function to unset all AIP vairables before test +# Helper function to unset all AIP variables before test function unset_vertex_variables() { unset AIP_MODE unset AIP_HTTP_PORT @@ -418,7 +418,7 @@ else fi fi -# Test AIP_STORAGE_URI won't be used if model 
repository is specified +# Test AIP_STORAGE_URI won't be used if model repository is specified SERVER_ARGS="--model-repository=single_model" run_server_nowait vertex_ai_wait_for_server_ready $SERVER_PID 10 diff --git a/qa/L0_vertex_ai/vertex_ai_test.py b/qa/L0_vertex_ai/vertex_ai_test.py old mode 100644 new mode 100755 index 77f78aad36..b6f9fc42b4 --- a/qa/L0_vertex_ai/vertex_ai_test.py +++ b/qa/L0_vertex_ai/vertex_ai_test.py @@ -30,34 +30,30 @@ sys.path.append("../common") import os +import sys import unittest + import numpy as np +import requests import test_util as tu import tritonclient.http as httpclient -import os -import requests -import sys - class VertexAiTest(tu.TestResultCollector): - def setUp(self): - port = os.getenv('AIP_HTTP_PORT', '8080') - predict_endpoint = os.getenv('AIP_PREDICT_ROUTE', '/predict') - self.model_ = os.getenv('TEST_EXPLICIT_MODEL_NAME', 'addsub') + port = os.getenv("AIP_HTTP_PORT", "8080") + predict_endpoint = os.getenv("AIP_PREDICT_ROUTE", "/predict") + self.model_ = os.getenv("TEST_EXPLICIT_MODEL_NAME", "addsub") self.url_ = "http://localhost:{}{}".format(port, predict_endpoint) - self.input_data_ = [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 - ] + self.input_data_ = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] self.expected_output0_data_ = [x * 2 for x in self.input_data_] self.expected_output1_data_ = [0 for x in self.input_data_] def test_predict(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -65,22 +61,20 @@ def test_predict(self): inputs[0].set_data_from_numpy(input_data, binary_data=False) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) request_body, _ = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + inputs, outputs=outputs + ) - headers = {'Content-Type': 'application/json'} + headers = {"Content-Type": "application/json"} r = requests.post(self.url_, data=request_body, headers=headers) r.raise_for_status() - result = httpclient.InferenceServerClient.parse_response_body( - r._content) + result = httpclient.InferenceServerClient.parse_response_body(r._content) - output0_data = result.as_numpy('OUTPUT0') - output1_data = result.as_numpy('OUTPUT1') + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") for i in range(16): self.assertEqual(output0_data[0][i], self.expected_output0_data_[i]) self.assertEqual(output1_data[0][i], self.expected_output1_data_[i]) @@ -88,8 +82,8 @@ def test_predict(self): def test_predict_specified_model(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, 
dtype=np.int32) @@ -97,27 +91,23 @@ def test_predict_specified_model(self): inputs[0].set_data_from_numpy(input_data, binary_data=False) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) request_body, _ = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/json', - "X-Vertex-Ai-Triton-Redirect": - "v2/models/{}/infer".format(self.model_) + "Content-Type": "application/json", + "X-Vertex-Ai-Triton-Redirect": "v2/models/{}/infer".format(self.model_), } r = requests.post(self.url_, data=request_body, headers=headers) r.raise_for_status() - result = httpclient.InferenceServerClient.parse_response_body( - r._content) + result = httpclient.InferenceServerClient.parse_response_body(r._content) - output0_data = result.as_numpy('OUTPUT0') - output1_data = result.as_numpy('OUTPUT1') + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") if self.model_ == "addsub": expected_output0_data = [x * 2 for x in self.input_data_] expected_output1_data = [0 for x in self.input_data_] @@ -131,8 +121,8 @@ def test_predict_specified_model(self): def test_predict_request_binary(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -140,25 +130,26 @@ def test_predict_request_binary(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/vnd.vertex-ai-triton.binary+json;json-header-size={}' - .format(header_length) + "Content-Type": "application/vnd.vertex-ai-triton.binary+json;json-header-size={}".format( + header_length + ) } r = requests.post(self.url_, data=request_body, headers=headers) r.raise_for_status() - result = httpclient.InferenceServerClient.parse_response_body( - r._content) - output0_data = result.as_numpy('OUTPUT0') - output1_data = result.as_numpy('OUTPUT1') + result = httpclient.InferenceServerClient.parse_response_body(r._content) + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") for i in range(16): self.assertEqual(output0_data[0][i], self.expected_output0_data_[i]) self.assertEqual(output1_data[0][i], self.expected_output1_data_[i]) @@ -166,8 +157,8 @@ def test_predict_request_binary(self): def 
test_predict_response_binary(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -175,23 +166,23 @@ def test_predict_response_binary(self): inputs[0].set_data_from_numpy(input_data, binary_data=False) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=True)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) request_body, _ = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + inputs, outputs=outputs + ) - headers = {'Content-Type': 'application/json'} + headers = {"Content-Type": "application/json"} r = requests.post(self.url_, data=request_body, headers=headers) r.raise_for_status() - header_length_str = r.headers['Inference-Header-Content-Length'] + header_length_str = r.headers["Inference-Header-Content-Length"] result = httpclient.InferenceServerClient.parse_response_body( - r._content, header_length=int(header_length_str)) + r._content, header_length=int(header_length_str) + ) - output0_data = result.as_numpy('OUTPUT0') - output1_data = result.as_numpy('OUTPUT1') + output0_data = result.as_numpy("OUTPUT0") + output1_data = result.as_numpy("OUTPUT1") for i in range(16): self.assertEqual(output0_data[0][i], self.expected_output0_data_[i]) self.assertEqual(output1_data[0][i], self.expected_output1_data_[i]) @@ -199,8 +190,8 @@ def test_predict_response_binary(self): def test_malformed_binary_header(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -208,29 +199,34 @@ def test_malformed_binary_header(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'additional-string/application/vnd.vertex-ai-triton.binary+json;json-header-size={}' - .format(header_length) + "Content-Type": "additional-string/application/vnd.vertex-ai-triton.binary+json;json-header-size={}".format( + header_length + ) } r = requests.post(self.url_, data=request_body, headers=headers) self.assertEqual( - 400, r.status_code, + 400, + r.status_code, "Expected error code {} returned for the request; got: 
{}".format( - 400, r.status_code)) + 400, r.status_code + ), + ) def test_malformed_binary_header_not_number(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -238,29 +234,34 @@ def test_malformed_binary_header_not_number(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/vnd.vertex-ai-triton.binary+json;json-header-size=additional-string{}' - .format(header_length) + "Content-Type": "application/vnd.vertex-ai-triton.binary+json;json-header-size=additional-string{}".format( + header_length + ) } r = requests.post(self.url_, data=request_body, headers=headers) self.assertEqual( - 400, r.status_code, + 400, + r.status_code, "Expected error code {} returned for the request; got: {}".format( - 400, r.status_code)) + 400, r.status_code + ), + ) def test_malformed_binary_header_negative_number(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -268,28 +269,32 @@ def test_malformed_binary_header_negative_number(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/vnd.vertex-ai-triton.binary+json;json-header-size=-123' + "Content-Type": "application/vnd.vertex-ai-triton.binary+json;json-header-size=-123" } r = requests.post(self.url_, data=request_body, headers=headers) self.assertEqual( - 400, r.status_code, + 400, + r.status_code, "Expected error code {} returned for the request; got: {}".format( - 400, r.status_code)) + 400, r.status_code + ), + ) def test_malformed_binary_header_large_number(self): inputs = [] outputs = [] - inputs.append(httpclient.InferInput('INPUT0', [1, 16], "INT32")) - 
inputs.append(httpclient.InferInput('INPUT1', [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) + inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) # Initialize the data input_data = np.array(self.input_data_, dtype=np.int32) @@ -297,23 +302,27 @@ def test_malformed_binary_header_large_number(self): inputs[0].set_data_from_numpy(input_data, binary_data=True) inputs[1].set_data_from_numpy(input_data, binary_data=False) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT0', binary_data=False)) - outputs.append( - httpclient.InferRequestedOutput('OUTPUT1', binary_data=False)) - request_body, header_length = httpclient.InferenceServerClient.generate_request_body( - inputs, outputs=outputs) + outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=False)) + outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)) + ( + request_body, + header_length, + ) = httpclient.InferenceServerClient.generate_request_body( + inputs, outputs=outputs + ) headers = { - 'Content-Type': - 'application/vnd.vertex-ai-triton.binary+json;json-header-size=12345' + "Content-Type": "application/vnd.vertex-ai-triton.binary+json;json-header-size=12345" } r = requests.post(self.url_, data=request_body, headers=headers) self.assertEqual( - 400, r.status_code, + 400, + r.status_code, "Expected error code {} returned for the request; got: {}".format( - 400, r.status_code)) + 400, r.status_code + ), + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/qa/L0_warmup/decoupled/1/model.py b/qa/L0_warmup/decoupled/1/model.py old mode 100644 new mode 100755 index db7c6903f5..52481ae83f --- a/qa/L0_warmup/decoupled/1/model.py +++ b/qa/L0_warmup/decoupled/1/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,11 +30,12 @@ class TritonPythonModel: - """Test model that always returns 0 response for all requests. """ + """Test model that always returns 0 response for all requests.""" def execute(self, requests): for request in requests: request.get_response_sender().send( - flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL) + flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL + ) return None diff --git a/qa/L0_warmup/failing_infer/1/model.py b/qa/L0_warmup/failing_infer/1/model.py old mode 100644 new mode 100755 index 1935fe6cd9..65814c77d4 --- a/qa/L0_warmup/failing_infer/1/model.py +++ b/qa/L0_warmup/failing_infer/1/model.py @@ -1,4 +1,6 @@ -# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +#!/usr/bin/env python3 + +# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -28,7 +30,7 @@ class TritonPythonModel: - """Test model that always returns error for all requests. 
""" + """Test model that always returns error for all requests.""" def execute(self, requests): responses = [] @@ -36,8 +38,9 @@ def execute(self, requests): for _ in requests: responses.append( pb_utils.InferenceResponse( - output_tensors=[], - error=pb_utils.TritonError("An Error Occurred"))) + output_tensors=[], error=pb_utils.TritonError("An Error Occurred") + ) + ) # You must return a list of pb_utils.InferenceResponse. Length # of this list must match the length of `requests` list. diff --git a/qa/L0_warmup/test.sh b/qa/L0_warmup/test.sh old mode 100644 new mode 100755 diff --git a/qa/common/check_copyright.py b/qa/common/check_copyright.py index f5d84995e0..ff18ca8e39 100755 --- a/qa/common/check_copyright.py +++ b/qa/common/check_copyright.py @@ -28,44 +28,68 @@ import argparse import os -import re import pathlib +import re FLAGS = None -SKIP_EXTS = ('jpeg', 'jpg', 'pgm', 'png', 'log', 'preprocessed', 'jmx', 'gz', - 'json', 'pdf', 'so', 'onnx', 'svg') -REPO_PATH_FROM_THIS_FILE = '../..' +SKIP_EXTS = ( + "jpeg", + "jpg", + "pgm", + "png", + "log", + "preprocessed", + "jmx", + "gz", + "json", + "pdf", + "so", + "onnx", + "svg", +) +REPO_PATH_FROM_THIS_FILE = "../.." SKIP_PATHS = ( - 'build', 'deploy/gke-marketplace-app/.gitignore', - 'deploy/gke-marketplace-app/server-deployer/chart/.helmignore', - 'deploy/gcp/.helmignore', 'deploy/aws/.helmignore', - 'deploy/fleetcommand/.helmignore', 'docs/.gitignore', - 'docs/_static/.gitattributes', 'docs/examples/model_repository', - 'docs/examples/jetson', 'docker', 'qa/common/cuda_op_kernel.cu.cc.patch', - 'qa/ensemble_models/mix_platform_float32_float32_float32/output0_labels.txt', - 'qa/ensemble_models/mix_type_int32_float32_float32/output0_labels.txt', - 'qa/ensemble_models/mix_ensemble_int32_float32_float32/output0_labels.txt', - 'qa/ensemble_models/wrong_label_int32_float32_float32/output0_labels.txt', - 'qa/ensemble_models/label_override_int32_float32_float32/output0_labels.txt', - 'qa/L0_model_config/noautofill_platform', - 'qa/L0_model_config/autofill_noplatform', - 'qa/L0_model_config/autofill_noplatform_success', - 'qa/L0_model_config/special_cases', - 'qa/L0_model_config/cli_messages/cli_override/expected', - 'qa/L0_model_config/cli_messages/cli_deprecation/expected', - 'qa/L0_model_namespacing/test_duplication', - 'qa/L0_model_namespacing/test_dynamic_resolution', - 'qa/L0_model_namespacing/test_ensemble_duplication', - 'qa/L0_model_namespacing/test_no_duplication', - 'qa/L0_perf_nomodel/baseline', 'qa/L0_perf_nomodel/legacy_baseline', - 'qa/L0_warmup/raw_mug_data', 'qa/L0_java_resnet/expected_output_data', - 'qa/L0_trt_dla_jetson/trt_dla_model_store', - 'qa/openvino_models/dynamic_batch', 'qa/openvino_models/fixed_batch', - 'CITATION.cff', 'TRITON_VERSION') + "build", + "deploy/gke-marketplace-app/.gitignore", + "deploy/gke-marketplace-app/server-deployer/chart/.helmignore", + "deploy/gcp/.helmignore", + "deploy/aws/.helmignore", + "deploy/fleetcommand/.helmignore", + "docs/.gitignore", + "docs/_static/.gitattributes", + "docs/examples/model_repository", + "docs/examples/jetson", + "docker", + "qa/common/cuda_op_kernel.cu.cc.patch", + "qa/ensemble_models/mix_platform_float32_float32_float32/output0_labels.txt", + "qa/ensemble_models/mix_type_int32_float32_float32/output0_labels.txt", + "qa/ensemble_models/mix_ensemble_int32_float32_float32/output0_labels.txt", + "qa/ensemble_models/wrong_label_int32_float32_float32/output0_labels.txt", + "qa/ensemble_models/label_override_int32_float32_float32/output0_labels.txt", + 
"qa/L0_model_config/noautofill_platform", + "qa/L0_model_config/autofill_noplatform", + "qa/L0_model_config/autofill_noplatform_success", + "qa/L0_model_config/special_cases", + "qa/L0_model_config/cli_messages/cli_override/expected", + "qa/L0_model_config/cli_messages/cli_deprecation/expected", + "qa/L0_model_namespacing/test_duplication", + "qa/L0_model_namespacing/test_dynamic_resolution", + "qa/L0_model_namespacing/test_ensemble_duplication", + "qa/L0_model_namespacing/test_no_duplication", + "qa/L0_perf_nomodel/baseline", + "qa/L0_perf_nomodel/legacy_baseline", + "qa/L0_warmup/raw_mug_data", + "qa/L0_java_resnet/expected_output_data", + "qa/L0_trt_dla_jetson/trt_dla_model_store", + "qa/openvino_models/dynamic_batch", + "qa/openvino_models/fixed_batch", + "CITATION.cff", + "TRITON_VERSION", +) -COPYRIGHT_YEAR_RE = 'Copyright( \\(c\\))? 20[1-9][0-9](-(20)?[1-9][0-9])?(,((20[2-9][0-9](-(20)?[2-9][0-9])?)|([2-9][0-9](-[2-9][0-9])?)))*,? NVIDIA CORPORATION( & AFFILIATES)?. All rights reserved.' +COPYRIGHT_YEAR_RE = "Copyright( \\(c\\))? 20[1-9][0-9](-(20)?[1-9][0-9])?(,((20[2-9][0-9](-(20)?[2-9][0-9])?)|([2-9][0-9](-[2-9][0-9])?)))*,? NVIDIA CORPORATION( & AFFILIATES)?. All rights reserved." -COPYRIGHT = ''' +COPYRIGHT = """ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -90,10 +114,11 @@ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -''' +""" -repo_abs_path = pathlib.Path(__file__).parent.joinpath( - REPO_PATH_FROM_THIS_FILE).resolve() +repo_abs_path = ( + pathlib.Path(__file__).parent.joinpath(REPO_PATH_FROM_THIS_FILE).resolve() +) copyright_year_re = re.compile(COPYRIGHT_YEAR_RE) @@ -103,19 +128,20 @@ def visit(path): print("visiting " + path) for skip in SKIP_EXTS: - if path.endswith('.' + skip): + if path.endswith("." + skip): if FLAGS.verbose: print("skipping due to extension: " + path) return True for skip in SKIP_PATHS: if str(pathlib.Path(path).resolve()).startswith( - str(repo_abs_path.joinpath(skip).resolve())): + str(repo_abs_path.joinpath(skip).resolve()) + ): if FLAGS.verbose: print("skipping due to path prefix: " + path) return True - with open(path, 'r') as f: + with open(path, "r") as f: first_line = True line = None try: @@ -126,9 +152,13 @@ def visit(path): # start of the file if first_line: first_line = False - if (fline.startswith("#!") or fline.startswith("..") or - fline.startswith("