From 4652f7aedea0ee4fdd0c235828fa42935f547960 Mon Sep 17 00:00:00 2001
From: pvijayakrish
Date: Tue, 23 Jul 2024 13:22:09 -0700
Subject: [PATCH 01/12] Update NGC versions post-24.07 release

---
 README.md | 8 ++++----
 build.py | 6 +++---
 deploy/aws/values.yaml | 4 ++--
 deploy/fleetcommand/Chart.yaml | 2 +-
 deploy/fleetcommand/values.yaml | 6 +++---
 deploy/gcp/values.yaml | 4 ++--
 .../perf-analyzer-script/triton_client.yaml | 2 +-
 .../server-deployer/build_and_push.sh | 6 +++---
 .../server-deployer/chart/triton/Chart.yaml | 4 ++--
 .../server-deployer/chart/triton/values.yaml | 8 ++++----
 .../server-deployer/data-test/schema.yaml | 2 +-
 .../server-deployer/schema.yaml | 2 +-
 .../gke-marketplace-app/trt-engine/README.md | 2 +-
 deploy/k8s-onprem/values.yaml | 4 ++--
 deploy/oci/values.yaml | 4 ++--
 docs/customization_guide/build.md | 6 +++---
 docs/customization_guide/compose.md | 18 +++++++++---------
 docs/customization_guide/test.md | 2 +-
 docs/generate_docs.py | 4 ++--
 docs/user_guide/custom_operations.md | 6 +++---
 docs/user_guide/performance_tuning.md | 4 ++--
 qa/common/gen_jetson_trt_models | 2 +-
 qa/common/gen_qa_custom_ops | 2 +-
 qa/common/gen_qa_model_repository | 2 +-
 24 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/README.md b/README.md
index 38b4759c48..dcf0a3420f 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@
 > [!WARNING]
 > ##### LATEST RELEASE
 > You are currently on the `main` branch which tracks under-development progress towards the next release.
-> The current release is version [2.47.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.06 container release on NVIDIA GPU Cloud (NGC).
+> The current release is version [2.48.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.07 container release on NVIDIA GPU Cloud (NGC).
 
 Triton Inference Server is an open source inference serving software that
 streamlines AI inferencing. Triton enables teams to deploy any AI model from
@@ -91,16 +91,16 @@ Inference Server with the
 
 ```bash
 # Step 1: Create the example model repository
-git clone -b r24.06 https://github.com/triton-inference-server/server.git
+git clone -b r24.07 https://github.com/triton-inference-server/server.git
 cd server/docs/examples
 ./fetch_models.sh
 
 # Step 2: Launch triton from the NGC Triton container
-docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.06-py3 tritonserver --model-repository=/models
+docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.07-py3 tritonserver --model-repository=/models
 
 # Step 3: Sending an Inference Request
 # In a separate console, launch the image_client example from the NGC Triton SDK container
-docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.06-py3-sdk
+docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.07-py3-sdk
 /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg
 
 # Inference should return the following
diff --git a/build.py b/build.py
index 24bde0f3a4..7b59680151 100755
--- a/build.py
+++ b/build.py
@@ -69,9 +69,9 @@
 # incorrectly load the other version of the openvino libraries.
 #
 TRITON_VERSION_MAP = {
-    "2.48.0dev": (
-        "24.06dev",  # triton container
-        "24.06",  # upstream container
+    "2.49.0dev": (
+        "24.07",  # triton container
+        "24.07",  # upstream container
         "1.18.1",  # ORT
         "2024.0.0",  # ORT OpenVINO
         "2024.0.0",  # Standalone OpenVINO
diff --git a/deploy/aws/values.yaml b/deploy/aws/values.yaml
index 7fd88c5a04..85c70799af 100644
--- a/deploy/aws/values.yaml
+++ b/deploy/aws/values.yaml
@@ -27,7 +27,7 @@
 replicaCount: 1
 
 image:
-  imageName: nvcr.io/nvidia/tritonserver:24.06-py3
+  imageName: nvcr.io/nvidia/tritonserver:24.07-py3
   pullPolicy: IfNotPresent
   modelRepositoryPath: s3://triton-inference-server-repository/model_repository
   numGpus: 1
@@ -38,4 +38,4 @@ service:
 secret:
   region: AWS_REGION
   id: AWS_SECRET_KEY_ID
-  key: AWS_SECRET_ACCESS_KEY
\ No newline at end of file
+  key: AWS_SECRET_ACCESS_KEY
diff --git a/deploy/fleetcommand/Chart.yaml b/deploy/fleetcommand/Chart.yaml
index cca541167c..a9668dcf4c 100644
--- a/deploy/fleetcommand/Chart.yaml
+++ b/deploy/fleetcommand/Chart.yaml
@@ -26,7 +26,7 @@
 apiVersion: v1
 # appVersion is the Triton version; update when changing release
-appVersion: "2.47.0"
+appVersion: "2.48.0"
 description: Triton Inference Server (Fleet Command)
 name: triton-inference-server
 # version is the Chart version; update when changing anything in the chart
diff --git a/deploy/fleetcommand/values.yaml b/deploy/fleetcommand/values.yaml
index 3f7d95ea45..9b86a5a495 100644
--- a/deploy/fleetcommand/values.yaml
+++ b/deploy/fleetcommand/values.yaml
@@ -27,7 +27,7 @@
 replicaCount: 1
 
 image:
-  imageName: nvcr.io/nvidia/tritonserver:24.06-py3
+  imageName: nvcr.io/nvidia/tritonserver:24.07-py3
   pullPolicy: IfNotPresent
   numGpus: 1
   serverCommand: tritonserver
@@ -47,13 +47,13 @@ image:
    #
    # To set model control mode, uncomment and configure below
    # TODO: Fix the following url, it is invalid
-   # See https://github.com/triton-inference-server/server/blob/r24.06/docs/model_management.md
+   # See https://github.com/triton-inference-server/server/blob/r24.07/docs/model_management.md
    # for more details
    #- --model-control-mode=explicit|poll|none
    #
    # Additional server args
    #
-   # see https://github.com/triton-inference-server/server/blob/r24.06/README.md
+   # see https://github.com/triton-inference-server/server/blob/r24.07/README.md
    # for more details
 
 service:
diff --git a/deploy/gcp/values.yaml b/deploy/gcp/values.yaml
index cd45058c9d..6e53efc103 100644
--- a/deploy/gcp/values.yaml
+++ b/deploy/gcp/values.yaml
@@ -27,10 +27,10 @@
 replicaCount: 1
 
 image:
-  imageName: nvcr.io/nvidia/tritonserver:24.06-py3
+  imageName: nvcr.io/nvidia/tritonserver:24.07-py3
   pullPolicy: IfNotPresent
   modelRepositoryPath: gs://triton-inference-server-repository/model_repository
   numGpus: 1
 
 service:
-  type: LoadBalancer
\ No newline at end of file
+  type: LoadBalancer
diff --git a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml
index ddbfeeda1f..d101fa910b 100644
--- a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml
+++ b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml
@@ -33,7 +33,7 @@ metadata:
   namespace: default
 spec:
   containers:
-  - image: nvcr.io/nvidia/tritonserver:24.06-py3-sdk
+  - image: nvcr.io/nvidia/tritonserver:24.07-py3-sdk
     imagePullPolicy: Always
     name: nv-triton-client
    securityContext:
diff --git a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh
index 04b7eb9b7f..1091d961b4 100755
--- a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh
+++ b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh
@@ -27,9 +27,9 @@
 export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/')
 export APP_NAME=tritonserver
-export MAJOR_VERSION=2.45
-export MINOR_VERSION=2.45.0
-export NGC_VERSION=24.06-py3
+export MAJOR_VERSION=2.48
+export MINOR_VERSION=2.48.0
+export NGC_VERSION=24.07-py3
 
 docker pull nvcr.io/nvidia/$APP_NAME:$NGC_VERSION
 
diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml
index 356f25efa3..295271466d 100644
--- a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml
+++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml
@@ -25,7 +25,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 apiVersion: v1
-appVersion: "2.47"
+appVersion: "2.48"
 description: Triton Inference Server
 name: triton-inference-server
-version: 2.47.0
+version: 2.48.0
diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml
index ef88d0109b..3d460f8aa0 100644
--- a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml
+++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -31,14 +31,14 @@ maxReplicaCount: 3
 tritonProtocol: HTTP
 # HPA GPU utilization autoscaling target
 HPATargetAverageValue: 85
-modelRepositoryPath: gs://triton_sample_models/24.06
-publishedVersion: '2.47.0'
+modelRepositoryPath: gs://triton_sample_models/24.07
+publishedVersion: '2.48.0'
 gcpMarketplace: true
 
 image:
   registry: gcr.io
   repository: nvidia-ngc-public/tritonserver
-  tag: 24.06-py3
+  tag: 24.07-py3
   pullPolicy: IfNotPresent
   # modify the model repository here to match your GCP storage bucket
   numGpus: 1
diff --git a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml
index 979bfe15a9..8136ad5834 100644
--- a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml
+++ b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml
@@ -27,7 +27,7 @@
 x-google-marketplace:
   schemaVersion: v2
   applicationApiVersion: v1beta1
-  publishedVersion: '2.47.0'
+  publishedVersion: '2.48.0'
   publishedVersionMetadata:
     releaseNote: >-
       Initial release.
diff --git a/deploy/gke-marketplace-app/server-deployer/schema.yaml b/deploy/gke-marketplace-app/server-deployer/schema.yaml
index 43ad5c8535..1324f4cc3d 100644
--- a/deploy/gke-marketplace-app/server-deployer/schema.yaml
+++ b/deploy/gke-marketplace-app/server-deployer/schema.yaml
@@ -27,7 +27,7 @@
 x-google-marketplace:
   schemaVersion: v2
   applicationApiVersion: v1beta1
-  publishedVersion: '2.47.0'
+  publishedVersion: '2.48.0'
   publishedVersionMetadata:
     releaseNote: >-
       Initial release.
diff --git a/deploy/gke-marketplace-app/trt-engine/README.md b/deploy/gke-marketplace-app/trt-engine/README.md
index 6fc22d1e9a..b23b405cb9 100644
--- a/deploy/gke-marketplace-app/trt-engine/README.md
+++ b/deploy/gke-marketplace-app/trt-engine/README.md
@@ -33,7 +33,7 @@
 ```
 docker run --gpus all -it --network host \
   --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \
-  -v ~:/scripts nvcr.io/nvidia/tensorrt:24.06-py3
+  -v ~:/scripts nvcr.io/nvidia/tensorrt:24.07-py3
 
 pip install onnx six torch tf2onnx tensorflow
 
diff --git a/deploy/k8s-onprem/values.yaml b/deploy/k8s-onprem/values.yaml
index f3e275c196..23d5b2581c 100644
--- a/deploy/k8s-onprem/values.yaml
+++ b/deploy/k8s-onprem/values.yaml
@@ -29,7 +29,7 @@ tags:
 loadBalancing: true
 
 image:
-  imageName: nvcr.io/nvidia/tritonserver:24.06-py3
+  imageName: nvcr.io/nvidia/tritonserver:24.07-py3
   pullPolicy: IfNotPresent
   modelRepositoryServer: < Replace with the IP Address of your file server >
   modelRepositoryPath: /srv/models
@@ -80,4 +80,4 @@ prometheus-adapter:
       name:
         matches: "nv_inference_queue_duration_us"
         as: "avg_time_queue_us"
-      metricsQuery: 'avg(delta(nv_inference_queue_duration_us{<<.LabelMatchers>>}[30s])/(1+delta(nv_inference_request_success{<<.LabelMatchers>>}[30s]))) by (<<.GroupBy>>)'
\ No newline at end of file
+      metricsQuery: 'avg(delta(nv_inference_queue_duration_us{<<.LabelMatchers>>}[30s])/(1+delta(nv_inference_request_success{<<.LabelMatchers>>}[30s]))) by (<<.GroupBy>>)'
diff --git a/deploy/oci/values.yaml b/deploy/oci/values.yaml
index 466bb18a3b..045d1c6768 100644
--- a/deploy/oci/values.yaml
+++ b/deploy/oci/values.yaml
@@ -27,7 +27,7 @@
 replicaCount: 1
 
 image:
-  imageName: nvcr.io/nvidia/tritonserver:24.06-py3
+  imageName: nvcr.io/nvidia/tritonserver:24.07-py3
   pullPolicy: IfNotPresent
   modelRepositoryPath: s3://https://.compat.objectstorage..oraclecloud.com:443/triton-inference-server-repository
   numGpus: 1
@@ -38,4 +38,4 @@ service:
 secret:
   region: OCI_REGION
   id: OCI_SECRET_KEY_ID
-  key: OCI_SECRET_ACCESS_KEY
\ No newline at end of file
+  key: OCI_SECRET_ACCESS_KEY
diff --git a/docs/customization_guide/build.md b/docs/customization_guide/build.md
index bdcdad09ec..b5fe0dbf43 100644
--- a/docs/customization_guide/build.md
+++ b/docs/customization_guide/build.md
@@ -173,7 +173,7 @@ $ ./build.py ... --repo-tag=common:<container tag> --repo-tag=core:<container tag>
 
 If you are building on a release branch then `<container tag>` will
 default to the branch name. For example, if you are
-building on the r24.06 branch, `<container tag>` will default to r24.06. If you are
+building on the r24.07 branch, `<container tag>` will default to r24.07. If you are
 building on any other branch (including the *main* branch) then
 `<container tag>` will default to "main". Therefore, you typically do
 not need to provide `<container tag>` at all (nor the preceding
@@ -334,8 +334,8 @@ python build.py --cmake-dir=/build --build-dir=/tmp/citritonbuild
 If you are building on *main* branch then '<container tag>' will
 default to "main". If you are building on a release branch then
 '<container tag>' will default to the branch name. For example, if you
-are building on the r24.06 branch, '<container tag>' will default to
-r24.06. Therefore, you typically do not need to provide '<container
+are building on the r24.07 branch, '<container tag>' will default to
+r24.07. Therefore, you typically do not need to provide '<container
 tag>' at all (nor the preceding colon). You can use a different
 '<container tag>' for a component to instead use the corresponding
 branch/tag in the build. For example, if you have a branch called
diff --git a/docs/customization_guide/compose.md b/docs/customization_guide/compose.md
index 40562603bf..48ecea5ea7 100644
--- a/docs/customization_guide/compose.md
+++ b/docs/customization_guide/compose.md
@@ -46,8 +46,8 @@ The `compose.py` script can be found in the
 Simply clone the repository and run `compose.py` to create a custom container.
 Note: Created container version will depend on the branch that was cloned.
 For example branch
- [r24.06](https://github.com/triton-inference-server/server/tree/r24.06)
-should be used to create a image based on the NGC 24.06 Triton release.
+ [r24.07](https://github.com/triton-inference-server/server/tree/r24.07)
+should be used to create a image based on the NGC 24.07 Triton release.
 
 `compose.py` provides `--backend`, `--repoagent` options that allow you to
 specify which backends and repository agents to include in the custom image.
@@ -79,20 +79,20 @@ For example, running
 ```
 python3 compose.py --backend pytorch --repoagent checksum
 ```
-on branch [r24.06](https://github.com/triton-inference-server/server/tree/r24.06) pulls:
-- `min` container `nvcr.io/nvidia/tritonserver:24.06-py3-min`
-- `full` container `nvcr.io/nvidia/tritonserver:24.06-py3`
+on branch [r24.07](https://github.com/triton-inference-server/server/tree/r24.07) pulls:
+- `min` container `nvcr.io/nvidia/tritonserver:24.07-py3-min`
+- `full` container `nvcr.io/nvidia/tritonserver:24.07-py3`
 
 Alternatively, users can specify the version of Triton container
 to pull from any branch by either:
 1. Adding flag `--container-version <container_version>` to branch
 ```
-python3 compose.py --backend pytorch --repoagent checksum --container-version 24.06
+python3 compose.py --backend pytorch --repoagent checksum --container-version 24.07
 ```
 2. Specifying `--image min,<min_container_image> --image full,<full_container_image>`.
    The user is responsible for specifying compatible `min` and `full` containers.
 ```
-python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.06-py3-min --image full,nvcr.io/nvidia/tritonserver:24.06-py3
+python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.07-py3-min --image full,nvcr.io/nvidia/tritonserver:24.07-py3
 ```
 Method 1 and 2 will result in the same composed container. Furthermore,
 `--image` flag overrides the `--container-version` flag when both are specified.
@@ -103,8 +103,8 @@ Note:
 2. vLLM and TensorRT-LLM backends are currently not supported backends for
 `compose.py`. If you want to build additional backends on top of these backends,
 it would be better to [build it yourself](#build-it-yourself) by using
-`nvcr.io/nvidia/tritonserver:24.06-vllm-python-py3` or
-`nvcr.io/nvidia/tritonserver:24.06-trtllm-python-py3` as a `min` container.
+`nvcr.io/nvidia/tritonserver:24.07-vllm-python-py3` or
+`nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` as a `min` container.
 
 ### CPU-only container composition
 
diff --git a/docs/customization_guide/test.md b/docs/customization_guide/test.md
index a1b10dcf35..7ee68fd6b2 100644
--- a/docs/customization_guide/test.md
+++ b/docs/customization_guide/test.md
@@ -49,7 +49,7 @@ $ ./gen_qa_custom_ops
 ```
 
 This will create multiple model repositories in /tmp/<version>/qa_*
-(for example /tmp/24.06/qa_model_repository). The TensorRT models
+(for example /tmp/24.07/qa_model_repository). The TensorRT models
 will be created for the GPU on the system that CUDA considers device 0 (zero).
 If you have multiple GPUs on your system see the documentation
 in the scripts for how to target a specific GPU.
diff --git a/docs/generate_docs.py b/docs/generate_docs.py
index 9c7dd5931e..1cc6644fde 100755
--- a/docs/generate_docs.py
+++ b/docs/generate_docs.py
@@ -43,11 +43,11 @@
 """
 TODO: Needs to handle cross-branch linkage.
-For example, server/docs/user_guide/architecture.md on branch 24.06 links to
+For example, server/docs/user_guide/architecture.md on branch 24.07 links to
 server/docs/user_guide/model_analyzer.md on main branch. In this case, the
 hyperlink of model_analyzer.md should be a URL instead of relative path.
-Another example can be server/docs/user_guide/model_analyzer.md on branch 24.06
+Another example can be server/docs/user_guide/model_analyzer.md on branch 24.07
 links to a file in server repo with relative path. Currently all URLs are
 hardcoded to main branch. We need to make sure that the URL actually points to
 the correct branch. We also need to handle cases like deprecated or removed files from
diff --git a/docs/user_guide/custom_operations.md b/docs/user_guide/custom_operations.md
index 89e6216011..263d013ff2 100644
--- a/docs/user_guide/custom_operations.md
+++ b/docs/user_guide/custom_operations.md
@@ -64,7 +64,7 @@ simple way to ensure you are using the correct version of TensorRT is
 to use the
 [NGC TensorRT container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorrt)
 corresponding to the Triton container. For example, if you are using
-the 24.06 version of Triton, use the 24.06 version of the TensorRT
+the 24.07 version of Triton, use the 24.07 version of the TensorRT
 container.
 
 ## TensorFlow
@@ -123,7 +123,7 @@ simple way to ensure you are using the correct version of TensorFlow is
 to use the
 [NGC TensorFlow container](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow)
 corresponding to the Triton container. For example, if you are using
-the 24.06 version of Triton, use the 24.06 version of the TensorFlow
+the 24.07 version of Triton, use the 24.07 version of the TensorFlow
 container.
 
 ## PyTorch
@@ -167,7 +167,7 @@ simple way to ensure you are using the correct version of PyTorch is
 to use the
 [NGC PyTorch container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
 corresponding to the Triton container. For example, if you are using
-the 24.06 version of Triton, use the 24.06 version of the PyTorch
+the 24.07 version of Triton, use the 24.07 version of the PyTorch
 container.
 
 ## ONNX
diff --git a/docs/user_guide/performance_tuning.md b/docs/user_guide/performance_tuning.md
index f67e238c6d..2a3ee09d2a 100644
--- a/docs/user_guide/performance_tuning.md
+++ b/docs/user_guide/performance_tuning.md
@@ -235,7 +235,7 @@ with a `tritonserver` binary.
 
 ```bash
 # Start server container
-docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.06-py3
+docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-server nvcr.io/nvidia/tritonserver:24.07-py3
 
 # Start serving your models
 tritonserver --model-repository=/mnt/models
@@ -284,7 +284,7 @@ by setting the `-u` flag, such as `perf_analyzer -m densenet_onnx -u
 
 ```bash
 # Start the SDK container interactively
-docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.06-py3-sdk
+docker run -ti --rm --gpus=all --network=host -v $PWD:/mnt --name triton-client nvcr.io/nvidia/tritonserver:24.07-py3-sdk
 
 # Benchmark model being served from step 3
 perf_analyzer -m densenet_onnx --concurrency-range 1:4
diff --git a/qa/common/gen_jetson_trt_models b/qa/common/gen_jetson_trt_models
index 8c5a74a3ec..160282240c 100755
--- a/qa/common/gen_jetson_trt_models
+++ b/qa/common/gen_jetson_trt_models
@@ -34,7 +34,7 @@
 # Make all generated files accessible outside of container
 umask 0000
 # Set the version of the models
-TRITON_VERSION=${TRITON_VERSION:=24.06}
+TRITON_VERSION=${TRITON_VERSION:=24.07}
 # Set the CUDA device to use
 CUDA_DEVICE=${RUNNER_ID:=0}
 # Set TensorRT image
diff --git a/qa/common/gen_qa_custom_ops b/qa/common/gen_qa_custom_ops
index d8ca748f8a..d8a41e9f55 100755
--- a/qa/common/gen_qa_custom_ops
+++ b/qa/common/gen_qa_custom_ops
@@ -37,7 +37,7 @@
 ##
 ############################################################################
 
-TRITON_VERSION=${TRITON_VERSION:=24.06}
+TRITON_VERSION=${TRITON_VERSION:=24.07}
 NVIDIA_UPSTREAM_VERSION=${NVIDIA_UPSTREAM_VERSION:=$TRITON_VERSION}
 TENSORFLOW_IMAGE=${TENSORFLOW_IMAGE:=nvcr.io/nvidia/tensorflow:$NVIDIA_UPSTREAM_VERSION-tf2-py3}
 PYTORCH_IMAGE=${PYTORCH_IMAGE:=nvcr.io/nvidia/pytorch:$NVIDIA_UPSTREAM_VERSION-py3}
diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository
index 96131107d3..cab497aa86 100755
--- a/qa/common/gen_qa_model_repository
+++ b/qa/common/gen_qa_model_repository
@@ -48,7 +48,7 @@
 ##
 ############################################################################
 
-TRITON_VERSION=${TRITON_VERSION:=24.06}
+TRITON_VERSION=${TRITON_VERSION:=24.07}
 
 # ONNX. Use ONNX_OPSET 0 to use the default for ONNX version
 ONNX_VERSION=1.13.0

From b93cb6b921178bf927aca1c5a7ca59a4adc43702 Mon Sep 17 00:00:00 2001
From: Francesco Petrini
Date: Mon, 8 Jul 2024 15:39:08 -0700
Subject: [PATCH 02/12] Update README and versions for 2.48.0 / 24.07 (#7425)

* Update README and versions for 2.48.0 / 24.07

---
 Dockerfile.sdk | 2 +-
 Dockerfile.win10.min | 20 +++++++++----------
 README.md | 12 ++++++-----
 TRITON_VERSION | 2 +-
 build.py | 2 +-
 deploy/aws/values.yaml | 2 +-
 deploy/fleetcommand/Chart.yaml | 2 +-
 deploy/fleetcommand/values.yaml | 2 +-
 deploy/gcp/values.yaml | 2 +-
 .../perf-analyzer-script/triton_client.yaml | 2 +-
 .../server-deployer/build_and_push.sh | 2 +-
 .../server-deployer/chart/triton/Chart.yaml | 2 +-
 .../server-deployer/data-test/schema.yaml | 2 +-
 .../server-deployer/schema.yaml | 4 ++--
 .../gke-marketplace-app/trt-engine/README.md | 6 +++---
 deploy/k8s-onprem/values.yaml | 2 +-
 deploy/oci/values.yaml | 2 +-
 docs/customization_guide/build.md | 2 +-
 docs/customization_guide/compose.md | 2 +-
 docs/customization_guide/test.md | 2 +-
 docs/user_guide/custom_operations.md | 2 +-
 docs/user_guide/performance_tuning.md | 2 +-
 qa/common/gen_jetson_trt_models | 2 +-
 qa/common/gen_qa_custom_ops | 2 +-
 24 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/Dockerfile.sdk b/Dockerfile.sdk
index 9e83ecca47..e92b4bcb89 100644
--- a/Dockerfile.sdk
+++ b/Dockerfile.sdk
@@ -29,7 +29,7 @@
 #
 
 # Base image on the minimum Triton container
-ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.06-py3-min
+ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-py3-min
 
 ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
 ARG TRITON_COMMON_REPO_TAG=main
diff --git a/Dockerfile.win10.min b/Dockerfile.win10.min
index 7d954d62de..0a554fbcf4 100644
--- a/Dockerfile.win10.min
+++ b/Dockerfile.win10.min
@@ -1,4 +1,4 @@
-# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -37,9 +37,9 @@ RUN choco install unzip -y
 #
 # Installing TensorRT
 #
-ARG TENSORRT_VERSION=10.0.1.6
-ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows10.x86_64.cuda-12.4.zip"
-ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/zip/TensorRT-10.0.1.6.Windows10.win10.cuda-12.4.zip
+ARG TENSORRT_VERSION=10.2.0.19
+ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows10.x86_64.cuda-12.5.zip"
+ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/zip/TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5.zip
 # COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP}
 ADD ${TENSORRT_SOURCE} /tmp/${TENSORRT_ZIP}
 RUN unzip /tmp/%TENSORRT_ZIP%
@@ -51,9 +51,9 @@ LABEL TENSORRT_VERSION="${TENSORRT_VERSION}"
 #
 # Installing cuDNN
 #
-ARG CUDNN_VERSION=9.1.0.70
+ARG CUDNN_VERSION=9.2.1.18
 ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip
-ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.1.0.70_cuda12-archive.zip
+ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.2.1.18_cuda12-archive.zip
 ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP}
 RUN unzip /tmp/%CUDNN_ZIP%
 RUN move cudnn-* cudnn
@@ -88,7 +88,7 @@ LABEL PYTHON_VERSION=${PYTHON_VERSION}
 #
 # Installing CMake
 #
-ARG CMAKE_VERSION=3.29.3
+ARG CMAKE_VERSION=3.30.0
 RUN pip install cmake==%CMAKE_VERSION%
 
 ENV CMAKE_TOOLCHAIN_FILE /vcpkg/scripts/buildsystems/vcpkg.cmake
@@ -150,7 +150,7 @@ WORKDIR /
 #
 ARG CUDA_MAJOR=12
 ARG CUDA_MINOR=5
-ARG CUDA_PATCH=0
+ARG CUDA_PATCH=1
 ARG CUDA_VERSION=${CUDA_MAJOR}.${CUDA_MINOR}.${CUDA_PATCH}
 ARG CUDA_PACKAGES="nvcc_${CUDA_MAJOR}.${CUDA_MINOR} \
                    cudart_${CUDA_MAJOR}.${CUDA_MINOR} \
@@ -175,7 +175,7 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi
 
 RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%"
 
-ARG CUDNN_VERSION=9.1.0.70
+ARG CUDNN_VERSION=9.2.1.18
 ENV CUDNN_VERSION ${CUDNN_VERSION}
 COPY --from=dependency_base /cudnn /cudnn
 RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\."
@@ -183,7 +183,7 @@ RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\."
 RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\."
 LABEL CUDNN_VERSION="${CUDNN_VERSION}"
 
-ARG TENSORRT_VERSION=10.0.1.6
+ARG TENSORRT_VERSION=10.2.0.19
 ENV TRT_VERSION ${TENSORRT_VERSION}
 COPY --from=dependency_base /TensorRT /TensorRT
 RUN setx PATH "c:\TensorRT\lib;%PATH%"
diff --git a/README.md b/README.md
index dcf0a3420f..8bb2302dea 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@