Commit fd00801

Merge branch 'main' of github.com:triton-inference-server/server into rmccormick-pb-bf16
2 parents c41a83d + fb056b1 commit fd00801

File tree

17 files changed (+611 -103 lines)


Dockerfile.sdk

+38 -10

@@ -32,8 +32,10 @@
 ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-py3-min

 ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
+ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo
 ARG TRITON_COMMON_REPO_TAG=main
 ARG TRITON_CORE_REPO_TAG=main
+ARG TRITON_CLIENT_REPO_TAG=main
 ARG TRITON_THIRD_PARTY_REPO_TAG=main
 ARG TRITON_MODEL_ANALYZER_REPO_TAG=main
 ARG TRITON_ENABLE_GPU=ON

@@ -103,8 +105,10 @@ RUN rm -f /usr/bin/python && \
 # Build the client library and examples
 ARG TRITON_REPO_ORGANIZATION
 ARG TRITON_CLIENT_REPO_SUBDIR
+ARG TRITON_PA_REPO_SUBDIR
 ARG TRITON_COMMON_REPO_TAG
 ARG TRITON_CORE_REPO_TAG
+ARG TRITON_CLIENT_REPO_TAG
 ARG TRITON_THIRD_PARTY_REPO_TAG
 ARG TRITON_ENABLE_GPU
 ARG JAVA_BINDINGS_MAVEN_VERSION

@@ -114,26 +118,53 @@ ARG TARGETPLATFORM
 WORKDIR /workspace
 COPY TRITON_VERSION .
 COPY ${TRITON_CLIENT_REPO_SUBDIR} client
+COPY ${TRITON_PA_REPO_SUBDIR} perf_analyzer

-WORKDIR /workspace/build
+WORKDIR /workspace/client_build
 RUN cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
       -DTRITON_VERSION=`cat /workspace/TRITON_VERSION` \
       -DTRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION} \
       -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
       -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \
       -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \
+      -DTRITON_ENABLE_PERF_ANALYZER=OFF \
       -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON \
-      -DTRITON_ENABLE_PYTHON_HTTP=ON -DTRITON_ENABLE_PYTHON_GRPC=ON \
+      -DTRITON_ENABLE_PYTHON_HTTP=OFF -DTRITON_ENABLE_PYTHON_GRPC=OFF \
       -DTRITON_ENABLE_JAVA_HTTP=ON \
-      -DTRITON_ENABLE_PERF_ANALYZER=ON \
+      -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON \
+      -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} /workspace/client
+RUN make -j16 cc-clients java-clients && \
+    rm -fr ~/.m2
+
+# TODO: PA will rebuild the CC clients since it depends on it.
+# This should be optimized so that we do not have to build
+# the CC clients twice. Similarly, because the SDK expectation is
+# that PA is packaged with the python client, we hold off on building
+# the python client until now. Post-migration we should focus
+# effort on de-tangling these flows.
+WORKDIR /workspace/pa_build
+RUN cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
+      -DTRITON_VERSION=`cat /workspace/TRITON_VERSION` \
+      -DTRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION} \
+      -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
+      -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \
+      -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \
       -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \
       -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \
       -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \
       -DTRITON_ENABLE_PERF_ANALYZER_OPENAI=ON \
-      -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON \
-      -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} /workspace/client
-RUN make -j16 cc-clients python-clients java-clients && \
-    rm -fr ~/.m2
+      -DTRITON_ENABLE_CC_HTTP=ON \
+      -DTRITON_ENABLE_CC_GRPC=ON \
+      -DTRITON_ENABLE_PYTHON_HTTP=ON \
+      -DTRITON_ENABLE_PYTHON_GRPC=ON \
+      -DTRITON_PACKAGE_PERF_ANALYZER=ON \
+      -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} \
+      /workspace/perf_analyzer
+RUN make -j16 perf-analyzer python-clients
+
+RUN pip3 install build \
+    && cd /workspace/perf_analyzer/genai-perf \
+    && python3 -m build --wheel --outdir /workspace/install/python

 # Install Java API Bindings
 RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \

@@ -144,9 +175,6 @@ RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
       --jar-install-path /workspace/install/java-api-bindings; \
     fi

-RUN pip3 install build \
-    && cd /workspace/client/src/c++/perf_analyzer/genai-perf \
-    && python3 -m build --wheel --outdir /workspace/install/python
 ############################################################################
 ## Create sdk container
 ############################################################################

docs/user_guide/model_configuration.md

+34

@@ -598,6 +598,40 @@ input1: [4, 4, 6] <== shape of this tensor [3]
 Currently, only TensorRT supports shape tensors. Read [Shape Tensor I/O](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#shape_tensor_io)
 to learn more about shape tensors.

+## Non-Linear I/O Formats
+
+For models that process input or output data in non-linear formats, the _is_non_linear_format_io_ property
+must be set. The following example model configuration shows how to specify that INPUT0 and INPUT1 use non-linear I/O data formats.
+
+```
+name: "mytensorrtmodel"
+platform: "tensorrt_plan"
+max_batch_size: 8
+input [
+  {
+    name: "INPUT0"
+    data_type: TYPE_FP16
+    dims: [ 3,224,224 ]
+    is_non_linear_format_io: true
+  },
+  {
+    name: "INPUT1"
+    data_type: TYPE_FP16
+    dims: [ 3,224,224 ]
+    is_non_linear_format_io: true
+  }
+]
+output [
+  {
+    name: "OUTPUT0"
+    data_type: TYPE_FP16
+    dims: [ 1,3 ]
+  }
+]
+```
+
+Currently, only TensorRT supports this property. To learn more about I/O formats, refer to the [I/O Formats documentation](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#reformat-free-network-tensors).
+
 ## Version Policy

 Each model can have one or more
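
A quick way to confirm the property took effect on a deployed model is to read the configuration back through the client API. The sketch below is illustrative only and not part of this commit: it assumes a server listening on localhost:8001, a model named mytensorrtmodel as in the example above, and a tritonclient build whose model_config protobuf already includes the is_non_linear_format_io field.

```python
# Illustrative sketch (not from this commit): list which I/O tensors of a
# loaded model are marked as non-linear format I/O.
# Assumptions: server at localhost:8001, model name "mytensorrtmodel", and a
# tritonclient whose model_config protobuf carries is_non_linear_format_io.
import tritonclient.grpc as tritongrpcclient

triton_client = tritongrpcclient.InferenceServerClient("localhost:8001")
config = triton_client.get_model_config("mytensorrtmodel").config

for tensor in list(config.input) + list(config.output):
    # getattr() with a default keeps the sketch usable even if the field is
    # absent in an older generated protobuf.
    print(tensor.name, getattr(tensor, "is_non_linear_format_io", False))
```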

qa/L0_input_validation/input_validation_test.py

+72

@@ -34,6 +34,7 @@
 import infer_util as iu
 import numpy as np
 import tritonclient.grpc as tritongrpcclient
+import tritonclient.utils.shared_memory as shm
 from tritonclient.utils import InferenceServerException, np_to_triton_dtype



@@ -211,6 +212,77 @@ def get_input_array(input_size, np_dtype):
             err_str,
         )

+    def test_wrong_input_shape_tensor_size(self):
+        def inference_helper(model_name, batch_size=1):
+            triton_client = tritongrpcclient.InferenceServerClient("localhost:8001")
+            if batch_size > 1:
+                dummy_input_data = np.random.rand(batch_size, 32, 32).astype(np.float32)
+            else:
+                dummy_input_data = np.random.rand(32, 32).astype(np.float32)
+            shape_tensor_data = np.asarray([4, 4], dtype=np.int32)
+
+            # Pass incorrect input byte size data for shape tensor
+            # Use shared memory to bypass the shape check in client library
+            input_byte_size = (shape_tensor_data.size - 1) * np.dtype(np.int32).itemsize
+
+            input_shm_handle = shm.create_shared_memory_region(
+                "INPUT0_SHM",
+                "/INPUT0_SHM",
+                input_byte_size,
+            )
+            shm.set_shared_memory_region(
+                input_shm_handle,
+                [
+                    shape_tensor_data,
+                ],
+            )
+            triton_client.register_system_shared_memory(
+                "INPUT0_SHM",
+                "/INPUT0_SHM",
+                input_byte_size,
+            )
+
+            inputs = [
+                tritongrpcclient.InferInput(
+                    "DUMMY_INPUT0",
+                    dummy_input_data.shape,
+                    np_to_triton_dtype(np.float32),
+                ),
+                tritongrpcclient.InferInput(
+                    "INPUT0",
+                    shape_tensor_data.shape,
+                    np_to_triton_dtype(np.int32),
+                ),
+            ]
+            inputs[0].set_data_from_numpy(dummy_input_data)
+            inputs[1].set_shared_memory("INPUT0_SHM", input_byte_size)
+
+            outputs = [
+                tritongrpcclient.InferRequestedOutput("DUMMY_OUTPUT0"),
+                tritongrpcclient.InferRequestedOutput("OUTPUT0"),
+            ]
+
+            try:
+                # Perform inference
+                with self.assertRaises(InferenceServerException) as e:
+                    triton_client.infer(
+                        model_name=model_name, inputs=inputs, outputs=outputs
+                    )
+                err_str = str(e.exception)
+                correct_input_byte_size = (
+                    shape_tensor_data.size * np.dtype(np.int32).itemsize
+                )
+                self.assertIn(
+                    f"input byte size mismatch for input 'INPUT0' for model '{model_name}'. Expected {correct_input_byte_size}, got {input_byte_size}",
+                    err_str,
+                )
+            finally:
+                shm.destroy_shared_memory_region(input_shm_handle)
+                triton_client.unregister_system_shared_memory("INPUT0_SHM")
+
+        inference_helper(model_name="plan_nobatch_zero_1_float32_int32")
+        inference_helper(model_name="plan_zero_1_float32_int32", batch_size=8)
+

 if __name__ == "__main__":
     unittest.main()
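
For contrast with the negative test above, the sketch below registers the same shape tensor with its full byte size (size * itemsize, the "Expected" value in the asserted error message), so the request is not rejected for a byte-size mismatch. This is an illustrative sketch, not part of the commit; it assumes the same localhost:8001 endpoint and that the plan_nobatch_zero_1_float32_int32 model used by the test is loaded.

```python
# Illustrative counterpart to the test above (not from this commit): register
# the shape tensor with its full byte size so the server-side byte-size check
# does not reject the request.
# Assumptions: server at localhost:8001 with plan_nobatch_zero_1_float32_int32
# loaded from the QA model repository.
import numpy as np
import tritonclient.grpc as tritongrpcclient
import tritonclient.utils.shared_memory as shm
from tritonclient.utils import np_to_triton_dtype

triton_client = tritongrpcclient.InferenceServerClient("localhost:8001")

dummy_input_data = np.random.rand(32, 32).astype(np.float32)
shape_tensor_data = np.asarray([4, 4], dtype=np.int32)

# Full byte size of the shape tensor: size * itemsize, i.e. the "Expected"
# value reported in the error message asserted by the test.
input_byte_size = shape_tensor_data.size * np.dtype(np.int32).itemsize

input_shm_handle = shm.create_shared_memory_region(
    "INPUT0_SHM", "/INPUT0_SHM", input_byte_size
)
shm.set_shared_memory_region(input_shm_handle, [shape_tensor_data])
triton_client.register_system_shared_memory(
    "INPUT0_SHM", "/INPUT0_SHM", input_byte_size
)

inputs = [
    tritongrpcclient.InferInput(
        "DUMMY_INPUT0", dummy_input_data.shape, np_to_triton_dtype(np.float32)
    ),
    tritongrpcclient.InferInput(
        "INPUT0", shape_tensor_data.shape, np_to_triton_dtype(np.int32)
    ),
]
inputs[0].set_data_from_numpy(dummy_input_data)
inputs[1].set_shared_memory("INPUT0_SHM", input_byte_size)

outputs = [
    tritongrpcclient.InferRequestedOutput("DUMMY_OUTPUT0"),
    tritongrpcclient.InferRequestedOutput("OUTPUT0"),
]

try:
    # With a correctly sized region, the "input byte size mismatch" error that
    # the test asserts no longer applies to this request.
    triton_client.infer(
        model_name="plan_nobatch_zero_1_float32_int32", inputs=inputs, outputs=outputs
    )
finally:
    shm.destroy_shared_memory_region(input_shm_handle)
    triton_client.unregister_system_shared_memory("INPUT0_SHM")
```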

qa/L0_input_validation/test.sh

+2

@@ -123,6 +123,8 @@ dynamic_batching {
 EOL

 cp -r $DATADIR/qa_model_repository/graphdef_object_int32_int32 models/.
+cp -r $DATADIR/qa_shapetensor_model_repository/plan_nobatch_zero_1_float32_int32 models/.
+cp -r $DATADIR/qa_shapetensor_model_repository/plan_zero_1_float32_int32 models/.

 SERVER_ARGS="--model-repository=`pwd`/models"
 run_server
