Commit fd00801

Merge branch 'main' of github.com:triton-inference-server/server into rmccormick-pb-bf16
2 parents c41a83d + fb056b1 commit fd00801

File tree

17 files changed (+611 -103 lines)


Dockerfile.sdk

+38 -10

@@ -32,8 +32,10 @@
 ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-py3-min

 ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
+ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo
 ARG TRITON_COMMON_REPO_TAG=main
 ARG TRITON_CORE_REPO_TAG=main
+ARG TRITON_CLIENT_REPO_TAG=main
 ARG TRITON_THIRD_PARTY_REPO_TAG=main
 ARG TRITON_MODEL_ANALYZER_REPO_TAG=main
 ARG TRITON_ENABLE_GPU=ON

@@ -103,8 +105,10 @@ RUN rm -f /usr/bin/python && \
 # Build the client library and examples
 ARG TRITON_REPO_ORGANIZATION
 ARG TRITON_CLIENT_REPO_SUBDIR
+ARG TRITON_PA_REPO_SUBDIR
 ARG TRITON_COMMON_REPO_TAG
 ARG TRITON_CORE_REPO_TAG
+ARG TRITON_CLIENT_REPO_TAG
 ARG TRITON_THIRD_PARTY_REPO_TAG
 ARG TRITON_ENABLE_GPU
 ARG JAVA_BINDINGS_MAVEN_VERSION

@@ -114,26 +118,53 @@ ARG TARGETPLATFORM
 WORKDIR /workspace
 COPY TRITON_VERSION .
 COPY ${TRITON_CLIENT_REPO_SUBDIR} client
+COPY ${TRITON_PA_REPO_SUBDIR} perf_analyzer

-WORKDIR /workspace/build
+WORKDIR /workspace/client_build
 RUN cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
       -DTRITON_VERSION=`cat /workspace/TRITON_VERSION` \
       -DTRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION} \
       -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
       -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \
       -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \
+      -DTRITON_ENABLE_PERF_ANALYZER=OFF \
       -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON \
-      -DTRITON_ENABLE_PYTHON_HTTP=ON -DTRITON_ENABLE_PYTHON_GRPC=ON \
+      -DTRITON_ENABLE_PYTHON_HTTP=OFF -DTRITON_ENABLE_PYTHON_GRPC=OFF \
       -DTRITON_ENABLE_JAVA_HTTP=ON \
-      -DTRITON_ENABLE_PERF_ANALYZER=ON \
+      -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON \
+      -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} /workspace/client
+RUN make -j16 cc-clients java-clients && \
+    rm -fr ~/.m2
+
+# TODO: PA will rebuild the CC clients since it depends on it.
+# This should be optimized so that we do not have to build
+# the CC clients twice. Similarly, because the SDK expectation is
+# that PA is packaged with the python client, we hold off on building
+# the python client until now. Post-migration we should focus
+# effort on de-tangling these flows.
+WORKDIR /workspace/pa_build
+RUN cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
+      -DTRITON_VERSION=`cat /workspace/TRITON_VERSION` \
+      -DTRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION} \
+      -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
+      -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \
+      -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \
       -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \
       -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \
       -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \
       -DTRITON_ENABLE_PERF_ANALYZER_OPENAI=ON \
-      -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON \
-      -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} /workspace/client
-RUN make -j16 cc-clients python-clients java-clients && \
-    rm -fr ~/.m2
+      -DTRITON_ENABLE_CC_HTTP=ON \
+      -DTRITON_ENABLE_CC_GRPC=ON \
+      -DTRITON_ENABLE_PYTHON_HTTP=ON \
+      -DTRITON_ENABLE_PYTHON_GRPC=ON \
+      -DTRITON_PACKAGE_PERF_ANALYZER=ON \
+      -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} \
+      /workspace/perf_analyzer
+RUN make -j16 perf-analyzer python-clients
+
+RUN pip3 install build \
+    && cd /workspace/perf_analyzer/genai-perf \
+    && python3 -m build --wheel --outdir /workspace/install/python

 # Install Java API Bindings
 RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \

@@ -144,9 +175,6 @@ RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
       --jar-install-path /workspace/install/java-api-bindings; \
     fi

-RUN pip3 install build \
-    && cd /workspace/client/src/c++/perf_analyzer/genai-perf \
-    && python3 -m build --wheel --outdir /workspace/install/python
 ############################################################################
 ## Create sdk container
 ############################################################################

docs/user_guide/model_configuration.md

+34

@@ -598,6 +598,40 @@ input1: [4, 4, 6] <== shape of this tensor [3]
 Currently, only TensorRT supports shape tensors. Read [Shape Tensor I/O](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#shape_tensor_io)
 to learn more about shape tensors.

+## Non-Linear I/O Formats
+
+For models that process input or output data in non-linear formats, the _is_non_linear_format_io_ property
+must be set. The following example model configuration shows how to specify that INPUT0 and INPUT1 use non-linear I/O data formats.
+
+```
+name: "mytensorrtmodel"
+platform: "tensorrt_plan"
+max_batch_size: 8
+input [
+  {
+    name: "INPUT0"
+    data_type: TYPE_FP16
+    dims: [ 3,224,224 ]
+    is_non_linear_format_io: true
+  },
+  {
+    name: "INPUT1"
+    data_type: TYPE_FP16
+    dims: [ 3,224,224 ]
+    is_non_linear_format_io: true
+  }
+]
+output [
+  {
+    name: "OUTPUT0"
+    data_type: TYPE_FP16
+    dims: [ 1,3 ]
+  }
+]
+```
+
+Currently, only TensorRT supports this property. To learn more about I/O formats, refer to the [I/O Formats documentation](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#reformat-free-network-tensors).
+
 ## Version Policy

 Each model can have one or more
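
A quick way to confirm the property took effect on a deployed model is to read the configuration back through the client API. The sketch below is illustrative only and not part of this commit: it assumes a server listening on localhost:8001, a model named mytensorrtmodel as in the example above, and a tritonclient build whose model_config protobuf already includes the is_non_linear_format_io field.

```python
# Illustrative sketch (not from this commit): list which I/O tensors of a
# loaded model are marked as non-linear format I/O.
# Assumptions: server at localhost:8001, model name "mytensorrtmodel", and a
# tritonclient whose model_config protobuf carries is_non_linear_format_io.
import tritonclient.grpc as tritongrpcclient

triton_client = tritongrpcclient.InferenceServerClient("localhost:8001")
config = triton_client.get_model_config("mytensorrtmodel").config

for tensor in list(config.input) + list(config.output):
    # getattr() with a default keeps the sketch usable even if the field is
    # absent in an older generated protobuf.
    print(tensor.name, getattr(tensor, "is_non_linear_format_io", False))
```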

qa/L0_input_validation/input_validation_test.py

+72

@@ -34,6 +34,7 @@
 import infer_util as iu
 import numpy as np
 import tritonclient.grpc as tritongrpcclient
+import tritonclient.utils.shared_memory as shm
 from tritonclient.utils import InferenceServerException, np_to_triton_dtype



@@ -211,6 +212,77 @@ def get_input_array(input_size, np_dtype):
             err_str,
         )

+    def test_wrong_input_shape_tensor_size(self):
+        def inference_helper(model_name, batch_size=1):
+            triton_client = tritongrpcclient.InferenceServerClient("localhost:8001")
+            if batch_size > 1:
+                dummy_input_data = np.random.rand(batch_size, 32, 32).astype(np.float32)
+            else:
+                dummy_input_data = np.random.rand(32, 32).astype(np.float32)
+            shape_tensor_data = np.asarray([4, 4], dtype=np.int32)
+
+            # Pass incorrect input byte size data for shape tensor
+            # Use shared memory to bypass the shape check in client library
+            input_byte_size = (shape_tensor_data.size - 1) * np.dtype(np.int32).itemsize
+
+            input_shm_handle = shm.create_shared_memory_region(
+                "INPUT0_SHM",
+                "/INPUT0_SHM",
+                input_byte_size,
+            )
+            shm.set_shared_memory_region(
+                input_shm_handle,
+                [
+                    shape_tensor_data,
+                ],
+            )
+            triton_client.register_system_shared_memory(
+                "INPUT0_SHM",
+                "/INPUT0_SHM",
+                input_byte_size,
+            )
+
+            inputs = [
+                tritongrpcclient.InferInput(
+                    "DUMMY_INPUT0",
+                    dummy_input_data.shape,
+                    np_to_triton_dtype(np.float32),
+                ),
+                tritongrpcclient.InferInput(
+                    "INPUT0",
+                    shape_tensor_data.shape,
+                    np_to_triton_dtype(np.int32),
+                ),
+            ]
+            inputs[0].set_data_from_numpy(dummy_input_data)
+            inputs[1].set_shared_memory("INPUT0_SHM", input_byte_size)
+
+            outputs = [
+                tritongrpcclient.InferRequestedOutput("DUMMY_OUTPUT0"),
+                tritongrpcclient.InferRequestedOutput("OUTPUT0"),
+            ]
+
+            try:
+                # Perform inference
+                with self.assertRaises(InferenceServerException) as e:
+                    triton_client.infer(
+                        model_name=model_name, inputs=inputs, outputs=outputs
+                    )
+                err_str = str(e.exception)
+                correct_input_byte_size = (
+                    shape_tensor_data.size * np.dtype(np.int32).itemsize
+                )
+                self.assertIn(
+                    f"input byte size mismatch for input 'INPUT0' for model '{model_name}'. Expected {correct_input_byte_size}, got {input_byte_size}",
+                    err_str,
+                )
+            finally:
+                shm.destroy_shared_memory_region(input_shm_handle)
+                triton_client.unregister_system_shared_memory("INPUT0_SHM")
+
+        inference_helper(model_name="plan_nobatch_zero_1_float32_int32")
+        inference_helper(model_name="plan_zero_1_float32_int32", batch_size=8)
+

 if __name__ == "__main__":
     unittest.main()
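
For contrast with the negative test above, the sketch below registers the same shape tensor with its full byte size (size * itemsize, the "Expected" value in the asserted error message), so the request is not rejected for a byte-size mismatch. This is an illustrative sketch, not part of the commit; it assumes the same localhost:8001 endpoint and that the plan_nobatch_zero_1_float32_int32 model used by the test is loaded.

```python
# Illustrative counterpart to the test above (not from this commit): register
# the shape tensor with its full byte size so the server-side byte-size check
# does not reject the request.
# Assumptions: server at localhost:8001 with plan_nobatch_zero_1_float32_int32
# loaded from the QA model repository.
import numpy as np
import tritonclient.grpc as tritongrpcclient
import tritonclient.utils.shared_memory as shm
from tritonclient.utils import np_to_triton_dtype

triton_client = tritongrpcclient.InferenceServerClient("localhost:8001")

dummy_input_data = np.random.rand(32, 32).astype(np.float32)
shape_tensor_data = np.asarray([4, 4], dtype=np.int32)

# Full byte size of the shape tensor: size * itemsize, i.e. the "Expected"
# value reported in the error message asserted by the test.
input_byte_size = shape_tensor_data.size * np.dtype(np.int32).itemsize

input_shm_handle = shm.create_shared_memory_region(
    "INPUT0_SHM", "/INPUT0_SHM", input_byte_size
)
shm.set_shared_memory_region(input_shm_handle, [shape_tensor_data])
triton_client.register_system_shared_memory(
    "INPUT0_SHM", "/INPUT0_SHM", input_byte_size
)

inputs = [
    tritongrpcclient.InferInput(
        "DUMMY_INPUT0", dummy_input_data.shape, np_to_triton_dtype(np.float32)
    ),
    tritongrpcclient.InferInput(
        "INPUT0", shape_tensor_data.shape, np_to_triton_dtype(np.int32)
    ),
]
inputs[0].set_data_from_numpy(dummy_input_data)
inputs[1].set_shared_memory("INPUT0_SHM", input_byte_size)

outputs = [
    tritongrpcclient.InferRequestedOutput("DUMMY_OUTPUT0"),
    tritongrpcclient.InferRequestedOutput("OUTPUT0"),
]

try:
    # With a correctly sized region, the "input byte size mismatch" error that
    # the test asserts no longer applies to this request.
    triton_client.infer(
        model_name="plan_nobatch_zero_1_float32_int32", inputs=inputs, outputs=outputs
    )
finally:
    shm.destroy_shared_memory_region(input_shm_handle)
    triton_client.unregister_system_shared_memory("INPUT0_SHM")
```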

qa/L0_input_validation/test.sh

+2

@@ -123,6 +123,8 @@ dynamic_batching {
 EOL

 cp -r $DATADIR/qa_model_repository/graphdef_object_int32_int32 models/.
+cp -r $DATADIR/qa_shapetensor_model_repository/plan_nobatch_zero_1_float32_int32 models/.
+cp -r $DATADIR/qa_shapetensor_model_repository/plan_zero_1_float32_int32 models/.

 SERVER_ARGS="--model-repository=`pwd`/models"
 run_server
