#!/usr/bin/python

- # Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions

@@ -38,6 +38,7 @@
import os

from tritonclient.utils import *
+ import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
import tritonclient.http as httpclient

TEST_JETSON = bool(int(os.environ.get('TEST_JETSON', 0)))
@@ -61,6 +62,13 @@ def _infer_help(self, model_name, shape, data_type):
            output0 = result.as_numpy('OUTPUT0')
            self.assertTrue(np.all(input_data_0 == output0))

+    def _create_cuda_region(self, client, size, name):
+        shm0_handle = cuda_shared_memory.create_shared_memory_region(
+            name, byte_size=size, device_id=0)
+        client.register_cuda_shared_memory(
+            name, cuda_shared_memory.get_raw_handle(shm0_handle), 0, size)
+        return shm0_handle
+
    def _optional_input_infer(self, model_name, has_input0, has_input1):
        with httpclient.InferenceServerClient("localhost:8000") as client:
            shape = (1,)
@@ -151,6 +159,64 @@ def test_growth_error(self):
        with self._shm_leak_detector.Probe() as shm_probe:
            self._infer_help(model_name, shape, dtype)

+    # CUDA shared memory is not supported on Jetson
+    def test_gpu_tensor_error(self):
+        model_name = 'identity_bool'
+        with httpclient.InferenceServerClient("localhost:8000") as client:
+            input_data = np.array([[True] * 1000], dtype=bool)
+            inputs = [
+                httpclient.InferInput("INPUT0", input_data.shape,
+                                      np_to_triton_dtype(input_data.dtype))
+            ]
+            inputs[0].set_data_from_numpy(input_data)
+
+            requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')]
+
+            # Intentionally create a shared memory region that is too small
+            # to hold the output.
+            client.unregister_cuda_shared_memory()
+            shm0_handle = self._create_cuda_region(client, 1, 'output0_data')
+
+            requested_outputs[0].set_shared_memory('output0_data', 1)
+            with self.assertRaises(InferenceServerException) as ex:
+                client.infer(model_name, inputs, outputs=requested_outputs)
+            self.assertIn(
+                "should be at least 1000 bytes to hold the results",
+                str(ex.exception))
+            client.unregister_cuda_shared_memory()
+            cuda_shared_memory.destroy_shared_memory_region(shm0_handle)
+
+    def test_dlpack_tensor_error(self):
+        model_name = 'dlpack_identity'
+        with httpclient.InferenceServerClient("localhost:8000") as client:
+            input_data = np.array([[1] * 1000], dtype=np.float32)
+            inputs = [
+                httpclient.InferInput("INPUT0", input_data.shape,
+                                      np_to_triton_dtype(input_data.dtype))
+            ]
+
+            requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')]
+            input_data_size = input_data.itemsize * input_data.size
+            client.unregister_cuda_shared_memory()
+            input_region = self._create_cuda_region(client, input_data_size,
+                                                    'input0_data')
+            inputs[0].set_shared_memory('input0_data', input_data_size)
+            cuda_shared_memory.set_shared_memory_region(
+                input_region, [input_data])
+
+            # Intentionally create a small output region to trigger an error.
+            shm0_handle = self._create_cuda_region(client, 1, 'output0_data')
+            requested_outputs[0].set_shared_memory('output0_data', 1)
+
+            with self.assertRaises(InferenceServerException) as ex:
+                client.infer(model_name, inputs, outputs=requested_outputs)
+            self.assertIn(
+                "should be at least 4000 bytes to hold the results",
+                str(ex.exception))
+            client.unregister_cuda_shared_memory()
+            cuda_shared_memory.destroy_shared_memory_region(shm0_handle)
+
    def test_async_infer(self):
        model_name = "identity_uint8"
        request_parallelism = 4
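
Below is a minimal, self-contained sketch (not part of the change above) of the CUDA shared-memory round trip these new tests exercise: allocate regions on GPU 0, register them with the server, point the request at them, and clean up afterwards. The "identity_fp32" model name and the tensor shape are placeholders, and get_contents_as_numpy is the client utility assumed here for copying the output region back to host memory. If the output region were registered with fewer bytes than the result needs, client.infer would raise InferenceServerException, which is exactly what test_gpu_tensor_error and test_dlpack_tensor_error assert.

import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
from tritonclient.utils import np_to_triton_dtype

with httpclient.InferenceServerClient("localhost:8000") as client:
    # Placeholder model and data: a hypothetical identity model with one
    # FP32 input and one FP32 output of the same shape.
    input_data = np.ones((1, 16), dtype=np.float32)
    byte_size = input_data.size * input_data.itemsize

    # Allocate CUDA shared-memory regions on GPU 0 and register them
    # with the server.
    client.unregister_cuda_shared_memory()
    in_handle = cuda_shared_memory.create_shared_memory_region(
        "input0_data", byte_size=byte_size, device_id=0)
    out_handle = cuda_shared_memory.create_shared_memory_region(
        "output0_data", byte_size=byte_size, device_id=0)
    client.register_cuda_shared_memory(
        "input0_data", cuda_shared_memory.get_raw_handle(in_handle), 0,
        byte_size)
    client.register_cuda_shared_memory(
        "output0_data", cuda_shared_memory.get_raw_handle(out_handle), 0,
        byte_size)

    # Copy the input into the GPU region and point the request at both
    # registered regions instead of sending the tensors inline.
    cuda_shared_memory.set_shared_memory_region(in_handle, [input_data])
    inputs = [httpclient.InferInput("INPUT0", input_data.shape,
                                    np_to_triton_dtype(input_data.dtype))]
    inputs[0].set_shared_memory("input0_data", byte_size)
    outputs = [httpclient.InferRequestedOutput("OUTPUT0")]
    outputs[0].set_shared_memory("output0_data", byte_size)

    # If "output0_data" had been registered with fewer than byte_size
    # bytes, this call would raise InferenceServerException.
    client.infer("identity_fp32", inputs, outputs=outputs)
    output_data = cuda_shared_memory.get_contents_as_numpy(
        out_handle, np.float32, input_data.shape)

    # Clean up: unregister from the server and free the GPU regions.
    client.unregister_cuda_shared_memory()
    cuda_shared_memory.destroy_shared_memory_region(in_handle)
    cuda_shared_memory.destroy_shared_memory_region(out_handle)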