from tritonclient.utils import *
import tritonclient.http as httpclient

+TEST_JETSON = bool(int(os.environ.get('TEST_JETSON', 0)))
+


class PythonTest(tu.TestResultCollector):
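
As an aside, a minimal sketch (not part of the change) of how the new TEST_JETSON flag resolves; it assumes the Jetson test runner exports TEST_JETSON=1 in the environment, which makes the expression truthy and keeps the CUDA shared-memory tests below from being defined:

# Illustrative only: how the TEST_JETSON environment flag resolves.
import os

os.environ["TEST_JETSON"] = "1"   # assumed to be exported by the Jetson test runner
TEST_JETSON = bool(int(os.environ.get('TEST_JETSON', 0)))
print(TEST_JETSON)                # True -> the GPU / CUDA shared-memory tests are not defined
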
@@ -59,6 +61,14 @@ def _infer_help(self, model_name, shape, data_type):
            output0 = result.as_numpy('OUTPUT0')
            self.assertTrue(np.all(input_data_0 == output0))

+    def _create_cuda_region(self, client, size, name):
+        import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
+        shm0_handle = cuda_shared_memory.create_shared_memory_region(
+            name, byte_size=size, device_id=0)
+        client.register_cuda_shared_memory(
+            name, cuda_shared_memory.get_raw_handle(shm0_handle), 0, size)
+        return shm0_handle
+
    def _optional_input_infer(self, model_name, has_input0, has_input1):
        with httpclient.InferenceServerClient("localhost:8000") as client:
            shape = (1,)
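
To make the new helper's role concrete, here is a rough standalone sketch of the full CUDA shared-memory lifecycle whose first half (create and register) _create_cuda_region wraps. It assumes a running Triton server on localhost:8000 with a CUDA-capable GPU; the identity_fp32 model and the region name are illustrative, not taken from this change.

# Illustrative sketch only; requires a running Triton server and a GPU.
import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
from tritonclient.utils import np_to_triton_dtype

with httpclient.InferenceServerClient("localhost:8000") as client:
    input_data = np.ones((1, 16), dtype=np.float32)
    byte_size = input_data.size * input_data.itemsize  # 16 * 4 = 64 bytes

    # Create a CUDA shared-memory region on GPU 0 and register it with the
    # server (this is what _create_cuda_region does for the tests).
    handle = cuda_shared_memory.create_shared_memory_region(
        "output0_data", byte_size=byte_size, device_id=0)
    client.register_cuda_shared_memory(
        "output0_data", cuda_shared_memory.get_raw_handle(handle), 0, byte_size)

    inputs = [httpclient.InferInput("INPUT0", input_data.shape,
                                    np_to_triton_dtype(input_data.dtype))]
    inputs[0].set_data_from_numpy(input_data)
    outputs = [httpclient.InferRequestedOutput("OUTPUT0")]
    outputs[0].set_shared_memory("output0_data", byte_size)

    # The output is written into the CUDA region instead of the response body.
    client.infer("identity_fp32", inputs, outputs=outputs)
    output = cuda_shared_memory.get_contents_as_numpy(
        handle, np.float32, input_data.shape)

    # Tear down: unregister on the server, then free the region locally.
    client.unregister_cuda_shared_memory("output0_data")
    cuda_shared_memory.destroy_shared_memory_region(handle)
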
@@ -144,6 +154,69 @@ def test_growth_error(self):
        with self._shm_leak_detector.Probe() as shm_probe:
            self._infer_help(model_name, shape, dtype)

+    # GPU tensors are not supported on Jetson
+    # CUDA shared memory is not supported on Jetson
+    if not TEST_JETSON:
+
+        def test_gpu_tensor_error(self):
+            import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
+            model_name = 'identity_bool'
+            with httpclient.InferenceServerClient("localhost:8000") as client:
+                input_data = np.array([[True] * 1000], dtype=bool)
+                inputs = [
+                    httpclient.InferInput("INPUT0", input_data.shape,
+                                          np_to_triton_dtype(input_data.dtype))
+                ]
+                inputs[0].set_data_from_numpy(input_data)
+
+                requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')]
+
+                # Intentionally create a shared memory region that is too small.
+                client.unregister_cuda_shared_memory()
+                shm0_handle = self._create_cuda_region(client, 1,
+                                                       'output0_data')
+
+                requested_outputs[0].set_shared_memory('output0_data', 1)
+                with self.assertRaises(InferenceServerException) as ex:
+                    client.infer(model_name, inputs, outputs=requested_outputs)
+                self.assertIn(
+                    "should be at least 1000 bytes to hold the results",
+                    str(ex.exception))
+                client.unregister_cuda_shared_memory()
+                cuda_shared_memory.destroy_shared_memory_region(shm0_handle)
+
+        def test_dlpack_tensor_error(self):
+            import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
+            model_name = 'dlpack_identity'
+            with httpclient.InferenceServerClient("localhost:8000") as client:
+                input_data = np.array([[1] * 1000], dtype=np.float32)
+                inputs = [
+                    httpclient.InferInput("INPUT0", input_data.shape,
+                                          np_to_triton_dtype(input_data.dtype))
+                ]
+
+                requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')]
+                input_data_size = input_data.itemsize * input_data.size
+                client.unregister_cuda_shared_memory()
+                input_region = self._create_cuda_region(client, input_data_size,
+                                                        'input0_data')
+                inputs[0].set_shared_memory('input0_data', input_data_size)
+                cuda_shared_memory.set_shared_memory_region(
+                    input_region, [input_data])
+
+                # Intentionally create a small region to trigger an error
+                shm0_handle = self._create_cuda_region(client, 1,
+                                                       'output0_data')
+                requested_outputs[0].set_shared_memory('output0_data', 1)
+
+                with self.assertRaises(InferenceServerException) as ex:
+                    client.infer(model_name, inputs, outputs=requested_outputs)
+                self.assertIn(
+                    "should be at least 4000 bytes to hold the results",
+                    str(ex.exception))
+                client.unregister_cuda_shared_memory()
+                cuda_shared_memory.destroy_shared_memory_region(shm0_handle)
+
    def test_async_infer(self):
        model_name = "identity_uint8"
        request_parallelism = 4
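
As a quick sanity check on the byte counts asserted by the two new tests above (a sketch, not part of the change): the BOOL identity output is 1000 elements of 1 byte each and the FP32 output is 1000 elements of 4 bytes each, which is why a 1-byte output region is rejected with the "should be at least 1000 bytes" and "should be at least 4000 bytes" messages.

# Illustrative arithmetic behind the asserted error messages.
import numpy as np

bool_output = np.array([[True] * 1000], dtype=bool)
fp32_output = np.array([[1] * 1000], dtype=np.float32)

assert bool_output.size * bool_output.itemsize == 1000   # identity_bool output size
assert fp32_output.size * fp32_output.itemsize == 4000   # dlpack_identity output size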