#!/usr/bin/python

- # Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions

@@ -38,6 +38,7 @@
import os

from tritonclient.utils import *
+ import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
import tritonclient.http as httpclient

TEST_JETSON = bool(int(os.environ.get('TEST_JETSON', 0)))
@@ -61,6 +62,13 @@ def _infer_help(self, model_name, shape, data_type):
            output0 = result.as_numpy('OUTPUT0')
            self.assertTrue(np.all(input_data_0 == output0))

+    def _create_cuda_region(self, client, size, name):
+        shm0_handle = cuda_shared_memory.create_shared_memory_region(
+            name, byte_size=size, device_id=0)
+        client.register_cuda_shared_memory(
+            name, cuda_shared_memory.get_raw_handle(shm0_handle), 0, size)
+        return shm0_handle
+
    def _optional_input_infer(self, model_name, has_input0, has_input1):
        with httpclient.InferenceServerClient("localhost:8000") as client:
            shape = (1,)
@@ -151,6 +159,64 @@ def test_growth_error(self):
        with self._shm_leak_detector.Probe() as shm_probe:
            self._infer_help(model_name, shape, dtype)

+    # CUDA shared memory is not supported on Jetson
+    def test_gpu_tensor_error(self):
+        model_name = 'identity_bool'
+        with httpclient.InferenceServerClient("localhost:8000") as client:
+            input_data = np.array([[True] * 1000], dtype=bool)
+            inputs = [
+                httpclient.InferInput("INPUT0", input_data.shape,
+                                      np_to_triton_dtype(input_data.dtype))
+            ]
+            inputs[0].set_data_from_numpy(input_data)
+
+            requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')]
+
+            # Intentionally create a shared memory region that is too small
+            # to hold the output.
+            client.unregister_cuda_shared_memory()
+            shm0_handle = self._create_cuda_region(client, 1, 'output0_data')
+
+            requested_outputs[0].set_shared_memory('output0_data', 1)
+            with self.assertRaises(InferenceServerException) as ex:
+                client.infer(model_name, inputs, outputs=requested_outputs)
+            self.assertIn(
+                "should be at least 1000 bytes to hold the results",
+                str(ex.exception))
+            client.unregister_cuda_shared_memory()
+            cuda_shared_memory.destroy_shared_memory_region(shm0_handle)
+
+    def test_dlpack_tensor_error(self):
+        model_name = 'dlpack_identity'
+        with httpclient.InferenceServerClient("localhost:8000") as client:
+            input_data = np.array([[1] * 1000], dtype=np.float32)
+            inputs = [
+                httpclient.InferInput("INPUT0", input_data.shape,
+                                      np_to_triton_dtype(input_data.dtype))
+            ]
+
+            requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')]
+            input_data_size = input_data.itemsize * input_data.size
+            client.unregister_cuda_shared_memory()
+            input_region = self._create_cuda_region(client, input_data_size,
+                                                    'input0_data')
+            inputs[0].set_shared_memory('input0_data', input_data_size)
+            cuda_shared_memory.set_shared_memory_region(
+                input_region, [input_data])
+
+            # Intentionally create a small output region to trigger an error.
+            shm0_handle = self._create_cuda_region(client, 1, 'output0_data')
+            requested_outputs[0].set_shared_memory('output0_data', 1)
+
+            with self.assertRaises(InferenceServerException) as ex:
+                client.infer(model_name, inputs, outputs=requested_outputs)
+            self.assertIn(
+                "should be at least 4000 bytes to hold the results",
+                str(ex.exception))
+            client.unregister_cuda_shared_memory()
+            cuda_shared_memory.destroy_shared_memory_region(shm0_handle)
+
    def test_async_infer(self):
        model_name = "identity_uint8"
        request_parallelism = 4
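
Below is a minimal, self-contained sketch (not part of the change above) of the CUDA shared-memory round trip these new tests exercise: allocate regions on GPU 0, register them with the server, point the request at them, and clean up afterwards. The "identity_fp32" model name and the tensor shape are placeholders, and get_contents_as_numpy is the client utility assumed here for copying the output region back to host memory. If the output region were registered with fewer bytes than the result needs, client.infer would raise InferenceServerException, which is exactly what test_gpu_tensor_error and test_dlpack_tensor_error assert.

import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
from tritonclient.utils import np_to_triton_dtype

with httpclient.InferenceServerClient("localhost:8000") as client:
    # Placeholder model and data: a hypothetical identity model with one
    # FP32 input and one FP32 output of the same shape.
    input_data = np.ones((1, 16), dtype=np.float32)
    byte_size = input_data.size * input_data.itemsize

    # Allocate CUDA shared-memory regions on GPU 0 and register them
    # with the server.
    client.unregister_cuda_shared_memory()
    in_handle = cuda_shared_memory.create_shared_memory_region(
        "input0_data", byte_size=byte_size, device_id=0)
    out_handle = cuda_shared_memory.create_shared_memory_region(
        "output0_data", byte_size=byte_size, device_id=0)
    client.register_cuda_shared_memory(
        "input0_data", cuda_shared_memory.get_raw_handle(in_handle), 0,
        byte_size)
    client.register_cuda_shared_memory(
        "output0_data", cuda_shared_memory.get_raw_handle(out_handle), 0,
        byte_size)

    # Copy the input into the GPU region and point the request at both
    # registered regions instead of sending the tensors inline.
    cuda_shared_memory.set_shared_memory_region(in_handle, [input_data])
    inputs = [httpclient.InferInput("INPUT0", input_data.shape,
                                    np_to_triton_dtype(input_data.dtype))]
    inputs[0].set_shared_memory("input0_data", byte_size)
    outputs = [httpclient.InferRequestedOutput("OUTPUT0")]
    outputs[0].set_shared_memory("output0_data", byte_size)

    # If "output0_data" had been registered with fewer than byte_size
    # bytes, this call would raise InferenceServerException.
    client.infer("identity_fp32", inputs, outputs=outputs)
    output_data = cuda_shared_memory.get_contents_as_numpy(
        out_handle, np.float32, input_data.shape)

    # Clean up: unregister from the server and free the GPU regions.
    client.unregister_cuda_shared_memory()
    cuda_shared_memory.destroy_shared_memory_region(in_handle)
    cuda_shared_memory.destroy_shared_memory_region(out_handle)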