from tritonclient.utils import *
import tritonclient.http as httpclient

+TEST_JETSON = bool(int(os.environ.get('TEST_JETSON', 0)))
+


class PythonTest(tu.TestResultCollector):
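
As an aside, a minimal sketch (not part of the change) of how the new TEST_JETSON flag resolves; it assumes the Jetson test runner exports TEST_JETSON=1 in the environment, which makes the expression truthy and keeps the CUDA shared-memory tests below from being defined:

# Illustrative only: how the TEST_JETSON environment flag resolves.
import os

os.environ["TEST_JETSON"] = "1"   # assumed to be exported by the Jetson test runner
TEST_JETSON = bool(int(os.environ.get('TEST_JETSON', 0)))
print(TEST_JETSON)                # True -> the GPU / CUDA shared-memory tests are not defined
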
@@ -59,6 +61,14 @@ def _infer_help(self, model_name, shape, data_type):
            output0 = result.as_numpy('OUTPUT0')
            self.assertTrue(np.all(input_data_0 == output0))

+    def _create_cuda_region(self, client, size, name):
+        import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
+        shm0_handle = cuda_shared_memory.create_shared_memory_region(
+            name, byte_size=size, device_id=0)
+        client.register_cuda_shared_memory(
+            name, cuda_shared_memory.get_raw_handle(shm0_handle), 0, size)
+        return shm0_handle
+
    def _optional_input_infer(self, model_name, has_input0, has_input1):
        with httpclient.InferenceServerClient("localhost:8000") as client:
            shape = (1,)
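
To make the new helper's role concrete, here is a rough standalone sketch of the full CUDA shared-memory lifecycle whose first half (create and register) _create_cuda_region wraps. It assumes a running Triton server on localhost:8000 with a CUDA-capable GPU; the identity_fp32 model and the region name are illustrative, not taken from this change.

# Illustrative sketch only; requires a running Triton server and a GPU.
import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
from tritonclient.utils import np_to_triton_dtype

with httpclient.InferenceServerClient("localhost:8000") as client:
    input_data = np.ones((1, 16), dtype=np.float32)
    byte_size = input_data.size * input_data.itemsize  # 16 * 4 = 64 bytes

    # Create a CUDA shared-memory region on GPU 0 and register it with the
    # server (this is what _create_cuda_region does for the tests).
    handle = cuda_shared_memory.create_shared_memory_region(
        "output0_data", byte_size=byte_size, device_id=0)
    client.register_cuda_shared_memory(
        "output0_data", cuda_shared_memory.get_raw_handle(handle), 0, byte_size)

    inputs = [httpclient.InferInput("INPUT0", input_data.shape,
                                    np_to_triton_dtype(input_data.dtype))]
    inputs[0].set_data_from_numpy(input_data)
    outputs = [httpclient.InferRequestedOutput("OUTPUT0")]
    outputs[0].set_shared_memory("output0_data", byte_size)

    # The output is written into the CUDA region instead of the response body.
    client.infer("identity_fp32", inputs, outputs=outputs)
    output = cuda_shared_memory.get_contents_as_numpy(
        handle, np.float32, input_data.shape)

    # Tear down: unregister on the server, then free the region locally.
    client.unregister_cuda_shared_memory("output0_data")
    cuda_shared_memory.destroy_shared_memory_region(handle)
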
@@ -144,6 +154,69 @@ def test_growth_error(self):
        with self._shm_leak_detector.Probe() as shm_probe:
            self._infer_help(model_name, shape, dtype)

+    # GPU tensors are not supported on Jetson
+    # CUDA shared memory is not supported on Jetson
+    if not TEST_JETSON:
+
+        def test_gpu_tensor_error(self):
+            import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
+            model_name = 'identity_bool'
+            with httpclient.InferenceServerClient("localhost:8000") as client:
+                input_data = np.array([[True] * 1000], dtype=bool)
+                inputs = [
+                    httpclient.InferInput("INPUT0", input_data.shape,
+                                          np_to_triton_dtype(input_data.dtype))
+                ]
+                inputs[0].set_data_from_numpy(input_data)
+
+                requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')]
+
+                # Intentionally create a shared memory region that is too small.
+                client.unregister_cuda_shared_memory()
+                shm0_handle = self._create_cuda_region(client, 1,
+                                                       'output0_data')
+
+                requested_outputs[0].set_shared_memory('output0_data', 1)
+                with self.assertRaises(InferenceServerException) as ex:
+                    client.infer(model_name, inputs, outputs=requested_outputs)
+                self.assertIn(
+                    "should be at least 1000 bytes to hold the results",
+                    str(ex.exception))
+                client.unregister_cuda_shared_memory()
+                cuda_shared_memory.destroy_shared_memory_region(shm0_handle)
+
+        def test_dlpack_tensor_error(self):
+            import tritonclient.utils.cuda_shared_memory as cuda_shared_memory
+            model_name = 'dlpack_identity'
+            with httpclient.InferenceServerClient("localhost:8000") as client:
+                input_data = np.array([[1] * 1000], dtype=np.float32)
+                inputs = [
+                    httpclient.InferInput("INPUT0", input_data.shape,
+                                          np_to_triton_dtype(input_data.dtype))
+                ]
+
+                requested_outputs = [httpclient.InferRequestedOutput('OUTPUT0')]
+                input_data_size = input_data.itemsize * input_data.size
+                client.unregister_cuda_shared_memory()
+                input_region = self._create_cuda_region(client, input_data_size,
+                                                        'input0_data')
+                inputs[0].set_shared_memory('input0_data', input_data_size)
+                cuda_shared_memory.set_shared_memory_region(
+                    input_region, [input_data])
+
+                # Intentionally create a small region to trigger an error
+                shm0_handle = self._create_cuda_region(client, 1,
+                                                       'output0_data')
+                requested_outputs[0].set_shared_memory('output0_data', 1)
+
+                with self.assertRaises(InferenceServerException) as ex:
+                    client.infer(model_name, inputs, outputs=requested_outputs)
+                self.assertIn(
+                    "should be at least 4000 bytes to hold the results",
+                    str(ex.exception))
+                client.unregister_cuda_shared_memory()
+                cuda_shared_memory.destroy_shared_memory_region(shm0_handle)
+
    def test_async_infer(self):
        model_name = "identity_uint8"
        request_parallelism = 4
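
As a quick sanity check on the byte counts asserted by the two new tests above (a sketch, not part of the change): the BOOL identity output is 1000 elements of 1 byte each and the FP32 output is 1000 elements of 4 bytes each, which is why a 1-byte output region is rejected with the "should be at least 1000 bytes" and "should be at least 4000 bytes" messages.

# Illustrative arithmetic behind the asserted error messages.
import numpy as np

bool_output = np.array([[True] * 1000], dtype=bool)
fp32_output = np.array([[1] * 1000], dtype=np.float32)

assert bool_output.size * bool_output.itemsize == 1000   # identity_bool output size
assert fp32_output.size * fp32_output.itemsize == 4000   # dlpack_identity output size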