From 6815fb2f4073b896b665334fd3b3a6e6ff4d0370 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 27 Oct 2023 14:04:13 -0700 Subject: [PATCH 1/2] Improve L0_backend_python ensemble subtest stability --- .../ensemble/ensemble_test.py | 41 +++++-------------- qa/L0_backend_python/ensemble/test.sh | 7 +--- qa/python_models/add_sub/config.pbtxt | 1 - 3 files changed, 12 insertions(+), 37 deletions(-) diff --git a/qa/L0_backend_python/ensemble/ensemble_test.py b/qa/L0_backend_python/ensemble/ensemble_test.py index 64ddc3816f..9fb60e5a4e 100755 --- a/qa/L0_backend_python/ensemble/ensemble_test.py +++ b/qa/L0_backend_python/ensemble/ensemble_test.py @@ -43,8 +43,7 @@ class EnsembleTest(tu.TestResultCollector): def setUp(self): self._shm_leak_detector = shm_util.ShmLeakDetector() - def test_ensemble(self): - model_name = "ensemble" + def infer(self, model_name): shape = [16] with self._shm_leak_detector.Probe() as shm_probe: with httpclient.InferenceServerClient("localhost:8000") as client: @@ -70,36 +69,18 @@ def test_ensemble(self): self.assertIsNotNone(output0) self.assertIsNotNone(output1) - self.assertTrue(np.allclose(output0, 2 * input_data_0)) - self.assertTrue(np.allclose(output1, 2 * input_data_1)) + # Set a big enough tolerance to reduce intermittence. May be + # better to test integer outputs in the future for consistency. + self.assertTrue(np.allclose(output0, 2 * input_data_0, atol=1e-06)) + self.assertTrue(np.allclose(output1, 2 * input_data_1, atol=1e-06)) - model_name = "ensemble_gpu" - with self._shm_leak_detector.Probe() as shm_probe: - with httpclient.InferenceServerClient("localhost:8000") as client: - input_data_0 = np.random.random(shape).astype(np.float32) - input_data_1 = np.random.random(shape).astype(np.float32) - inputs = [ - httpclient.InferInput( - "INPUT0", - input_data_0.shape, - np_to_triton_dtype(input_data_0.dtype), - ), - httpclient.InferInput( - "INPUT1", - input_data_1.shape, - np_to_triton_dtype(input_data_1.dtype), - ), - ] - inputs[0].set_data_from_numpy(input_data_0) - inputs[1].set_data_from_numpy(input_data_1) - result = client.infer(model_name, inputs) - output0 = result.as_numpy("OUTPUT0") - output1 = result.as_numpy("OUTPUT1") - self.assertIsNotNone(output0) - self.assertIsNotNone(output1) + def test_ensemble(self): + model_name = "ensemble" + self.infer(model_name) - self.assertTrue(np.allclose(output0, 2 * input_data_0)) - self.assertTrue(np.allclose(output1, 2 * input_data_1)) + def test_ensemble_gpu(self): + model_name = "ensemble_gpu" + self.infer(model_name) if __name__ == "__main__": diff --git a/qa/L0_backend_python/ensemble/test.sh b/qa/L0_backend_python/ensemble/test.sh index 961ff16e5b..a72a375557 100755 --- a/qa/L0_backend_python/ensemble/test.sh +++ b/qa/L0_backend_python/ensemble/test.sh @@ -25,9 +25,8 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-CLIENT_PY=./lifecycle_test.py CLIENT_LOG="./ensemble_client.log" -EXPECTED_NUM_TESTS="1" +EXPECTED_NUM_TESTS="2" TEST_RESULT_FILE='test_results.txt' source ../common.sh source ../../common/util.sh @@ -47,14 +46,10 @@ cp ../../python_models/ensemble/config.pbtxt ./models/ensemble mkdir -p models/add_sub_1/1/ cp ../../python_models/add_sub/config.pbtxt ./models/add_sub_1 -(cd models/add_sub_1 && \ - sed -i "s/^name:.*/name: \"add_sub_1\"/" config.pbtxt) cp ../../python_models/add_sub/model.py ./models/add_sub_1/1/ mkdir -p models/add_sub_2/1/ cp ../../python_models/add_sub/config.pbtxt ./models/add_sub_2/ -(cd models/add_sub_2 && \ - sed -i "s/^name:.*/name: \"add_sub_2\"/" config.pbtxt) cp ../../python_models/add_sub/model.py ./models/add_sub_2/1/ # Ensemble GPU Model diff --git a/qa/python_models/add_sub/config.pbtxt b/qa/python_models/add_sub/config.pbtxt index b0805c0089..39bd6771d0 100644 --- a/qa/python_models/add_sub/config.pbtxt +++ b/qa/python_models/add_sub/config.pbtxt @@ -24,7 +24,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -name: "add_sub" backend: "python" input [ From 70606bb0cf59258d0fc9a7f751ab2d68e221f18c Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 27 Oct 2023 14:24:05 -0700 Subject: [PATCH 2/2] Test torch allocator gpu memory directly rather than global gpu memory for more consistency --- qa/python_models/bls/model.py | 75 +++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 17 deletions(-) diff --git a/qa/python_models/bls/model.py b/qa/python_models/bls/model.py index 69c0d2740b..024bbbe550 100644 --- a/qa/python_models/bls/model.py +++ b/qa/python_models/bls/model.py @@ -406,11 +406,19 @@ def test_zero_length_io(self): output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") self.assertTrue(np.all(output0 == input0)) - def test_bls_tensor_lifecycle(self): + def cuda_memory_stats(self): + allocated_bytes = torch.cuda.memory_allocated() + reserved_bytes = torch.cuda.memory_reserved() + return allocated_bytes, reserved_bytes + + def bls_tensor_lifecycle_helper(self): model_name = "dlpack_identity" + verbose = True # A 10 MB tensor. input_size = 10 * 1024 * 1024 + input_type_size_bytes = 4 # TYPE_FP32 + input_size_bytes = input_size * input_type_size_bytes # Sending the tensor 50 times to test whether the deallocation is # happening correctly. If the deallocation doesn't happen correctly, @@ -438,26 +446,43 @@ def test_bls_tensor_lifecycle(self): output0.as_numpy(), input0, "BLS CPU memory lifecycle failed." ) + # Show total memory stats before gpu tensor test + print(torch.cuda.memory_summary()) + # Checking the same with the GPU tensors. 
for index in range(50): input0 = None infer_request = None input0_pb = None + fail_msg = f"GPU memory lifecycle test failed at index: {index}" torch.cuda.empty_cache() - free_memory, _ = torch.cuda.mem_get_info() - if index == 1: - recorded_memory = free_memory - - if index > 1: - self.assertEqual( - free_memory, - recorded_memory, - "GPU memory lifecycle test failed at index: " + str(index), - ) + alloced, cached = self.cuda_memory_stats() + + # Check cuda memory usage is cleaned up (empty) between iterations + # when device tensors go out of scope + self.assertEqual(alloced, 0, fail_msg) + # Check that cache is properly cleaned up when emptied + self.assertEqual(cached, 0, fail_msg) + + if verbose: + # NOTE: this reflects total gpu memory usage, and may be affected + # by other processes, so don't use it for direct checks but log it + # for debugging/context. + free_memory, total_memory = torch.cuda.mem_get_info() + used_memory = total_memory - free_memory + print(f"[DEBUG][Iteration {index}][GPU] {used_memory=} bytes") input0 = torch.ones([1, input_size], dtype=torch.float32).to("cuda") input0_pb = pb_utils.Tensor.from_dlpack("INPUT0", to_dlpack(input0)) + # Check cuda memory usage after creating device tensor + alloced, _ = self.cuda_memory_stats() + self.assertEqual( + alloced, + input_size_bytes, + "Expected precise byte allocation after input tensor creation", + ) + infer_request = pb_utils.InferenceRequest( model_name=model_name, inputs=[input0_pb], @@ -477,6 +502,14 @@ def test_bls_tensor_lifecycle(self): output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") output0_pytorch = from_dlpack(output0.to_dlpack()) + # Stats after getting output tensor + alloced, _ = self.cuda_memory_stats() + self.assertEqual( + alloced, + input_size_bytes, + "Expected only input allocation, as output zero-copies input tensor", + ) + # Set inference response and output0_pytorch to None, to make sure # that the DLPack is still valid. output0 = None @@ -486,12 +519,18 @@ def test_bls_tensor_lifecycle(self): f"input ({input0}) and output ({output0_pytorch}) didn't match for identity model.", ) - # We are seeing intermittent failures in the GPU memory lifecycle - # test where the free memory is not the same as the recorded memory. - # It is suspected that this is due to the Python garbage collector - # not releasing the memory immediately. Calling the garbage - # collector here to make sure that the memory is cleaned up. - collected = gc.collect() + print(torch.cuda.memory_summary()) + + def assert_cuda_memory_empty(self, msg): + torch.cuda.empty_cache() + alloced, cached = self.cuda_memory_stats() + self.assertEqual(alloced, 0, msg) + self.assertEqual(cached, 0, msg) + + def test_bls_tensor_lifecycle(self): + self.assert_cuda_memory_empty("Expected all gpu memory cleaned up before test") + self.bls_tensor_lifecycle_helper() + self.assert_cuda_memory_empty("Expected all gpu memory cleaned up after test") def _test_gpu_bls_add_sub(self, is_input0_gpu, is_input1_gpu, is_decoupled=False): input0 = torch.rand(16) @@ -738,6 +777,8 @@ def execute(self, requests): for _ in requests: # Run the unittest and store the results in InferenceResponse. test = unittest.main("model", exit=False) + for test_case, traceback in test.result.failures: + print(f"{test_case} failed:\n{traceback}") responses.append( pb_utils.InferenceResponse( [
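For context on the second patch: a minimal, standalone sketch of the allocator-stats pattern it relies on, assuming PyTorch with a CUDA device available; the assert_allocator_empty helper below is illustrative and not part of the patch itself.

import torch

def assert_allocator_empty(msg):
    # Release cached blocks so memory_reserved() also drops to zero
    # once no tensors are alive.
    torch.cuda.empty_cache()
    assert torch.cuda.memory_allocated() == 0, msg
    assert torch.cuda.memory_reserved() == 0, msg

# Allocator stats cover only this process's torch allocations, so they are
# stable across runs; torch.cuda.mem_get_info() reports device-wide usage
# and can fluctuate with other processes or the CUDA context itself.
assert_allocator_empty("expected no live tensors before the check")

x = torch.ones(10 * 1024 * 1024, dtype=torch.float32, device="cuda")
# 10 Mi FP32 elements -> 40 MiB, a multiple of the caching allocator's
# 512-byte block size, so memory_allocated() reports the exact figure.
assert torch.cuda.memory_allocated() == 10 * 1024 * 1024 * 4

del x
assert_allocator_empty("tensor should be freed once it goes out of scope")

This is the same reasoning behind asserting that allocated memory is zero between iterations after empty_cache() in the patched bls_tensor_lifecycle_helper, rather than comparing free device memory snapshots across iterations.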