From 6815fb2f4073b896b665334fd3b3a6e6ff4d0370 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 27 Oct 2023 14:04:13 -0700 Subject: [PATCH 1/2] Improve L0_backend_python ensemble subtest stability --- .../ensemble/ensemble_test.py | 41 +++++-------------- qa/L0_backend_python/ensemble/test.sh | 7 +--- qa/python_models/add_sub/config.pbtxt | 1 - 3 files changed, 12 insertions(+), 37 deletions(-) diff --git a/qa/L0_backend_python/ensemble/ensemble_test.py b/qa/L0_backend_python/ensemble/ensemble_test.py index 64ddc3816f..9fb60e5a4e 100755 --- a/qa/L0_backend_python/ensemble/ensemble_test.py +++ b/qa/L0_backend_python/ensemble/ensemble_test.py @@ -43,8 +43,7 @@ class EnsembleTest(tu.TestResultCollector): def setUp(self): self._shm_leak_detector = shm_util.ShmLeakDetector() - def test_ensemble(self): - model_name = "ensemble" + def infer(self, model_name): shape = [16] with self._shm_leak_detector.Probe() as shm_probe: with httpclient.InferenceServerClient("localhost:8000") as client: @@ -70,36 +69,18 @@ def test_ensemble(self): self.assertIsNotNone(output0) self.assertIsNotNone(output1) - self.assertTrue(np.allclose(output0, 2 * input_data_0)) - self.assertTrue(np.allclose(output1, 2 * input_data_1)) + # Set a big enough tolerance to reduce intermittence. May be + # better to test integer outputs in the future for consistency. + self.assertTrue(np.allclose(output0, 2 * input_data_0, atol=1e-06)) + self.assertTrue(np.allclose(output1, 2 * input_data_1, atol=1e-06)) - model_name = "ensemble_gpu" - with self._shm_leak_detector.Probe() as shm_probe: - with httpclient.InferenceServerClient("localhost:8000") as client: - input_data_0 = np.random.random(shape).astype(np.float32) - input_data_1 = np.random.random(shape).astype(np.float32) - inputs = [ - httpclient.InferInput( - "INPUT0", - input_data_0.shape, - np_to_triton_dtype(input_data_0.dtype), - ), - httpclient.InferInput( - "INPUT1", - input_data_1.shape, - np_to_triton_dtype(input_data_1.dtype), - ), - ] - inputs[0].set_data_from_numpy(input_data_0) - inputs[1].set_data_from_numpy(input_data_1) - result = client.infer(model_name, inputs) - output0 = result.as_numpy("OUTPUT0") - output1 = result.as_numpy("OUTPUT1") - self.assertIsNotNone(output0) - self.assertIsNotNone(output1) + def test_ensemble(self): + model_name = "ensemble" + self.infer(model_name) - self.assertTrue(np.allclose(output0, 2 * input_data_0)) - self.assertTrue(np.allclose(output1, 2 * input_data_1)) + def test_ensemble_gpu(self): + model_name = "ensemble_gpu" + self.infer(model_name) if __name__ == "__main__": diff --git a/qa/L0_backend_python/ensemble/test.sh b/qa/L0_backend_python/ensemble/test.sh index 961ff16e5b..a72a375557 100755 --- a/qa/L0_backend_python/ensemble/test.sh +++ b/qa/L0_backend_python/ensemble/test.sh @@ -25,9 +25,8 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-CLIENT_PY=./lifecycle_test.py CLIENT_LOG="./ensemble_client.log" -EXPECTED_NUM_TESTS="1" +EXPECTED_NUM_TESTS="2" TEST_RESULT_FILE='test_results.txt' source ../common.sh source ../../common/util.sh @@ -47,14 +46,10 @@ cp ../../python_models/ensemble/config.pbtxt ./models/ensemble mkdir -p models/add_sub_1/1/ cp ../../python_models/add_sub/config.pbtxt ./models/add_sub_1 -(cd models/add_sub_1 && \ - sed -i "s/^name:.*/name: \"add_sub_1\"/" config.pbtxt) cp ../../python_models/add_sub/model.py ./models/add_sub_1/1/ mkdir -p models/add_sub_2/1/ cp ../../python_models/add_sub/config.pbtxt ./models/add_sub_2/ -(cd models/add_sub_2 && \ - sed -i "s/^name:.*/name: \"add_sub_2\"/" config.pbtxt) cp ../../python_models/add_sub/model.py ./models/add_sub_2/1/ # Ensemble GPU Model diff --git a/qa/python_models/add_sub/config.pbtxt b/qa/python_models/add_sub/config.pbtxt index b0805c0089..39bd6771d0 100644 --- a/qa/python_models/add_sub/config.pbtxt +++ b/qa/python_models/add_sub/config.pbtxt @@ -24,7 +24,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -name: "add_sub" backend: "python" input [ From 70606bb0cf59258d0fc9a7f751ab2d68e221f18c Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 27 Oct 2023 14:24:05 -0700 Subject: [PATCH 2/2] Test torch allocator gpu memory directly rather than global gpu memory for more consistency --- qa/python_models/bls/model.py | 75 +++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 17 deletions(-) diff --git a/qa/python_models/bls/model.py b/qa/python_models/bls/model.py index 69c0d2740b..024bbbe550 100644 --- a/qa/python_models/bls/model.py +++ b/qa/python_models/bls/model.py @@ -406,11 +406,19 @@ def test_zero_length_io(self): output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") self.assertTrue(np.all(output0 == input0)) - def test_bls_tensor_lifecycle(self): + def cuda_memory_stats(self): + allocated_bytes = torch.cuda.memory_allocated() + reserved_bytes = torch.cuda.memory_reserved() + return allocated_bytes, reserved_bytes + + def bls_tensor_lifecycle_helper(self): model_name = "dlpack_identity" + verbose = True # A 10 MB tensor. input_size = 10 * 1024 * 1024 + input_type_size_bytes = 4 # TYPE_FP32 + input_size_bytes = input_size * input_type_size_bytes # Sending the tensor 50 times to test whether the deallocation is # happening correctly. If the deallocation doesn't happen correctly, @@ -438,26 +446,43 @@ def test_bls_tensor_lifecycle(self): output0.as_numpy(), input0, "BLS CPU memory lifecycle failed." ) + # Show total memory stats before gpu tensor test + print(torch.cuda.memory_summary()) + # Checking the same with the GPU tensors. 
for index in range(50): input0 = None infer_request = None input0_pb = None + fail_msg = f"GPU memory lifecycle test failed at index: {index}" torch.cuda.empty_cache() - free_memory, _ = torch.cuda.mem_get_info() - if index == 1: - recorded_memory = free_memory - - if index > 1: - self.assertEqual( - free_memory, - recorded_memory, - "GPU memory lifecycle test failed at index: " + str(index), - ) + alloced, cached = self.cuda_memory_stats() + + # Check cuda memory usage is cleaned up (empty) between iterations + # when device tensors go out of scope + self.assertEqual(alloced, 0, fail_msg) + # Check that cache is properly cleaned up when emptied + self.assertEqual(cached, 0, fail_msg) + + if verbose: + # NOTE: this reflects total gpu memory usage, and may be affected + # by other processes, so don't use it for direct checks but log it + # for debugging/context. + free_memory, total_memory = torch.cuda.mem_get_info() + used_memory = total_memory - free_memory + print(f"[DEBUG][Iteration {index}][GPU] {used_memory=} bytes") input0 = torch.ones([1, input_size], dtype=torch.float32).to("cuda") input0_pb = pb_utils.Tensor.from_dlpack("INPUT0", to_dlpack(input0)) + # Check cuda memory usage after creating device tensor + alloced, _ = self.cuda_memory_stats() + self.assertEqual( + alloced, + input_size_bytes, + "Expected precise byte allocation after input tensor creation", + ) + infer_request = pb_utils.InferenceRequest( model_name=model_name, inputs=[input0_pb], @@ -477,6 +502,14 @@ def test_bls_tensor_lifecycle(self): output0 = pb_utils.get_output_tensor_by_name(infer_response, "OUTPUT0") output0_pytorch = from_dlpack(output0.to_dlpack()) + # Stats after getting output tensor + alloced, _ = self.cuda_memory_stats() + self.assertEqual( + alloced, + input_size_bytes, + "Expected only input allocation, as output zero-copies input tensor", + ) + # Set inference response and output0_pytorch to None, to make sure # that the DLPack is still valid. output0 = None @@ -486,12 +519,18 @@ def test_bls_tensor_lifecycle(self): f"input ({input0}) and output ({output0_pytorch}) didn't match for identity model.", ) - # We are seeing intermittent failures in the GPU memory lifecycle - # test where the free memory is not the same as the recorded memory. - # It is suspected that this is due to the Python garbage collector - # not releasing the memory immediately. Calling the garbage - # collector here to make sure that the memory is cleaned up. - collected = gc.collect() + print(torch.cuda.memory_summary()) + + def assert_cuda_memory_empty(self, msg): + torch.cuda.empty_cache() + alloced, cached = self.cuda_memory_stats() + self.assertEqual(alloced, 0, msg) + self.assertEqual(cached, 0, msg) + + def test_bls_tensor_lifecycle(self): + self.assert_cuda_memory_empty("Expected all gpu memory cleaned up before test") + self.bls_tensor_lifecycle_helper() + self.assert_cuda_memory_empty("Expected all gpu memory cleaned up after test") def _test_gpu_bls_add_sub(self, is_input0_gpu, is_input1_gpu, is_decoupled=False): input0 = torch.rand(16) @@ -738,6 +777,8 @@ def execute(self, requests): for _ in requests: # Run the unittest and store the results in InferenceResponse. test = unittest.main("model", exit=False) + for test_case, traceback in test.result.failures: + print(f"{test_case} failed:\n{traceback}") responses.append( pb_utils.InferenceResponse( [
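For context on the second patch: a minimal, standalone sketch of the allocator-stats pattern it relies on, assuming PyTorch with a CUDA device available; the assert_allocator_empty helper below is illustrative and not part of the patch itself.

import torch

def assert_allocator_empty(msg):
    # Release cached blocks so memory_reserved() also drops to zero
    # once no tensors are alive.
    torch.cuda.empty_cache()
    assert torch.cuda.memory_allocated() == 0, msg
    assert torch.cuda.memory_reserved() == 0, msg

# Allocator stats cover only this process's torch allocations, so they are
# stable across runs; torch.cuda.mem_get_info() reports device-wide usage
# and can fluctuate with other processes or the CUDA context itself.
assert_allocator_empty("expected no live tensors before the check")

x = torch.ones(10 * 1024 * 1024, dtype=torch.float32, device="cuda")
# 10 Mi FP32 elements -> 40 MiB, a multiple of the caching allocator's
# 512-byte block size, so memory_allocated() reports the exact figure.
assert torch.cuda.memory_allocated() == 10 * 1024 * 1024 * 4

del x
assert_allocator_empty("tensor should be freed once it goes out of scope")

This is the same reasoning behind asserting that allocated memory is zero between iterations after empty_cache() in the patched bls_tensor_lifecycle_helper, rather than comparing free device memory snapshots across iterations.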