From 33e6cddb88ff3ea30fc6028e311decee3d5af9b7 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Fri, 4 Aug 2023 16:51:45 -0700 Subject: [PATCH 1/7] Add test when unload/load requests for same model received the same time --- qa/L0_lifecycle/lifecycle_test.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/qa/L0_lifecycle/lifecycle_test.py b/qa/L0_lifecycle/lifecycle_test.py index 1caffb8f56..eec7aa527f 100755 --- a/qa/L0_lifecycle/lifecycle_test.py +++ b/qa/L0_lifecycle/lifecycle_test.py @@ -2885,6 +2885,21 @@ def test_concurrent_load_unload(self): self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) self.assertFalse(triton_client.is_model_ready("identity_zero_1_int32")) + # Unload identity_zero_1_int32 and immediately load it + # The model can either be loaded or unloaded but server must not crash + triton_client.load_model("identity_zero_1_int32") + self.assertTrue(triton_client.is_model_ready("identity_zero_1_int32")) + with concurrent.futures.ThreadPoolExecutor() as pool: + unload_thread = pool.submit( + triton_client.unload_model, "identity_zero_1_int32" + ) + load_thread = pool.submit(triton_client.load_model, "identity_zero_1_int32") + unload_thread.result() + load_thread.result() + triton_client.unload_model("identity_zero_1_int32") # to unload if loaded + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + triton_client.is_model_ready("identity_zero_1_int32") # Load ensemble_zero_1_float32 and unload its dependency while loading # The unload operation should wait until the load is completed with concurrent.futures.ThreadPoolExecutor() as pool: From f072579f7a2569f49a419cf7fb88108a6fecd241 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Mon, 7 Aug 2023 18:09:27 -0700 Subject: [PATCH 2/7] Add test_same_model_overlapping_load_unload --- qa/L0_lifecycle/lifecycle_test.py | 56 ++++++++++++++++++++++--------- qa/L0_lifecycle/test.sh | 31 +++++++++++++++++ 2 files changed, 72 insertions(+), 15 deletions(-) diff --git a/qa/L0_lifecycle/lifecycle_test.py b/qa/L0_lifecycle/lifecycle_test.py index eec7aa527f..8ab9de5030 100755 --- a/qa/L0_lifecycle/lifecycle_test.py +++ b/qa/L0_lifecycle/lifecycle_test.py @@ -2885,21 +2885,6 @@ def test_concurrent_load_unload(self): self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) self.assertFalse(triton_client.is_model_ready("identity_zero_1_int32")) - # Unload identity_zero_1_int32 and immediately load it - # The model can either be loaded or unloaded but server must not crash - triton_client.load_model("identity_zero_1_int32") - self.assertTrue(triton_client.is_model_ready("identity_zero_1_int32")) - with concurrent.futures.ThreadPoolExecutor() as pool: - unload_thread = pool.submit( - triton_client.unload_model, "identity_zero_1_int32" - ) - load_thread = pool.submit(triton_client.load_model, "identity_zero_1_int32") - unload_thread.result() - load_thread.result() - triton_client.unload_model("identity_zero_1_int32") # to unload if loaded - self.assertTrue(triton_client.is_server_live()) - self.assertTrue(triton_client.is_server_ready()) - triton_client.is_model_ready("identity_zero_1_int32") # Load ensemble_zero_1_float32 and unload its dependency while loading # The unload operation should wait until the load is completed with concurrent.futures.ThreadPoolExecutor() as pool: @@ -2931,6 +2916,47 @@ def 
test_concurrent_load_unload(self):
         for model_name in model_names:
             self.assertEqual(is_load, triton_client.is_model_ready(model_name))

+    def test_same_model_overlapping_load_unload(self):
+        try:
+            triton_client = grpcclient.InferenceServerClient(
+                "localhost:8001", verbose=True
+            )
+        except Exception as ex:
+            self.assertTrue(False, "unexpected error {}".format(ex))
+        model_name = "python_identity_model"
+        # Success of this test requires the correct order of unload and load
+        # that cannot be reliably controlled, so allowing some retries can
+        # minimize the chance of an undetermined test result.
+        max_trials = 2
+        for i in range(max_trials):
+            # Start with model loaded
+            triton_client.load_model(model_name)
+            self.assertTrue(triton_client.is_model_ready(model_name))
+            # Unload the model and then immediately load it
+            with concurrent.futures.ThreadPoolExecutor() as pool:
+                unload_thread = pool.submit(triton_client.unload_model, model_name)
+                load_thread = pool.submit(triton_client.load_model, model_name)
+                unload_thread.result()
+                load_thread.result()
+            self.assertTrue(triton_client.is_server_live())
+            self.assertTrue(triton_client.is_server_ready())
+            # Poll for unload, in case the unload happens after the load
+            poll_interval = 1  # seconds
+            poll_timeout = 16  # seconds
+            poll_max_steps = poll_timeout / poll_interval
+            poll_steps = 0
+            model_loaded = triton_client.is_model_ready(model_name)
+            while poll_steps < poll_max_steps and model_loaded:
+                time.sleep(poll_interval)
+                poll_steps += 1
+                model_loaded = triton_client.is_model_ready(model_name)
+            # Make sure the model is loaded, which implies the load happened after the unload
+            if model_loaded:
+                # Test passed
+                return
+        # Test result is undetermined
+        self.assertTrue(False, "Cannot overlap a load with an unload within max trials")
+

 if __name__ == "__main__":
     unittest.main()
diff --git a/qa/L0_lifecycle/test.sh b/qa/L0_lifecycle/test.sh
index ab12c1c7b8..4ad33baba0 100755
--- a/qa/L0_lifecycle/test.sh
+++ b/qa/L0_lifecycle/test.sh
@@ -1824,6 +1824,37 @@ set -e
 kill $SERVER_PID
 wait $SERVER_PID

+LOG_IDX=$((LOG_IDX+1))
+
+# LifeCycleTest.test_same_model_overlapping_load_unload
+rm -rf models
+mkdir models
+cp -r ../python_models/identity_fp32 models/python_identity_model && \
+    (cd models/python_identity_model && \
+    mkdir 1 && mv model.py 1 && \
+    sed -i "s/identity_fp32/python_identity_model/" config.pbtxt)
+
+SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit --log-verbose=2"
+SERVER_LOG="./inference_server_$LOG_IDX.log"
+run_server
+if [ "$SERVER_PID" == "0" ]; then
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    cat $SERVER_LOG
+    exit 1
+fi
+
+set +e
+python $LC_TEST LifeCycleTest.test_same_model_overlapping_load_unload >>$CLIENT_LOG 2>&1
+if [ $?
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Failed\n***" + RET=1 +fi +set -e + +kill $SERVER_PID +wait $SERVER_PID + if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" fi From 0ae7a96348035040e481bb03a02f42eba580735e Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Wed, 9 Aug 2023 18:23:46 -0700 Subject: [PATCH 3/7] Use a load/unload stress test instead --- qa/L0_lifecycle/lifecycle_test.py | 83 ++++++++++++++++++------------- qa/L0_lifecycle/test.sh | 14 +++--- 2 files changed, 56 insertions(+), 41 deletions(-) diff --git a/qa/L0_lifecycle/lifecycle_test.py b/qa/L0_lifecycle/lifecycle_test.py index 8ab9de5030..e2d3752a98 100755 --- a/qa/L0_lifecycle/lifecycle_test.py +++ b/qa/L0_lifecycle/lifecycle_test.py @@ -2916,46 +2916,61 @@ def test_concurrent_load_unload(self): for model_name in model_names: self.assertEqual(is_load, triton_client.is_model_ready(model_name)) - def test_same_model_overlapping_load_unload(self): + def test_load_unload_same_model_stress(self): + model_name = "identity_zero_1_int32" + num_threads = 16 + num_iterations = 1024 try: triton_client = grpcclient.InferenceServerClient( "localhost:8001", verbose=True ) except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) - model_name = "python_identity_model" - # Success of this test requires the correct order of unload and load - # that cannot be reliably controlled, so allowing some retries can - # minimize the chance of undetermined test result. - max_trials = 2 - for i in range(max_trials): - # Start with model loaded - triton_client.load_model(model_name) - self.assertTrue(triton_client.is_model_ready(model_name)) - # Unload the model and then immediately load it - with concurrent.futures.ThreadPoolExecutor() as pool: - unload_thread = pool.submit(triton_client.unload_model, model_name) - load_thread = pool.submit(triton_client.load_model, model_name) - unload_thread.result() - load_thread.result() - self.assertTrue(triton_client.is_server_live()) - self.assertTrue(triton_client.is_server_ready()) - # Poll for unload, in-case unload happen after load - poll_interval = 1 # seconds - poll_timeout = 16 # seconds - poll_max_steps = poll_timeout / poll_interval - poll_steps = 0 - model_loaded = triton_client.is_model_ready(model_name) - while poll_steps < poll_max_steps and model_loaded: - time.sleep(poll_interval) - poll_steps += 1 - model_loaded = triton_client.is_model_ready(model_name) - # Make sure model is loaded, which implies load happen after unload - if model_loaded: - # Test passed - return - # Test result is undetermined - self.assertTrue(False, "Cannot overlap a load with an unload in max trails") + load_unload_exceptions = {"load_before_unload_finish_count": 0} + + def _load_unload(): + for i in range(num_iterations): + try: + triton_client.load_model(model_name) + except InferenceServerException as ex: + # Acceptable for an unload to happen after a load completes, but + # before the load can verify its load state. 
+ fail_reasons = [ + "unexpected miss in global map", + "no version is available", + "failed to poll from model repository", + ] + fail_messages = [ + ("failed to load '" + model_name + "', " + reason) + for reason in fail_reasons + ] + self.assertIn(ex.message(), fail_messages) + try: + triton_client.unload_model(model_name) + except InferenceServerException as ex: + # Acceptable for a load to happen during an async unload + self.assertEqual( + ex.message(), + "failed to unload '" + + model_name + + "', versions that are still available: 1", + ) + load_unload_exceptions["load_before_unload_finish_count"] += 1 + + with concurrent.futures.ThreadPoolExecutor() as pool: + threads = [] + for i in range(num_threads): + threads.append(pool.submit(_load_unload)) + for t in threads: + t.result() + + self.assertTrue(triton_client.is_server_live()) + self.assertTrue(triton_client.is_server_ready()) + self.assertGreater( + load_unload_exceptions["load_before_unload_finish_count"], + 0, + "The test case did not replicate a load while async unloading. Consider increase concurrency.", + ) if __name__ == "__main__": diff --git a/qa/L0_lifecycle/test.sh b/qa/L0_lifecycle/test.sh index 4ad33baba0..20a8ef955d 100755 --- a/qa/L0_lifecycle/test.sh +++ b/qa/L0_lifecycle/test.sh @@ -1826,15 +1826,15 @@ wait $SERVER_PID LOG_IDX=$((LOG_IDX+1)) -# LifeCycleTest.test_same_model_overlapping_load_unload +# LifeCycleTest.test_load_unload_same_model_stress rm -rf models mkdir models -cp -r ../python_models/identity_fp32 models/python_identity_model && \ - (cd models/python_identity_model && \ - mkdir 1 && mv model.py 1 && \ - sed -i "s/identity_fp32/python_identity_model/" config.pbtxt) +cp -r identity_zero_1_int32 models && \ + (cd models/identity_zero_1_int32 && \ + mkdir 1 && \ + sed -i "s/string_value: \"10\"/string_value: \"0\"/" config.pbtxt) -SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit --log-verbose=2" +SERVER_ARGS="--model-repository=`pwd`/models --model-control-mode=explicit --model-load-thread-count=16 --log-verbose=2" SERVER_LOG="./inference_server_$LOG_IDX.log" run_server if [ "$SERVER_PID" == "0" ]; then @@ -1844,7 +1844,7 @@ if [ "$SERVER_PID" == "0" ]; then fi set +e -python $LC_TEST LifeCycleTest.test_same_model_overlapping_load_unload >>$CLIENT_LOG 2>&1 +python $LC_TEST LifeCycleTest.test_load_unload_same_model_stress >>$CLIENT_LOG 2>&1 if [ $? 
-ne 0 ]; then cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" From 45176495b2549cfcacef116868419d8aa23a15c1 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 10 Aug 2023 10:59:51 -0700 Subject: [PATCH 4/7] Pre-merge test name update --- qa/L0_lifecycle/lifecycle_test.py | 8 ++++---- qa/L0_lifecycle/test.sh | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/qa/L0_lifecycle/lifecycle_test.py b/qa/L0_lifecycle/lifecycle_test.py index e2d3752a98..f45b466bb9 100755 --- a/qa/L0_lifecycle/lifecycle_test.py +++ b/qa/L0_lifecycle/lifecycle_test.py @@ -2789,7 +2789,7 @@ def test_load_gpu_limit(self): except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) - def test_concurrent_load_speedup(self): + def test_concurrent_model_load_speedup(self): # Initialize client try: triton_client = grpcclient.InferenceServerClient( @@ -2835,7 +2835,7 @@ def test_concurrent_load_speedup(self): for model_name in model_pair: self.assertTrue(triton_client.is_model_ready(model_name)) - def test_concurrent_load(self): + def test_concurrent_model_load(self): # Initialize client try: triton_client = grpcclient.InferenceServerClient( @@ -2864,7 +2864,7 @@ def test_concurrent_load(self): model_metadata = triton_client.get_model_metadata("identity_model") self.assertEqual(model_metadata.platform, "python") - def test_concurrent_load_unload(self): + def test_concurrent_model_load_unload(self): # Initialize client try: triton_client = grpcclient.InferenceServerClient( @@ -2916,7 +2916,7 @@ def test_concurrent_load_unload(self): for model_name in model_names: self.assertEqual(is_load, triton_client.is_model_ready(model_name)) - def test_load_unload_same_model_stress(self): + def test_concurrent_same_model_load_unload_stress(self): model_name = "identity_zero_1_int32" num_threads = 16 num_iterations = 1024 diff --git a/qa/L0_lifecycle/test.sh b/qa/L0_lifecycle/test.sh index 20a8ef955d..894fbd43b8 100755 --- a/qa/L0_lifecycle/test.sh +++ b/qa/L0_lifecycle/test.sh @@ -1713,7 +1713,7 @@ wait $SERVER_PID LOG_IDX=$((LOG_IDX+1)) -# LifeCycleTest.test_concurrent_load_speedup +# LifeCycleTest.test_concurrent_model_load_speedup rm -rf models mkdir models MODEL_NAME="identity_zero_1_int32" @@ -1743,7 +1743,7 @@ if [ "$SERVER_PID" == "0" ]; then fi set +e -python $LC_TEST LifeCycleTest.test_concurrent_load_speedup >>$CLIENT_LOG 2>&1 +python $LC_TEST LifeCycleTest.test_concurrent_model_load_speedup >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" @@ -1756,7 +1756,7 @@ wait $SERVER_PID LOG_IDX=$((LOG_IDX+1)) -# LifeCycleTest.test_concurrent_load +# LifeCycleTest.test_concurrent_model_load rm -rf models models_v1 models_v2 mkdir models models_v2 cp -r identity_zero_1_int32 models/identity_model && \ @@ -1778,7 +1778,7 @@ if [ "$SERVER_PID" == "0" ]; then fi set +e -python $LC_TEST LifeCycleTest.test_concurrent_load >>$CLIENT_LOG 2>&1 +python $LC_TEST LifeCycleTest.test_concurrent_model_load >>$CLIENT_LOG 2>&1 if [ $? 
-ne 0 ]; then cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" @@ -1791,7 +1791,7 @@ wait $SERVER_PID LOG_IDX=$((LOG_IDX+1)) -# LifeCycleTest.test_concurrent_load_unload +# LifeCycleTest.test_concurrent_model_load_unload rm -rf models mkdir models cp -r identity_zero_1_int32 models && mkdir -p models/identity_zero_1_int32/1 @@ -1813,7 +1813,7 @@ if [ "$SERVER_PID" == "0" ]; then fi set +e -python $LC_TEST LifeCycleTest.test_concurrent_load_unload >>$CLIENT_LOG 2>&1 +python $LC_TEST LifeCycleTest.test_concurrent_model_load_unload >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" @@ -1826,7 +1826,7 @@ wait $SERVER_PID LOG_IDX=$((LOG_IDX+1)) -# LifeCycleTest.test_load_unload_same_model_stress +# LifeCycleTest.test_concurrent_same_model_load_unload_stress rm -rf models mkdir models cp -r identity_zero_1_int32 models && \ @@ -1844,7 +1844,7 @@ if [ "$SERVER_PID" == "0" ]; then fi set +e -python $LC_TEST LifeCycleTest.test_load_unload_same_model_stress >>$CLIENT_LOG 2>&1 +python $LC_TEST LifeCycleTest.test_concurrent_same_model_load_unload_stress >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" From 781cab1bfe816a3ffd5eaf23b01a7bfa38314bcd Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 10 Aug 2023 11:15:35 -0700 Subject: [PATCH 5/7] Address pre-commit error --- README.md | 16 ++++++------ docs/index.md | 10 ++++---- docs/user_guide/faq.md | 56 +++++++++++++++++++++--------------------- 3 files changed, 41 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index 229f4f4103..eea96a979f 100644 --- a/README.md +++ b/README.md @@ -38,15 +38,15 @@ and corresponds to the 23.07 container release on ---- Triton Inference Server is an open source inference serving software that -streamlines AI inferencing. Triton enables teams to deploy any AI model from -multiple deep learning and machine learning frameworks, including TensorRT, -TensorFlow, PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton -Inference Server supports inference across cloud, data center, edge and embedded -devices on NVIDIA GPUs, x86 and ARM CPU, or AWS Inferentia. Triton Inference -Server delivers optimized performance for many query types, including real time, +streamlines AI inferencing. Triton enables teams to deploy any AI model from +multiple deep learning and machine learning frameworks, including TensorRT, +TensorFlow, PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton +Inference Server supports inference across cloud, data center, edge and embedded +devices on NVIDIA GPUs, x86 and ARM CPU, or AWS Inferentia. Triton Inference +Server delivers optimized performance for many query types, including real time, batched, ensembles and audio/video streaming. Triton inference Server is part of -[NVIDIA AI Enterprise](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/), -a software platform that accelerates the data science pipeline and streamlines +[NVIDIA AI Enterprise](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/), +a software platform that accelerates the data science pipeline and streamlines the development and deployment of production AI. 
Major features include: diff --git a/docs/index.md b/docs/index.md index 62bdb27d43..6d42750eaa 100644 --- a/docs/index.md +++ b/docs/index.md @@ -60,15 +60,15 @@ Triton Inference Server is an open source inference serving software that stream # Triton Inference Server -Triton Inference Server enables teams to deploy any AI model from multiple deep -learning and machine learning frameworks, including TensorRT, TensorFlow, +Triton Inference Server enables teams to deploy any AI model from multiple deep +learning and machine learning frameworks, including TensorRT, TensorFlow, PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton supports inference across cloud, data center, edge and embedded devices on NVIDIA GPUs, x86 and ARM CPU, or AWS Inferentia. Triton Inference Server delivers optimized performance -for many query types, including real time, batched, ensembles and audio/video +for many query types, including real time, batched, ensembles and audio/video streaming. Triton inference Server is part of -[NVIDIA AI Enterprise](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/), -a software platform that accelerates the data science pipeline and streamlines +[NVIDIA AI Enterprise](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/), +a software platform that accelerates the data science pipeline and streamlines the development and deployment of production AI. Major features include: diff --git a/docs/user_guide/faq.md b/docs/user_guide/faq.md index c272fd25a3..523b38f750 100644 --- a/docs/user_guide/faq.md +++ b/docs/user_guide/faq.md @@ -165,41 +165,41 @@ the backtrace to better help us resolve the problem. ## What are the benefits of using [Triton Inference Server](https://developer.nvidia.com/triton-inference-server) as part of the [NVIDIA AI Enterprise Software Suite](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/)? -NVIDIA AI Enterprise enables enterprises to implement full AI workflows by +NVIDIA AI Enterprise enables enterprises to implement full AI workflows by delivering an entire end-to-end AI platform. Four key benefits: ### Enterprise-Grade Support, Security & API Stability: -Business-critical AI projects stay on track with NVIDIA Enterprise Support, -available globally to assist both IT teams with deploying and managing the -lifecycle of AI applications and the developer teams with building AI -applications. Support includes maintenance updates, dependable SLAs and -response times. Regular security reviews and priority notifications mitigate -potential risk of unmanaged opensource and ensure compliance with corporate -standards. Finally, long term support and regression testing ensures API +Business-critical AI projects stay on track with NVIDIA Enterprise Support, +available globally to assist both IT teams with deploying and managing the +lifecycle of AI applications and the developer teams with building AI +applications. Support includes maintenance updates, dependable SLAs and +response times. Regular security reviews and priority notifications mitigate +potential risk of unmanaged opensource and ensure compliance with corporate +standards. Finally, long term support and regression testing ensures API stability between releases. 
-### Speed time to production with AI Workflows & Pretrained Models: -To reduce the complexity of developing common AI applications, NVIDIA AI -Enterprise includes -[AI workflows](https://www.nvidia.com/en-us/launchpad/ai/workflows/) which are -reference applications for specific business outcomes such as Intelligent -Virtual Assistants and Digital Fingerprinting for real-time cybersecurity threat -detection. AI workflow reference applications may include -[AI frameworks](https://docs.nvidia.com/deeplearning/frameworks/index.html) and -[pretrained models](https://developer.nvidia.com/ai-models), -[Helm Charts](https://catalog.ngc.nvidia.com/helm-charts), -[Jupyter Notebooks](https://developer.nvidia.com/run-jupyter-notebooks) and +### Speed time to production with AI Workflows & Pretrained Models: +To reduce the complexity of developing common AI applications, NVIDIA AI +Enterprise includes +[AI workflows](https://www.nvidia.com/en-us/launchpad/ai/workflows/) which are +reference applications for specific business outcomes such as Intelligent +Virtual Assistants and Digital Fingerprinting for real-time cybersecurity threat +detection. AI workflow reference applications may include +[AI frameworks](https://docs.nvidia.com/deeplearning/frameworks/index.html) and +[pretrained models](https://developer.nvidia.com/ai-models), +[Helm Charts](https://catalog.ngc.nvidia.com/helm-charts), +[Jupyter Notebooks](https://developer.nvidia.com/run-jupyter-notebooks) and [documentation](https://docs.nvidia.com/ai-enterprise/index.html#overview). -### Performance for Efficiency and Cost Savings: -Using accelerated compute for AI workloads such as data process with -[NVIDIA RAPIDS Accelerator](https://developer.nvidia.com/rapids) for Apache -Spark and inference with Triton Inference Sever delivers better performance -which also improves efficiency and reduces operation and infrastructure costs, +### Performance for Efficiency and Cost Savings: +Using accelerated compute for AI workloads such as data process with +[NVIDIA RAPIDS Accelerator](https://developer.nvidia.com/rapids) for Apache +Spark and inference with Triton Inference Sever delivers better performance +which also improves efficiency and reduces operation and infrastructure costs, including savings from reduced time and energy consumption. -### Optimized and Certified to Deploy Everywhere: -Cloud, Data Center, Edge Optimized and certified to ensure reliable performance -whether it’s running your AI in the public cloud, virtualized data centers, or -on DGX systems. +### Optimized and Certified to Deploy Everywhere: +Cloud, Data Center, Edge Optimized and certified to ensure reliable performance +whether it’s running your AI in the public cloud, virtualized data centers, or +on DGX systems. From 267bf186de8f85b753206d9f4aa49cfacc25cbab Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Fri, 11 Aug 2023 14:40:44 -0700 Subject: [PATCH 6/7] Revert "Address pre-commit error" This reverts commit 781cab1bfe816a3ffd5eaf23b01a7bfa38314bcd. --- README.md | 16 ++++++------ docs/index.md | 10 ++++---- docs/user_guide/faq.md | 56 +++++++++++++++++++++--------------------- 3 files changed, 41 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index eea96a979f..229f4f4103 100644 --- a/README.md +++ b/README.md @@ -38,15 +38,15 @@ and corresponds to the 23.07 container release on ---- Triton Inference Server is an open source inference serving software that -streamlines AI inferencing. 
Triton enables teams to deploy any AI model from -multiple deep learning and machine learning frameworks, including TensorRT, -TensorFlow, PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton -Inference Server supports inference across cloud, data center, edge and embedded -devices on NVIDIA GPUs, x86 and ARM CPU, or AWS Inferentia. Triton Inference -Server delivers optimized performance for many query types, including real time, +streamlines AI inferencing. Triton enables teams to deploy any AI model from +multiple deep learning and machine learning frameworks, including TensorRT, +TensorFlow, PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton +Inference Server supports inference across cloud, data center, edge and embedded +devices on NVIDIA GPUs, x86 and ARM CPU, or AWS Inferentia. Triton Inference +Server delivers optimized performance for many query types, including real time, batched, ensembles and audio/video streaming. Triton inference Server is part of -[NVIDIA AI Enterprise](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/), -a software platform that accelerates the data science pipeline and streamlines +[NVIDIA AI Enterprise](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/), +a software platform that accelerates the data science pipeline and streamlines the development and deployment of production AI. Major features include: diff --git a/docs/index.md b/docs/index.md index 6d42750eaa..62bdb27d43 100644 --- a/docs/index.md +++ b/docs/index.md @@ -60,15 +60,15 @@ Triton Inference Server is an open source inference serving software that stream # Triton Inference Server -Triton Inference Server enables teams to deploy any AI model from multiple deep -learning and machine learning frameworks, including TensorRT, TensorFlow, +Triton Inference Server enables teams to deploy any AI model from multiple deep +learning and machine learning frameworks, including TensorRT, TensorFlow, PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton supports inference across cloud, data center, edge and embedded devices on NVIDIA GPUs, x86 and ARM CPU, or AWS Inferentia. Triton Inference Server delivers optimized performance -for many query types, including real time, batched, ensembles and audio/video +for many query types, including real time, batched, ensembles and audio/video streaming. Triton inference Server is part of -[NVIDIA AI Enterprise](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/), -a software platform that accelerates the data science pipeline and streamlines +[NVIDIA AI Enterprise](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/), +a software platform that accelerates the data science pipeline and streamlines the development and deployment of production AI. Major features include: diff --git a/docs/user_guide/faq.md b/docs/user_guide/faq.md index 523b38f750..c272fd25a3 100644 --- a/docs/user_guide/faq.md +++ b/docs/user_guide/faq.md @@ -165,41 +165,41 @@ the backtrace to better help us resolve the problem. ## What are the benefits of using [Triton Inference Server](https://developer.nvidia.com/triton-inference-server) as part of the [NVIDIA AI Enterprise Software Suite](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/)? -NVIDIA AI Enterprise enables enterprises to implement full AI workflows by +NVIDIA AI Enterprise enables enterprises to implement full AI workflows by delivering an entire end-to-end AI platform. 
Four key benefits: ### Enterprise-Grade Support, Security & API Stability: -Business-critical AI projects stay on track with NVIDIA Enterprise Support, -available globally to assist both IT teams with deploying and managing the -lifecycle of AI applications and the developer teams with building AI -applications. Support includes maintenance updates, dependable SLAs and -response times. Regular security reviews and priority notifications mitigate -potential risk of unmanaged opensource and ensure compliance with corporate -standards. Finally, long term support and regression testing ensures API +Business-critical AI projects stay on track with NVIDIA Enterprise Support, +available globally to assist both IT teams with deploying and managing the +lifecycle of AI applications and the developer teams with building AI +applications. Support includes maintenance updates, dependable SLAs and +response times. Regular security reviews and priority notifications mitigate +potential risk of unmanaged opensource and ensure compliance with corporate +standards. Finally, long term support and regression testing ensures API stability between releases. -### Speed time to production with AI Workflows & Pretrained Models: -To reduce the complexity of developing common AI applications, NVIDIA AI -Enterprise includes -[AI workflows](https://www.nvidia.com/en-us/launchpad/ai/workflows/) which are -reference applications for specific business outcomes such as Intelligent -Virtual Assistants and Digital Fingerprinting for real-time cybersecurity threat -detection. AI workflow reference applications may include -[AI frameworks](https://docs.nvidia.com/deeplearning/frameworks/index.html) and -[pretrained models](https://developer.nvidia.com/ai-models), -[Helm Charts](https://catalog.ngc.nvidia.com/helm-charts), -[Jupyter Notebooks](https://developer.nvidia.com/run-jupyter-notebooks) and +### Speed time to production with AI Workflows & Pretrained Models: +To reduce the complexity of developing common AI applications, NVIDIA AI +Enterprise includes +[AI workflows](https://www.nvidia.com/en-us/launchpad/ai/workflows/) which are +reference applications for specific business outcomes such as Intelligent +Virtual Assistants and Digital Fingerprinting for real-time cybersecurity threat +detection. AI workflow reference applications may include +[AI frameworks](https://docs.nvidia.com/deeplearning/frameworks/index.html) and +[pretrained models](https://developer.nvidia.com/ai-models), +[Helm Charts](https://catalog.ngc.nvidia.com/helm-charts), +[Jupyter Notebooks](https://developer.nvidia.com/run-jupyter-notebooks) and [documentation](https://docs.nvidia.com/ai-enterprise/index.html#overview). -### Performance for Efficiency and Cost Savings: -Using accelerated compute for AI workloads such as data process with -[NVIDIA RAPIDS Accelerator](https://developer.nvidia.com/rapids) for Apache -Spark and inference with Triton Inference Sever delivers better performance -which also improves efficiency and reduces operation and infrastructure costs, +### Performance for Efficiency and Cost Savings: +Using accelerated compute for AI workloads such as data process with +[NVIDIA RAPIDS Accelerator](https://developer.nvidia.com/rapids) for Apache +Spark and inference with Triton Inference Sever delivers better performance +which also improves efficiency and reduces operation and infrastructure costs, including savings from reduced time and energy consumption. 
-### Optimized and Certified to Deploy Everywhere: -Cloud, Data Center, Edge Optimized and certified to ensure reliable performance -whether it’s running your AI in the public cloud, virtualized data centers, or -on DGX systems. +### Optimized and Certified to Deploy Everywhere: +Cloud, Data Center, Edge Optimized and certified to ensure reliable performance +whether it’s running your AI in the public cloud, virtualized data centers, or +on DGX systems. From e729a67a851ab0318000933719dfc42e7a80b2c7 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Fri, 11 Aug 2023 17:29:32 -0700 Subject: [PATCH 7/7] Record number of occurrence of each exception --- qa/L0_lifecycle/lifecycle_test.py | 63 ++++++++++++++++++++----------- qa/L0_lifecycle/test.sh | 2 + 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/qa/L0_lifecycle/lifecycle_test.py b/qa/L0_lifecycle/lifecycle_test.py index 1600b87ed4..0ff7511541 100755 --- a/qa/L0_lifecycle/lifecycle_test.py +++ b/qa/L0_lifecycle/lifecycle_test.py @@ -2927,52 +2927,71 @@ def test_concurrent_same_model_load_unload_stress(self): ) except Exception as ex: self.assertTrue(False, "unexpected error {}".format(ex)) - load_unload_exceptions = {"load_before_unload_finish_count": 0} + + load_fail_reasons = [ + "unexpected miss in global map", + "no version is available", + "failed to poll from model repository", + ] + unload_fail_reasons = ["versions that are still available: 1"] + load_fail_messages = [ + ("failed to load '" + model_name + "', " + reason) + for reason in load_fail_reasons + ] + unload_fail_messages = [ + ("failed to unload '" + model_name + "', " + reason) + for reason in unload_fail_reasons + ] + global_exception_stats = {} # { "exception message": number of occurrence } + load_before_unload_finish = [False] # use list to access by reference def _load_unload(): + exception_stats = {} # { "exception message": number of occurrence } for i in range(num_iterations): try: triton_client.load_model(model_name) except InferenceServerException as ex: # Acceptable for an unload to happen after a load completes, but # before the load can verify its load state. 
- fail_reasons = [ - "unexpected miss in global map", - "no version is available", - "failed to poll from model repository", - ] - fail_messages = [ - ("failed to load '" + model_name + "', " + reason) - for reason in fail_reasons - ] - self.assertIn(ex.message(), fail_messages) + error_message = ex.message() + self.assertIn(error_message, load_fail_messages) + if error_message not in exception_stats: + exception_stats[error_message] = 0 + exception_stats[error_message] += 1 try: triton_client.unload_model(model_name) except InferenceServerException as ex: # Acceptable for a load to happen during an async unload - self.assertEqual( - ex.message(), - "failed to unload '" - + model_name - + "', versions that are still available: 1", - ) - load_unload_exceptions["load_before_unload_finish_count"] += 1 + error_message = ex.message() + self.assertIn(error_message, unload_fail_messages) + if error_message not in exception_stats: + exception_stats[error_message] = 0 + exception_stats[error_message] += 1 + load_before_unload_finish[0] = True + return exception_stats with concurrent.futures.ThreadPoolExecutor() as pool: threads = [] for i in range(num_threads): threads.append(pool.submit(_load_unload)) for t in threads: - t.result() + exception_stats = t.result() + for key, count in exception_stats.items(): + if key not in global_exception_stats: + global_exception_stats[key] = 0 + global_exception_stats[key] += count self.assertTrue(triton_client.is_server_live()) self.assertTrue(triton_client.is_server_ready()) - self.assertGreater( - load_unload_exceptions["load_before_unload_finish_count"], - 0, + self.assertTrue( + load_before_unload_finish[0], "The test case did not replicate a load while async unloading. Consider increase concurrency.", ) + stats_path = "./test_concurrent_same_model_load_unload_stress.statistics.log" + with open(stats_path, mode="w", encoding="utf-8") as f: + f.write(str(global_exception_stats) + "\n") + def test_concurrent_model_instance_load_speedup(self): # Initialize client try: diff --git a/qa/L0_lifecycle/test.sh b/qa/L0_lifecycle/test.sh index e0090bb47b..4d0ab84517 100755 --- a/qa/L0_lifecycle/test.sh +++ b/qa/L0_lifecycle/test.sh @@ -1849,6 +1849,8 @@ if [ $? -ne 0 ]; then cat $CLIENT_LOG echo -e "\n***\n*** Test Failed\n***" RET=1 +else + cat ./test_concurrent_same_model_load_unload_stress.statistics.log fi set -e