Skip to content

Commit f07b3f9

Browse files
committed
[DO NOT MERGE] Add test. FIXME: model generation
1 parent fb65173 commit f07b3f9

File tree

2 files changed

+212
-0
lines changed

2 files changed

+212
-0
lines changed

qa/L0_device_memory_tracker/test.py

+90
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#!/usr/bin/env python
2+
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions
6+
# are met:
7+
# * Redistributions of source code must retain the above copyright
8+
# notice, this list of conditions and the following disclaimer.
9+
# * Redistributions in binary form must reproduce the above copyright
10+
# notice, this list of conditions and the following disclaimer in the
11+
# documentation and/or other materials provided with the distribution.
12+
# * Neither the name of NVIDIA CORPORATION nor the names of its
13+
# contributors may be used to endorse or promote products derived
14+
# from this software without specific prior written permission.
15+
#
16+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27+
28+
import unittest
29+
import numpy as np
30+
import time
31+
32+
import tritonclient.http as tritonclient
33+
from tritonclient.utils import InferenceServerException
34+
35+
import nvidia_smi
36+
37+
38+
class MemoryUsageTest(unittest.TestCase):
    """Verify that the per-model GPU memory usage recorded in Triton's model
    statistics matches the device-wide memory change observed via NVML when
    the model is unloaded.

    Assumes a Triton server is listening on localhost:8000 with the models
    under test already loaded (see test.sh), and that NVML device index 0 is
    the GPU Triton uses (test.sh sets CUDA_VISIBLE_DEVICES=0).
    """

    # Relative tolerance when comparing recorded vs. observed usage: the NVML
    # number is device-wide, so it can differ slightly from what the backend
    # attributes to a single model.
    TOLERANCE = 0.1
    # Upper bound (seconds) to wait for an unload to finish.
    UNLOAD_TIMEOUT = 10

    def setUp(self):
        nvidia_smi.nvmlInit()
        self.gpu_handle_ = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
        self.client_ = tritonclient.InferenceServerClient(url="localhost:8000")

    def tearDown(self):
        nvidia_smi.nvmlShutdown()

    def report_used_gpu_memory(self):
        """Return the device-wide used GPU memory in bytes, per NVML."""
        info = nvidia_smi.nvmlDeviceGetMemoryInfo(self.gpu_handle_)
        return info.used

    def is_testing_backend(self, model_name, backend_name):
        """Return True if 'model_name' is served by 'backend_name'."""
        return self.client_.get_model_config(
            model_name)["backend"] == backend_name

    def verify_recorded_usage(self, model_stat):
        """Unload the model described by 'model_stat' and assert that the GPU
        memory released (observed via NVML) is within TOLERANCE of the usage
        Triton recorded for that model."""
        recorded_gpu_usage = 0
        for usage in model_stat["memory_usage"]:
            if usage["type"] == "GPU":
                recorded_gpu_usage += usage["byte_size"]

        # Unload and verify recorded usage against the device-wide delta.
        before_total_usage = self.report_used_gpu_memory()
        self.client_.unload_model(model_stat["name"])
        # unload_model() returns once the request is accepted, not when the
        # unload completes; poll until the model is actually gone (bounded by
        # UNLOAD_TIMEOUT) instead of relying on a fixed sleep, which races
        # with slow unloads.
        deadline = time.time() + self.UNLOAD_TIMEOUT
        while self.client_.is_model_ready(
                model_stat["name"]) and time.time() < deadline:
            time.sleep(0.5)
        # Short settle period for the backend to return memory to the driver.
        time.sleep(2)

        usage_delta = before_total_usage - self.report_used_gpu_memory()
        # Check with tolerance as the GPU usage obtained is overall usage.
        lower = usage_delta * (1 - self.TOLERANCE)
        upper = usage_delta * (1 + self.TOLERANCE)
        self.assertTrue(
            lower <= recorded_gpu_usage <= upper,
            msg=
            "For model {}, expect recorded usage to be in range [{}, {}], got {}"
            .format(model_stat["name"], lower, upper, recorded_gpu_usage))

    def test_onnx(self):
        """Check every loaded onnxruntime model."""
        model_stats = self.client_.get_inference_statistics()["model_stats"]
        for model_stat in model_stats:
            if self.is_testing_backend(model_stat["name"], "onnxruntime"):
                self.verify_recorded_usage(model_stat)

    def test_plan(self):
        """Check every loaded TensorRT (plan) model."""
        model_stats = self.client_.get_inference_statistics()["model_stats"]
        for model_stat in model_stats:
            if self.is_testing_backend(model_stat["name"], "tensorrt"):
                self.verify_recorded_usage(model_stat)
88+
89+
# Allow running this file directly: executes all MemoryUsageTest cases.
if __name__ == "__main__":
    unittest.main()

qa/L0_device_memory_tracker/test.sh

+122
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
#!/bin/bash
2+
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions
6+
# are met:
7+
# * Redistributions of source code must retain the above copyright
8+
# notice, this list of conditions and the following disclaimer.
9+
# * Redistributions in binary form must reproduce the above copyright
10+
# notice, this list of conditions and the following disclaimer in the
11+
# documentation and/or other materials provided with the distribution.
12+
# * Neither the name of NVIDIA CORPORATION nor the names of its
13+
# contributors may be used to endorse or promote products derived
14+
# from this software without specific prior written permission.
15+
#
16+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27+
28+
REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION}
if [ "$#" -ge 1 ]; then
    REPO_VERSION=$1
fi
if [ -z "$REPO_VERSION" ]; then
    echo -e "Repository version must be specified"
    echo -e "\n***\n*** Test Failed\n***"
    exit 1
fi
if [ ! -z "$TEST_REPO_ARCH" ]; then
    REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH}
fi

export CUDA_VISIBLE_DEVICES=0

TEST_LOG="./test.log"
TEST_PY=test.py

DATADIR=/data/inferenceserver/${REPO_VERSION}
rm -f *.log

TEST_RESULT_FILE='test_results.txt'
SERVER=/opt/tritonserver/bin/tritonserver
SERVER_LOG="./server.log"

source ../common/util.sh

RET=0

# Prepare model repository; it only contains ONNX and TRT models as those
# backends are known to report device memory usage in model statistics.
rm -rf models && mkdir models
# ONNX
cp -r ${DATADIR}/onnx_model_store/* models/.

# Convert to get TRT models against the system
CAFFE2PLAN=../common/caffe2plan
set +e
mkdir -p models/vgg19_plan/1 && rm -f models/vgg19_plan/1/model.plan && \
    $CAFFE2PLAN -b32 -n prob -o models/vgg19_plan/1/model.plan \
        $DATADIR/caffe_models/vgg19.prototxt $DATADIR/caffe_models/vgg19.caffemodel
if [ $? -ne 0 ]; then
    echo -e "\n***\n*** Failed to generate vgg19 PLAN\n***"
    exit 1
fi

mkdir -p models/resnet50_plan/1 && rm -f models/resnet50_plan/1/model.plan && \
    $CAFFE2PLAN -b32 -n prob -o models/resnet50_plan/1/model.plan \
        $DATADIR/caffe_models/resnet50.prototxt $DATADIR/caffe_models/resnet50.caffemodel
if [ $? -ne 0 ]; then
    echo -e "\n***\n*** Failed to generate resnet50 PLAN\n***"
    exit 1
fi

mkdir -p models/resnet152_plan/1 && rm -f models/resnet152_plan/1/model.plan && \
    $CAFFE2PLAN -h -b32 -n prob -o models/resnet152_plan/1/model.plan \
        $DATADIR/caffe_models/resnet152.prototxt $DATADIR/caffe_models/resnet152.caffemodel
if [ $? -ne 0 ]; then
    echo -e "\n***\n*** Failed to generate resnet152 PLAN\n***"
    exit 1
fi
set -e

# The test uses NVML bindings for Python to validate the reported usage.
pip install nvidia-ml-py3

# Start server to load all models (in parallel), then gradually unload
# the models and expect the memory usage changes match what is reported
# in statistics.
SERVER_ARGS="--model-repository=models --model-control-mode=explicit --load-model=*"
run_server
if [ "$SERVER_PID" == "0" ]; then
    echo -e "\n***\n*** Failed to start $SERVER\n***"
    cat $SERVER_LOG
    exit 1
fi

set +e
python $TEST_PY > $TEST_LOG 2>&1
if [ $? -ne 0 ]; then
    RET=1
fi
# Keep 'set -e' disabled across kill/wait: 'wait' returns the server's
# non-zero exit status after SIGTERM (143), which would otherwise abort
# the script before the result summary below is printed.
kill $SERVER_PID
wait $SERVER_PID
set -e

if [ $RET -eq 0 ]; then
    echo -e "\n***\n*** Test Passed\n***"
else
    cat $SERVER_LOG
    cat $TEST_LOG
    echo -e "\n***\n*** Test FAILED\n***"
fi

exit $RET

0 commit comments

Comments
 (0)