diff --git a/qa/L0_io/test.sh b/qa/L0_io/test.sh
index b8830d9ee4..3a9bb76b31 100755
--- a/qa/L0_io/test.sh
+++ b/qa/L0_io/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -47,13 +47,11 @@ MODELSDIR=`pwd`/models
 DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository
 ENSEMBLEDIR=/data/inferenceserver/${REPO_VERSION}/qa_ensemble_model_repository/qa_model_repository
 
-export CUDA_VISIBLE_DEVICES=0,1
-
 # Must explicitly set LD_LIBRARY_PATH so that IO_TEST_UTIL can find
 # libtritonserver.so.
 LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH
 
-rm -f $CLIENT_LOG.*
+rm -f $CLIENT_LOG*
 
 # PyTorch is required for the Python backend dlpack add sub models
 pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html
@@ -148,23 +146,47 @@ cp -r $MODELSDIR/fan_graphdef_float32_float32_float32 $MODELSDIR/fan_${full} &&
 cp -r $ENSEMBLEDIR/nop_TYPE_FP32_-1 $MODELSDIR/. && \
     mkdir -p $MODELSDIR/nop_TYPE_FP32_-1/1
 
+# prepare libtorch multi-device and multi-gpu models
+cp -r ../L0_libtorch_instance_group_kind_model/models/libtorch_multi_device $MODELSDIR/.
+cp ../L0_libtorch_instance_group_kind_model/gen_models.py ./gen_libtorch_model.py
+mkdir -p $MODELSDIR/libtorch_multi_device/1
+mkdir -p $MODELSDIR/libtorch_multi_gpu/1
+cp $MODELSDIR/libtorch_multi_device/config.pbtxt $MODELSDIR/libtorch_multi_gpu/.
+(cd $MODELSDIR/libtorch_multi_gpu && \
+    sed -i "s/name: \"libtorch_multi_device\"/name: \"libtorch_multi_gpu\"/" config.pbtxt)
+
+set +e
+python3 gen_libtorch_model.py >> $CLIENT_LOG 2>&1
+if [ $? -ne 0 ]; then
+    echo -e "\n***\n*** Error when generating libtorch models. \n***"
+    cat $CLIENT_LOG
+    RET=1
+fi
+set -e
+
+TRIALS="graphdef savedmodel onnx libtorch plan python python_dlpack libtorch_multi_gpu libtorch_multi_device"
 for input_device in -1 0 1; do
     for output_device in -1 0 1; do
-        for trial in graphdef savedmodel onnx libtorch plan python python_dlpack; do
+        for trial in ${TRIALS}; do
             # TensorRT Plan should only be deployed on GPU device
             model_devices="-1 0 1" && [[ "$trial" == "plan" ]] && model_devices="0 1"
+            full=${trial}_float32_float32_float32 && [[ "$trial" == "libtorch_multi"* ]] && full=${trial}
+
             for model_device in $model_devices; do
-                full=${trial}_float32_float32_float32
                 full_log=$CLIENT_LOG.$full.$input_device.$output_device.$model_device
 
                 host_policy=cpu
                 if [ "$model_device" == "-1" ]; then
-                    (cd $MODELSDIR/${full} && \
-                        sed -i "s/instance_group.*/instance_group [{ kind: KIND_CPU }]/" config.pbtxt)
+                    if [[ "$trial" != "libtorch_multi"* ]]; then
+                        (cd $MODELSDIR/${full} && \
+                            sed -i "s/instance_group.*/instance_group [{ kind: KIND_CPU }]/" config.pbtxt)
+                    fi
                 else
                     host_policy=gpu_${model_device}
-                    (cd $MODELSDIR/${full} && \
-                        sed -i "s/instance_group.*/instance_group [{ kind: KIND_GPU, gpus: [${model_device}] }]/" config.pbtxt)
+                    if [[ "$trial" != "libtorch_multi"* ]]; then
+                        (cd $MODELSDIR/${full} && \
+                            sed -i "s/instance_group.*/instance_group [{ kind: KIND_GPU, gpus: [${model_device}] }]/" config.pbtxt)
+                    fi
                 fi
 
                 set +e
@@ -196,14 +218,16 @@ for input_device in -1 0 1; do
                 set -e
 
                 # ensemble
-                set +e
-                $IO_TEST_UTIL -i $input_device -o $output_device -r $MODELSDIR -m fan_$full >>$full_log.ensemble 2>&1
-                if [ $? -ne 0 ]; then
-                    cat $full_log.ensemble
-                    echo -e "\n***\n*** Test Failed\n***"
-                    RET=1
+                if [[ "$trial" != "libtorch_multi"* ]]; then
+                    set +e
+                    $IO_TEST_UTIL -i $input_device -o $output_device -r $MODELSDIR -m fan_$full >>$full_log.ensemble 2>&1
+                    if [ $? -ne 0 ]; then
+                        cat $full_log.ensemble
+                        echo -e "\n***\n*** Test Failed\n***"
+                        RET=1
+                    fi
+                    set -e
                 fi
-                set -e
             done
         done
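For reference, the rewritten L0_io loop sweeps every trial against input, output, and model device placements. A minimal Python sketch of the same matrix logic (illustrative only, not part of the patch; `plan` stays GPU-only, and the `libtorch_multi_*` trials keep the KIND_MODEL instance group from their checked-in config rather than having it rewritten per device):

```python
# Sketch of the combinations the loop above covers; device -1 means
# CPU, 0/1 are GPU ids, and TRIALS mirrors the shell variable.
TRIALS = ("graphdef savedmodel onnx libtorch plan python "
          "python_dlpack libtorch_multi_gpu libtorch_multi_device").split()

combinations = []
for input_device in (-1, 0, 1):
    for output_device in (-1, 0, 1):
        for trial in TRIALS:
            model_devices = (0, 1) if trial == "plan" else (-1, 0, 1)
            for model_device in model_devices:
                # KIND_MODEL models keep their config.pbtxt untouched.
                rewrite_group = not trial.startswith("libtorch_multi")
                combinations.append((trial, input_device, output_device,
                                     model_device, rewrite_group))

# 234 = 9 input/output pairs x (8 trials * 3 devices + plan * 2 devices)
print(len(combinations))
```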
diff --git a/qa/L0_libtorch_instance_group_kind_model/client.py b/qa/L0_libtorch_instance_group_kind_model/client.py
new file mode 100644
index 0000000000..c3c8289f8a
--- /dev/null
+++ b/qa/L0_libtorch_instance_group_kind_model/client.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import sys
+
+sys.path.append("../common")
+
+import unittest
+import numpy as np
+import test_util as tu
+
+import tritonclient.http as httpclient
+
+# By default, find tritonserver on "localhost", but can be overridden
+# with TRITONSERVER_IPADDR envvar
+_tritonserver_ipaddr = os.environ.get('TRITONSERVER_IPADDR', 'localhost')
+
+
+class InferTest(tu.TestResultCollector):
+
+    def test_infer(self):
+        try:
+            triton_client = httpclient.InferenceServerClient(
+                url=f"{_tritonserver_ipaddr}:8000")
+        except Exception as e:
+            print("channel creation failed: " + str(e))
+            sys.exit(1)
+
+        model_name = os.environ['MODEL_NAME']
+
+        inputs = []
+        outputs = []
+        inputs.append(httpclient.InferInput('INPUT0', [1, 16], "FP32"))
+        inputs.append(httpclient.InferInput('INPUT1', [1, 16], "FP32"))
+
+        # Create the data for the two input tensors.
+        input0_data = np.arange(start=0, stop=16, dtype=np.float32)
+        input0_data = np.expand_dims(input0_data, axis=0)
+        input1_data = np.arange(start=32, stop=48, dtype=np.float32)
+        input1_data = np.expand_dims(input1_data, axis=0)
+
+        # Initialize the data
+        inputs[0].set_data_from_numpy(input0_data, binary_data=True)
+        inputs[1].set_data_from_numpy(input1_data, binary_data=True)
+
+        outputs.append(
+            httpclient.InferRequestedOutput('OUTPUT__0', binary_data=True))
+        outputs.append(
+            httpclient.InferRequestedOutput('OUTPUT__1', binary_data=True))
+
+        results = triton_client.infer(model_name, inputs, outputs=outputs)
+
+        output0_data = results.as_numpy('OUTPUT__0')
+        output1_data = results.as_numpy('OUTPUT__1')
+
+        expected_output_0 = input0_data + input1_data
+        expected_output_1 = input0_data - input1_data
+
+        self.assertEqual(output0_data.shape, (1, 16))
+        self.assertEqual(output1_data.shape, (1, 16))
+
+        self.assertTrue(np.all(expected_output_0 == output0_data))
+        self.assertTrue(np.all(expected_output_1 == output1_data))
+
+
+if __name__ == '__main__':
+    unittest.main()
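The expected outputs in this client follow from the model scripted in gen_models.py: OUTPUT__0 = INPUT0 + INPUT1 and OUTPUT__1 = INPUT0 - INPUT1. A quick offline check of that arithmetic (a standalone sketch, not invoked by the test):

```python
import numpy as np

# Same inputs as client.py: [0..15] and [32..47], each with a batch dim.
input0 = np.expand_dims(np.arange(0, 16, dtype=np.float32), axis=0)
input1 = np.expand_dims(np.arange(32, 48, dtype=np.float32), axis=0)

assert (input0 + input1).shape == (1, 16)
assert (input0 + input1)[0, 0] == 32.0    # 0 + 32
assert (input0 - input1)[0, -1] == -32.0  # 15 - 47
```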
diff --git a/qa/L0_libtorch_instance_group_kind_model/gen_models.py b/qa/L0_libtorch_instance_group_kind_model/gen_models.py
new file mode 100755
index 0000000000..7cd6c5687e
--- /dev/null
+++ b/qa/L0_libtorch_instance_group_kind_model/gen_models.py
@@ -0,0 +1,83 @@
+#!/usr/bin/python
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import torch
+import torch.nn as nn
+
+
+class SumModule(nn.Module):
+
+    def __init__(self, device):
+        super(SumModule, self).__init__()
+        self.device = device
+
+    def forward(self, INPUT0, INPUT1):
+        INPUT0 = INPUT0.to(self.device)
+        INPUT1 = INPUT1.to(self.device)
+        print('SumModule - INPUT0 device: {}, INPUT1 device: {}\n'.format(
+            INPUT0.device, INPUT1.device))
+        return INPUT0 + INPUT1
+
+
+class DiffModule(nn.Module):
+
+    def __init__(self, device):
+        super(DiffModule, self).__init__()
+        self.device = device
+
+    def forward(self, INPUT0, INPUT1):
+        INPUT0 = INPUT0.to(self.device)
+        INPUT1 = INPUT1.to(self.device)
+        print('DiffModule - INPUT0 device: {}, INPUT1 device: {}\n'.format(
+            INPUT0.device, INPUT1.device))
+        return INPUT0 - INPUT1
+
+
+class TestModel(nn.Module):
+
+    def __init__(self, device0, device1):
+        super(TestModel, self).__init__()
+        self.device0 = device0
+        self.device1 = device1
+
+        self.layer1 = SumModule(self.device0)
+        self.layer2 = DiffModule(self.device1)
+
+    def forward(self, INPUT0, INPUT1):
+        op0 = self.layer1(INPUT0, INPUT1)
+        op1 = self.layer2(INPUT0, INPUT1)
+        return op0, op1
+
+
+devices = [("cuda:2", "cuda:0"), ("cpu", "cuda:3")]
+model_names = ["libtorch_multi_gpu", "libtorch_multi_device"]
+
+for device_pair, model_name in zip(devices, model_names):
+    model = TestModel(device_pair[0], device_pair[1])
+    model_path = "models/" + model_name + "/1/model.pt"
+    scripted_model = torch.jit.script(model)
+    scripted_model.save(model_path)
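Because torch.jit.script bakes the `.to(self.device)` placement into the saved module, the generated artifacts can be smoke-tested outside Triton. A sketch, assuming a host with at least four GPUs so that `cuda:3` exists (illustrative only, not used by the tests):

```python
import torch

# Load the scripted multi-device model from the generated repository.
model = torch.jit.load("models/libtorch_multi_device/1/model.pt")

in0 = torch.arange(0, 16, dtype=torch.float32).unsqueeze(0)
in1 = torch.arange(32, 48, dtype=torch.float32).unsqueeze(0)
out0, out1 = model(in0, in1)

# SumModule was pinned to cpu and DiffModule to cuda:3 at scripting
# time, so the outputs should land on those devices.
print(out0.device, out1.device)  # expect: cpu cuda:3
```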
diff --git a/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_device/config.pbtxt b/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_device/config.pbtxt
new file mode 100755
index 0000000000..bf8ca0d649
--- /dev/null
+++ b/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_device/config.pbtxt
@@ -0,0 +1,60 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "libtorch_multi_device"
+platform: "pytorch_libtorch"
+max_batch_size: 8
+
+input [
+  {
+    name: "INPUT0"
+    data_type: TYPE_FP32
+    dims: [ 16 ]
+  },
+  {
+    name: "INPUT1"
+    data_type: TYPE_FP32
+    dims: [ 16 ]
+  }
+]
+output [
+  {
+    name: "OUTPUT__0"
+    data_type: TYPE_FP32
+    dims: [ 16 ]
+  },
+  {
+    name: "OUTPUT__1"
+    data_type: TYPE_FP32
+    dims: [ 16 ]
+  }
+]
+
+instance_group [
+  {
+    kind: KIND_MODEL
+  }
+]
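Both test scripts derive the libtorch_multi_gpu variant of this config by copying the file and renaming the model with sed. An equivalent Python form, for reference (a sketch, not what the scripts execute):

```python
from pathlib import Path

# Clone the multi-device config and rename the model, mirroring the
# sed command used in test.sh and L0_io/test.sh.
src = Path("models/libtorch_multi_device/config.pbtxt")
dst = Path("models/libtorch_multi_gpu/config.pbtxt")
dst.parent.mkdir(parents=True, exist_ok=True)
dst.write_text(src.read_text().replace(
    'name: "libtorch_multi_device"', 'name: "libtorch_multi_gpu"'))
```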
diff --git a/qa/L0_libtorch_instance_group_kind_model/test.sh b/qa/L0_libtorch_instance_group_kind_model/test.sh
new file mode 100755
index 0000000000..c91095e8ed
--- /dev/null
+++ b/qa/L0_libtorch_instance_group_kind_model/test.sh
@@ -0,0 +1,149 @@
+#!/bin/bash
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION}
+if [ "$#" -ge 1 ]; then
+    REPO_VERSION=$1
+fi
+if [ -z "$REPO_VERSION" ]; then
+    echo -e "Repository version must be specified"
+    echo -e "\n***\n*** Test Failed\n***"
+    exit 1
+fi
+if [ ! -z "$TEST_REPO_ARCH" ]; then
+    REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH}
+fi
+
+pip3 uninstall -y torch
+pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html
+
+DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository
+SERVER=/opt/tritonserver/bin/tritonserver
+SERVER_ARGS="--model-repository=models --log-verbose=1"
+SERVER_LOG="./inference_server.log"
+
+CLIENT_PY=./client.py
+CLIENT_LOG="./client.log"
+EXPECTED_NUM_TESTS="1"
+TEST_RESULT_FILE='test_results.txt'
+
+source ../common/util.sh
+
+RET=0
+
+rm -f *.log *.txt
+
+mkdir -p models/libtorch_multi_device/1
+mkdir -p models/libtorch_multi_gpu/1
+cp models/libtorch_multi_device/config.pbtxt models/libtorch_multi_gpu/.
+(cd models/libtorch_multi_gpu && \
+    sed -i "s/name: \"libtorch_multi_device\"/name: \"libtorch_multi_gpu\"/" config.pbtxt)
+
+# Generate the models which are partitioned across multiple devices
+set +e
+python3 gen_models.py >> $CLIENT_LOG 2>&1
+if [ $? -ne 0 ]; then
+    echo -e "\n***\n*** Error when generating models. \n***"
+    cat $CLIENT_LOG
+    RET=1
+fi
+set -e
+
+run_server
+if [ "$SERVER_PID" == "0" ]; then
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    cat $SERVER_LOG
+    exit 1
+fi
+
+set +e
+
+export MODEL_NAME='libtorch_multi_device'
+python3 $CLIENT_PY >> $CLIENT_LOG 2>&1
+if [ $? -ne 0 ]; then
+    echo -e "\n***\n*** Model $MODEL_NAME FAILED. \n***"
+    cat $CLIENT_LOG
+    RET=1
+else
+    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
+    if [ $? -ne 0 ]; then
+        cat $CLIENT_LOG
+        echo -e "\n***\n*** Test Result Verification Failed\n***"
+        RET=1
+    fi
+fi
+
+MESSAGES=("SumModule - INPUT0 device: cpu, INPUT1 device: cpu"
+          "DiffModule - INPUT0 device: cuda:3, INPUT1 device: cuda:3")
+for MESSAGE in "${MESSAGES[@]}"; do
+    if grep -q "$MESSAGE" "$SERVER_LOG"; then
+        echo -e "Found \"$MESSAGE\"" >> "$CLIENT_LOG"
+    else
+        echo -e "Not found \"$MESSAGE\"" >> "$CLIENT_LOG"
+        RET=1
+    fi
+done
+
+export MODEL_NAME='libtorch_multi_gpu'
+python3 $CLIENT_PY >> $CLIENT_LOG 2>&1
+if [ $? -ne 0 ]; then
+    echo -e "\n***\n*** Model $MODEL_NAME FAILED. \n***"
+    cat $CLIENT_LOG
+    RET=1
+else
+    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
+    if [ $? -ne 0 ]; then
+        cat $CLIENT_LOG
+        echo -e "\n***\n*** Test Result Verification Failed\n***"
+        RET=1
+    fi
+fi
+
+MESSAGES=("SumModule - INPUT0 device: cuda:2, INPUT1 device: cuda:2"
+          "DiffModule - INPUT0 device: cuda:0, INPUT1 device: cuda:0")
+for MESSAGE in "${MESSAGES[@]}"; do
+    if grep -q "$MESSAGE" "$SERVER_LOG"; then
+        echo -e "Found \"$MESSAGE\"" >> "$CLIENT_LOG"
+    else
+        echo -e "Not found \"$MESSAGE\"" >> "$CLIENT_LOG"
+        RET=1
+    fi
+done
+
+set -e
+
+kill $SERVER_PID
+wait $SERVER_PID
+
+if [ $RET -eq 0 ]; then
+    echo -e "\n***\n*** Test Passed\n***"
+else
+    cat $CLIENT_LOG
+    echo -e "\n***\n*** Test FAILED\n***"
+fi
+
+exit $RET
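The grep-based checks above assert that each scripted submodule logged the device it was pinned to. The same verification, expressed as a standalone Python sketch (assuming the inference_server.log written by run_server; the device strings come from the print() calls that gen_models.py scripts into each model):

```python
# Expected server-log lines per model, matching the MESSAGES arrays in
# test.sh above.
expected = {
    "libtorch_multi_device": [
        "SumModule - INPUT0 device: cpu, INPUT1 device: cpu",
        "DiffModule - INPUT0 device: cuda:3, INPUT1 device: cuda:3",
    ],
    "libtorch_multi_gpu": [
        "SumModule - INPUT0 device: cuda:2, INPUT1 device: cuda:2",
        "DiffModule - INPUT0 device: cuda:0, INPUT1 device: cuda:0",
    ],
}

log_text = open("inference_server.log").read()
for model, messages in expected.items():
    for message in messages:
        assert message in log_text, f"{model}: missing {message!r}"
```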