Add testing for Pytorch instance group kind MODEL

krishung5 · krishung5 · commit a439c228f016 · 2023-05-17T19:03:47.000-07:00
diff --git a/qa/L0_libtorch_instance_group_kind_model/client.py b/qa/L0_libtorch_instance_group_kind_model/client.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import sys
+
+sys.path.append("../common")
+
+import torch
+import unittest
+import numpy as np
+import test_util as tu
+
+import tritonclient.http as httpclient
+from tritonclient.utils import InferenceServerException
+
+# By default, find tritonserver on "localhost", but can be overridden
+# with TRITONSERVER_IPADDR envvar
+_tritonserver_ipaddr = os.environ.get('TRITONSERVER_IPADDR', 'localhost')
+
+
+class InferTest(tu.TestResultCollector):
+
+    def test_infer(self):
+        try:
+            triton_client = httpclient.InferenceServerClient(
+                url=f"{_tritonserver_ipaddr}:8000")
+        except Exception as e:
+            print("channel creation failed: " + str(e))
+            sys.exit(1)
+
+        model_name = os.environ['MODEL_NAME']
+
+        inputs = []
+        outputs = []
+        inputs.append(httpclient.InferInput('INPUT0', [1, 16], "FP32"))
+        inputs.append(httpclient.InferInput('INPUT1', [1, 16], "FP32"))
+
+        # Create the data for the two input tensors.
+        input0_data = np.arange(start=0, stop=16, dtype=np.float32)
+        input0_data = np.expand_dims(input0_data, axis=0)
+        input1_data = np.arange(start=32, stop=48, dtype=np.float32)
+        input1_data = np.expand_dims(input1_data, axis=0)
+
+        # Initialize the data
+        inputs[0].set_data_from_numpy(input0_data, binary_data=True)
+        inputs[1].set_data_from_numpy(input1_data, binary_data=True)
+
+        outputs.append(
+            httpclient.InferRequestedOutput('OUTPUT__0', binary_data=True))
+        outputs.append(
+            httpclient.InferRequestedOutput('OUTPUT__1', binary_data=True))
+
+        if model_name == "libtorch_instance_kind_err":
+            with self.assertRaises(InferenceServerException) as ex:
+                results = triton_client.infer(model_name,
+                                              inputs,
+                                              outputs=outputs)
+            self.assertIn(
+                "Expected all tensors to be on the same device, but found at least two devices",
+                str(ex.exception))
+            return
+
+        results = triton_client.infer(model_name, inputs, outputs=outputs)
+
+        output0_data = results.as_numpy('OUTPUT__0')
+        output1_data = results.as_numpy('OUTPUT__1')
+
+        # Only validate the shape, as the output will differ every time the
+        # model is compiled and used on different devices.
+        self.assertEqual(output0_data.shape, (1, 4))
+        self.assertEqual(output1_data.shape, (1, 4))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/qa/L0_libtorch_instance_group_kind_model/gen_models.py b/qa/L0_libtorch_instance_group_kind_model/gen_models.py
@@ -0,0 +1,60 @@
+#!/usr/bin/python
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import torch
+import torch.nn as nn
+from torch.nn.parallel import DataParallel
+
+
+class TestModel(nn.Module):
+
+    def __init__(self, device1, device2):
+        super(TestModel, self).__init__()
+        self.device1 = device1
+        self.device2 = device2
+        self.layers1 = nn.Sequential(nn.Linear(16, 4),).to(self.device1)
+        self.layers2 = nn.Sequential(nn.Linear(16, 4)).to(self.device2)
+
+    def forward(self, INPUT0, INPUT1):
+        INPUT0 = INPUT0.to(self.device1)
+        INPUT1 = INPUT1.to(self.device2)
+        print('INPUT0 device: {}, INPUT1 device: {}\n'.format(
+            INPUT0.device, INPUT1.device))
+
+        op0 = self.layers1(INPUT0 + INPUT0)
+        op1 = self.layers2(INPUT1 + INPUT1)
+        return op0, op1
+
+
+devices = [("cuda:2", "cuda:0"), ("cpu", "cuda:3")]
+model_names = ["libtorch_multi_gpu", "libtorch_multi_devices"]
+
+for device_pair, model_name in zip(devices, model_names):
+    model = TestModel(device_pair[0], device_pair[1])
+    model_path = "models/" + model_name + "/1/model.pt"
+    scripted_model = torch.jit.script(model)
+    scripted_model.save(model_path)
diff --git a/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/1/model.pt b/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/1/model.pt
diff --git a/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/config.pbtxt b/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/config.pbtxt
@@ -0,0 +1,34 @@
+name: "libtorch_multi_devices"
+platform: "pytorch_libtorch"
+max_batch_size: 8
+
+input [
+  {
+    name: "INPUT0"
+    data_type: TYPE_FP32
+    dims: [ 16 ]
+  },
+  {
+    name: "INPUT1"
+    data_type: TYPE_FP32
+    dims: [ 16 ]
+  }
+]
+output [
+  {
+    name: "OUTPUT__0"
+    data_type: TYPE_FP32
+    dims: [ 4 ]
+  },
+  {
+    name: "OUTPUT__1"
+    data_type: TYPE_FP32
+    dims: [ 4 ]
+  }
+]
+
+instance_group [
+  {
+    kind: KIND_MODEL
+  }
+]
diff --git a/qa/L0_libtorch_instance_group_kind_model/test.sh b/qa/L0_libtorch_instance_group_kind_model/test.sh
@@ -0,0 +1,166 @@
+#!/bin/bash
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION}
+if [ "$#" -ge 1 ]; then
+    REPO_VERSION=$1
+fi
+if [ -z "$REPO_VERSION" ]; then
+    echo -e "Repository version must be specified"
+    echo -e "\n***\n*** Test Failed\n***"
+    exit 1
+fi
+if [ ! -z "$TEST_REPO_ARCH" ]; then
+    REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH}
+fi
+
+pip3 uninstall -y torch
+pip3 install torch
+
+DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository
+SERVER=/opt/tritonserver/bin/tritonserver
+SERVER_ARGS="--model-repository=models --log-verbose=1"
+SERVER_LOG="./inference_server.log"
+
+CLIENT_PY=./client.py
+CLIENT_LOG="./client.log"
+EXPECTED_NUM_TESTS="1"
+TEST_RESULT_FILE='test_results.txt'
+
+source ../common/util.sh
+
+RET=0
+
+rm -f *.log *.txt
+
+mkdir -p models/libtorch_multi_gpu/1
+cp models/libtorch_multi_devices/config.pbtxt models/libtorch_multi_gpu/.
+(cd models/libtorch_multi_gpu && \
+    sed -i "s/name: \"libtorch_multi_devices\"/name: \"libtorch_multi_gpu\"/" config.pbtxt)
+
+# Generate the models which are partioned across multiple devices
+set +e
+python3 gen_models.py >> $CLIENT_LOG 2>&1 
+if [ $? -ne 0 ]; then
+    echo -e "\n***\n*** Error when generating models. \n***"
+    cat $CLIENT_LOG
+    RET=1
+fi
+set -e
+
+# Create the model that does not set instance_group_kind to 'KIND_MODEL'
+mkdir -p models/libtorch_instance_kind_err/1
+cp models/libtorch_multi_devices/config.pbtxt models/libtorch_instance_kind_err/.
+cp models/libtorch_multi_devices/1/model.pt models/libtorch_instance_kind_err/1/.
+(cd models/libtorch_instance_kind_err && \
+    sed -i "s/name: \"libtorch_multi_devices\"/name: \"libtorch_instance_kind_err\"/" config.pbtxt && \
+    sed -i "s/kind: KIND_MODEL/kind: KIND_GPU/" config.pbtxt)
+
+run_server
+if [ "$SERVER_PID" == "0" ]; then
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    cat $SERVER_LOG
+    exit 1
+fi
+
+set +e
+
+MESSAGE="INPUT0 device: cpu, INPUT1 device: cuda:3"
+export MODEL_NAME='libtorch_multi_devices'
+python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 
+if [ $? -ne 0 ]; then
+    echo -e "\n***\n*** Model $MODEL_NAME FAILED. \n***"
+    cat $CLIENT_LOG
+    RET=1
+else
+    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
+    if [ $? -ne 0 ]; then
+        cat $CLIENT_LOG
+        echo -e "\n***\n*** Test Result Verification Failed\n***"
+        RET=1
+    fi
+fi
+
+if grep "$MESSAGE" $SERVER_LOG; then
+    echo -e "Found \"$MESSAGE\"" >> $CLIENT_LOG
+else
+    echo -e "Not found \"$MESSAGE\"" >> $CLIENT_LOG
+    RET=1
+fi
+
+MESSAGE="INPUT0 device: cuda:2, INPUT1 device: cuda:0"
+export MODEL_NAME='libtorch_multi_gpu'
+python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 
+if [ $? -ne 0 ]; then
+    echo -e "\n***\n*** Model $MODEL_NAME FAILED. \n***"
+    cat $CLIENT_LOG
+    RET=1
+else
+    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
+    if [ $? -ne 0 ]; then
+        cat $CLIENT_LOG
+        echo -e "\n***\n*** Test Result Verification Failed\n***"
+        RET=1
+    fi
+fi
+
+if grep "$MESSAGE" $SERVER_LOG; then
+    echo -e "Found \"$MESSAGE\"" >> $CLIENT_LOG
+else
+    echo -e "Not found \"$MESSAGE\"" >> $CLIENT_LOG
+    RET=1
+fi
+
+MESSAGE="INPUT0 device: cuda:2, INPUT1 device: cuda:0"
+export MODEL_NAME='libtorch_instance_kind_err'
+python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 
+if [ $? -ne 0 ]; then
+    echo -e "\n***\n*** Model $MODEL_NAME FAILED. \n***"
+    cat $CLIENT_LOG
+    RET=1
+else
+    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
+    if [ $? -ne 0 ]; then
+        cat $CLIENT_LOG
+        echo -e "\n***\n*** Test Result Verification Failed\n***"
+        RET=1
+    fi
+fi
+
+set -e
+
+kill $SERVER_PID
+wait $SERVER_PID
+
+if [ $RET -eq 0 ]; then
+    echo -e "\n***\n*** Test Passed\n***"
+else
+    cat $CLIENT_LOG
+    echo -e "\n***\n*** Test FAILED\n***"
+fi
+
+exit $RET