From dc0f7882fe81557428eeb60bbc0dffd2f7a83af6 Mon Sep 17 00:00:00 2001
From: krishung5
Date: Wed, 17 May 2023 18:57:57 -0700
Subject: [PATCH 1/9] Add testing for PyTorch instance group kind MODEL

---
 .../client.py                                 | 100 +++++++++++
 .../gen_models.py                             |  60 +++++++
 .../models/libtorch_multi_devices/1/model.pt  | Bin 0 -> 5558 bytes
 .../libtorch_multi_devices/config.pbtxt       |  34 ++++
 .../test.sh                                   | 166 ++++++++++++++++++
 5 files changed, 360 insertions(+)
 create mode 100644 qa/L0_libtorch_instance_group_kind_model/client.py
 create mode 100755 qa/L0_libtorch_instance_group_kind_model/gen_models.py
 create mode 100644 qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/1/model.pt
 create mode 100755 qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/config.pbtxt
 create mode 100755 qa/L0_libtorch_instance_group_kind_model/test.sh

diff --git a/qa/L0_libtorch_instance_group_kind_model/client.py b/qa/L0_libtorch_instance_group_kind_model/client.py
new file mode 100644
index 0000000000..e882496e7a
--- /dev/null
+++ b/qa/L0_libtorch_instance_group_kind_model/client.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +import os +import sys + +sys.path.append("../common") + +import torch +import unittest +import numpy as np +import test_util as tu + +import tritonclient.http as httpclient +from tritonclient.utils import InferenceServerException + +# By default, find tritonserver on "localhost", but can be overridden +# with TRITONSERVER_IPADDR envvar +_tritonserver_ipaddr = os.environ.get('TRITONSERVER_IPADDR', 'localhost') + + +class InferTest(tu.TestResultCollector): + + def test_infer(self): + try: + triton_client = httpclient.InferenceServerClient( + url=f"{_tritonserver_ipaddr}:8000") + except Exception as e: + print("channel creation failed: " + str(e)) + sys.exit(1) + + model_name = os.environ['MODEL_NAME'] + + inputs = [] + outputs = [] + inputs.append(httpclient.InferInput('INPUT0', [1, 16], "FP32")) + inputs.append(httpclient.InferInput('INPUT1', [1, 16], "FP32")) + + # Create the data for the two input tensors. + input0_data = np.arange(start=0, stop=16, dtype=np.float32) + input0_data = np.expand_dims(input0_data, axis=0) + input1_data = np.arange(start=32, stop=48, dtype=np.float32) + input1_data = np.expand_dims(input1_data, axis=0) + + # Initialize the data + inputs[0].set_data_from_numpy(input0_data, binary_data=True) + inputs[1].set_data_from_numpy(input1_data, binary_data=True) + + outputs.append( + httpclient.InferRequestedOutput('OUTPUT__0', binary_data=True)) + outputs.append( + httpclient.InferRequestedOutput('OUTPUT__1', binary_data=True)) + + if model_name == "libtorch_instance_kind_err": + with self.assertRaises(InferenceServerException) as ex: + results = triton_client.infer(model_name, + inputs, + outputs=outputs) + self.assertIn( + "Expected all tensors to be on the same device, but found at least two devices", + str(ex.exception)) + return + + results = triton_client.infer(model_name, inputs, outputs=outputs) + + output0_data = results.as_numpy('OUTPUT__0') + output1_data = results.as_numpy('OUTPUT__1') + + # Only validate the shape, as the output will differ every time the + # model is compiled and used on different devices. + self.assertEqual(output0_data.shape, (1, 4)) + self.assertEqual(output1_data.shape, (1, 4)) + + +if __name__ == '__main__': + unittest.main() diff --git a/qa/L0_libtorch_instance_group_kind_model/gen_models.py b/qa/L0_libtorch_instance_group_kind_model/gen_models.py new file mode 100755 index 0000000000..500f8e6667 --- /dev/null +++ b/qa/L0_libtorch_instance_group_kind_model/gen_models.py @@ -0,0 +1,60 @@ +#!/usr/bin/python +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import torch +import torch.nn as nn +from torch.nn.parallel import DataParallel + + +class TestModel(nn.Module): + + def __init__(self, device1, device2): + super(TestModel, self).__init__() + self.device1 = device1 + self.device2 = device2 + self.layers1 = nn.Sequential(nn.Linear(16, 4),).to(self.device1) + self.layers2 = nn.Sequential(nn.Linear(16, 4)).to(self.device2) + + def forward(self, INPUT0, INPUT1): + INPUT0 = INPUT0.to(self.device1) + INPUT1 = INPUT1.to(self.device2) + print('INPUT0 device: {}, INPUT1 device: {}\n'.format( + INPUT0.device, INPUT1.device)) + + op0 = self.layers1(INPUT0 + INPUT0) + op1 = self.layers2(INPUT1 + INPUT1) + return op0, op1 + + +devices = [("cuda:2", "cuda:0"), ("cpu", "cuda:3")] +model_names = ["libtorch_multi_gpu", "libtorch_multi_devices"] + +for device_pair, model_name in zip(devices, model_names): + model = TestModel(device_pair[0], device_pair[1]) + model_path = "models/" + model_name + "/1/model.pt" + scripted_model = torch.jit.script(model) + scripted_model.save(model_path) diff --git a/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/1/model.pt b/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/1/model.pt new file mode 100644 index 0000000000000000000000000000000000000000..7fc331f64d56175e51934956d6f7ff2a6ce1f084 GIT binary patch literal 5558 zcmbVQ2|QHWAHR%sLS##}C@C};yWE>4>)0ZNmNCX;FlG!hgCs3Ni;`uYJbN2io>Cz* z_emmp%6nxgOQpR$Dn+aQcW4<}Uj5H}&fL#ke&65k{J!V+`<-(ZV=|M4e@(|lX9o{N~skcrS;AG?TxH&2I&_;?4 z{Nh0sH{w_Zr`*FDf*&k{Q|Jm%kX{zp^Fj|C8yA2AGG|*g9eu^^k$(r~%E`fi>m3V+ zg*x!7x8vLrBXv0YVHK$2b)M@KgX8LKvV8N}z)+Mbd@Ip_%Wlwza{G%xQ;D_EUon30V0!@k^m-S^ z{J9Oxxa$W$wH@JFzs}-{-7E)Ldqm+X`+88iDAy`YP=#A{#0~PaQ-WTgbfJ#g^H$ef zaFFJF2gnmADwh6s_Yfd}I1uTcKk%mrzdsEfYy}a2deU<^C8~z7xqLiW7<32Z;azt1 z)t5%#c>PCsru=wV2V3kn`1y*k84r4ru-u3JY%o)p0ayLAh`W!W2>VuF0L5BExnhGG z;ecE$^z;c81j>%p8$8djGN%wBAifW@PM!xBZp>v=2G?4Z z1FkC+7hpk+mzp0j;YQM~%(G-6`wkEeQZW+gXdkI+5UOS1mU=>aFzxk5_jQRM(M1)Zs zYz=;y!L5QsJf1;|B$DxX^eR#m!;KfRXgaWDTl`{QURX0C2_e)FYA{_O#a98j$A?7W zH!>*{d?0}s7EOo@!joyVFuI_-uPE|1h{OsZk__o6Ulejjj9}7*_?~0w!oDKB=gc62 zr4e0Z@)d;;ONxv#q>FyB1T$$7hDyaC8H7n8MPZ0EDg&{D6p8jG(U~MFBZPohDaNys zERaG*wDSVUfG++c5DEf7h(xoP)I~Z2#p$r_J|B3MUsM6gb*2My7Uq+h8JDN7fGi;U8H}cis$pKPpfrPq)U1u7U{gtWo{i8nw^Xs3X?QN4f@v7bnlnuP-MH0syb`@{$}< zH@+?$ih%6Lc}+y7qWO2u3XXuT;h)UwJhyU z^6F`zEaN`{O0KouQ{U99)7o@d@y$Ta zpEdF=7*csY5!()J-S++|S zUQmrP=*`7AItA84K9l)ioo>REyHoUL+K zB!~B@Rx2^3R?$5F=*5f)RlnXTu|YKFl9OYDw_L*R0MkUG%gX_>!eNyQG;<3AQ`J!+ zB0tZoHQhx}zM%B3ro5vLsXf4mAMb?{SC; zLV_w7Kbfus0DsbfOVq2@a!K*6ogIxLyHt<bMwEaRQ)WWiHU>>qdWUoGx`noih3j+R&Tx{ zdi&5TKXa8KtG(wZxGm!cZ`%th|Fze$xHeFW@z{OO^Zr^_j&S=iFuH1#Va7QU+R@L- zmS<-9SIS3^lqeP`;tmyb)Lc&#S)Y7e@ukg?*_R*HB*F@*#_u_QZR~qelARUW8~>pw z$LZ~h$gXD>p1vgxml9T&ryoC~i+#LuJ;}UW=H)3)AZB*8$`W<;e_;QU;1Lc{pLy94bBUW&_3rS+Ui$~m*~ zhIHaOqgFQ!%7QxGUdarhVrCl09OsymQC;>>9mSh5H-<;W2ap+K^}`v11#G`LWArC8 zvObln|FIJ2f35_R2?3g!5N51T{_8ex{|R@Lp+wR+w6pV(yJaykRmcO-W0dLWMn5>U z()(@19F2i|V;w;SkwuOZcdisXc<8l#%_iL@i`pLWl9%R77HVs)B~!8Ou(*s`KmxhH 
zH0DEDR;RGkW}#UbPMj@h@%UXb+GYhe^Dr@UEV|#>XS39GP11<$p-Ab&rhC!5ifa-iuQb+$rss@=C8GtzIBW{RF?U5= zZ5#&-$I;8Z9FOPw^|A$yhup*-S(jOK{)8D`Gu6~OVbpGKQ_6?YFzmMK_jUhn85QSI zVQ=P1d4!ag(;pX_-%h|*z|#MN3jVlW^7Ev)?Z4@A-axMT>T6n8siI~~(V-Y08|5Zt z8Tm!-sx5BW!aL>{g;g)b9CiC$vwpO4_TiDiC)tü(Guj3yqySXNHNF#AL|4l#J z18+HjiXVHMHvqm|Yqmdp=eodVC%?lXIYga6yWqLU3T)=pDaoEECSO|E?WlO>_<{Ep ztJvFe_nWr~8zp)di}-0K&vNoJIvr)=*dKbZg55eSn2pUlb}}&hv@Nw=;(_08dleDE zSx`gks{O>+`+0X#g5N9@Z*(>YRO+J7+jk1O~)l2{2ai5dK)maA@%e#76 zURsspw;=WL*1GZ1ipPhkFvch^(&FsuC;rzlg%S5}aijkIJ26@I+>p&>`xSN98gm27 zB*)uHXJ?Cf7R{^PxS=LI^^&-+=MF7jkyTjTr)x{x9&W=oIQ$)dy7|NQk-7l+E}dGb z)kSA+0Xf3PyN9m`mUpslzZc-~{iNPm&JW?57k=MQ*7^YYC;6UQKTXD+$VZBqo?sYK zfBQ#ur94b)x7$7naB6W6KSt3IyAEB@Y-@eEqa)#F#@!^ngTuqz=n-g9neA46yOPas z|Ggh{LYW14RkoGsI=rxP={bL*uE;|F#G@jmRJ4kRz1gCQkzxrSRfz~|^kt*;m;0G7 z+R}=Qh=-y~d$Z>=Pp;k0&e!UWF|XDiFiAS>AfM4f84Gc`eyAvBuwV2lr-<@(=;v%@bZu_8bYM!XG-;0?__d z$MQ$ff}e>Z(5)SZ#E*yB@>v7m{ho7dF||z9Of+?$?@s?J#U<~Zx@)sdL)v=mIrpwS z#hX-IZP{@%Ze;u!?C{J22zaIY9m@t197NFO^OPI72DZW z<(;NJJG;F$xNFF|9eZFtX_;}^8fHVcQnEO#DtJ9~6#*XcH0J)wz`J^Omk zkH}sdy1R{?EOd8=AlE20J?WmZdw)5RE?ZC1!suL$_xnAmRwfTcdF(_ruGl1vFyr%)oo}B{$O8JG*`Y=;2vkPY z)Fw*zXZ-%Du51-tGP$-5*B4*ITe;W8X!C%q#N$Z!Z5t=vR zzw?8AkVQ4X>z+6P-sSU|4FBnZ1n9}_!33@@hv+|O(Akr^NC!(JThhH7XbbcMMllniHAIeg1Ou#+=lxF}CnC>;c?TSTB=X~Z IKlSbZ0N*xzk^lez literal 0 HcmV?d00001 diff --git a/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/config.pbtxt b/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/config.pbtxt new file mode 100755 index 0000000000..a68828a7d2 --- /dev/null +++ b/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/config.pbtxt @@ -0,0 +1,34 @@ +name: "libtorch_multi_devices" +platform: "pytorch_libtorch" +max_batch_size: 8 + +input [ + { + name: "INPUT0" + data_type: TYPE_FP32 + dims: [ 16 ] + }, + { + name: "INPUT1" + data_type: TYPE_FP32 + dims: [ 16 ] + } +] +output [ + { + name: "OUTPUT__0" + data_type: TYPE_FP32 + dims: [ 4 ] + }, + { + name: "OUTPUT__1" + data_type: TYPE_FP32 + dims: [ 4 ] + } +] + +instance_group [ + { + kind: KIND_MODEL + } +] diff --git a/qa/L0_libtorch_instance_group_kind_model/test.sh b/qa/L0_libtorch_instance_group_kind_model/test.sh new file mode 100755 index 0000000000..bd0d249e7a --- /dev/null +++ b/qa/L0_libtorch_instance_group_kind_model/test.sh @@ -0,0 +1,166 @@ +#!/bin/bash +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION}
+if [ "$#" -ge 1 ]; then
+    REPO_VERSION=$1
+fi
+if [ -z "$REPO_VERSION" ]; then
+    echo -e "Repository version must be specified"
+    echo -e "\n***\n*** Test Failed\n***"
+    exit 1
+fi
+if [ ! -z "$TEST_REPO_ARCH" ]; then
+    REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH}
+fi
+
+pip3 uninstall -y torch
+pip3 install torch
+
+DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository
+SERVER=/opt/tritonserver/bin/tritonserver
+SERVER_ARGS="--model-repository=models --log-verbose=1"
+SERVER_LOG="./inference_server.log"
+
+CLIENT_PY=./client.py
+CLIENT_LOG="./client.log"
+EXPECTED_NUM_TESTS="1"
+TEST_RESULT_FILE='test_results.txt'
+
+source ../common/util.sh
+
+RET=0
+
+rm -f *.log *.txt
+
+mkdir -p models/libtorch_multi_gpu/1
+cp models/libtorch_multi_devices/config.pbtxt models/libtorch_multi_gpu/.
+(cd models/libtorch_multi_gpu && \
+    sed -i "s/name: \"libtorch_multi_devices\"/name: \"libtorch_multi_gpu\"/" config.pbtxt)
+
+# Generate the models which are partitioned across multiple devices
+set +e
+python3 gen_models.py >> $CLIENT_LOG 2>&1
+if [ $? -ne 0 ]; then
+    echo -e "\n***\n*** Error when generating models. \n***"
+    cat $CLIENT_LOG
+    RET=1
+fi
+set -e
+
+# Create the model that does not set instance_group_kind to 'KIND_MODEL'
+mkdir -p models/libtorch_instance_kind_err/1
+cp models/libtorch_multi_devices/config.pbtxt models/libtorch_instance_kind_err/.
+cp models/libtorch_multi_devices/1/model.pt models/libtorch_instance_kind_err/1/.
+(cd models/libtorch_instance_kind_err && \
+    sed -i "s/name: \"libtorch_multi_devices\"/name: \"libtorch_instance_kind_err\"/" config.pbtxt && \
+    sed -i "s/kind: KIND_MODEL/kind: KIND_GPU/" config.pbtxt)
+
+run_server
+if [ "$SERVER_PID" == "0" ]; then
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    cat $SERVER_LOG
+    exit 1
+fi
+
+set +e
+
+MESSAGE="INPUT0 device: cpu, INPUT1 device: cuda:3"
+export MODEL_NAME='libtorch_multi_devices'
+python3 $CLIENT_PY >> $CLIENT_LOG 2>&1
+if [ $? -ne 0 ]; then
+    echo -e "\n***\n*** Model $MODEL_NAME FAILED. \n***"
+    cat $CLIENT_LOG
+    RET=1
+else
+    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
+    if [ $? -ne 0 ]; then
+        cat $CLIENT_LOG
+        echo -e "\n***\n*** Test Result Verification Failed\n***"
+        RET=1
+    fi
+fi
+
+if grep "$MESSAGE" $SERVER_LOG; then
+    echo -e "Found \"$MESSAGE\"" >> $CLIENT_LOG
+else
+    echo -e "Not found \"$MESSAGE\"" >> $CLIENT_LOG
+    RET=1
+fi
+
+MESSAGE="INPUT0 device: cuda:2, INPUT1 device: cuda:0"
+export MODEL_NAME='libtorch_multi_gpu'
+python3 $CLIENT_PY >> $CLIENT_LOG 2>&1
+if [ $? -ne 0 ]; then
+    echo -e "\n***\n*** Model $MODEL_NAME FAILED. \n***"
+    cat $CLIENT_LOG
+    RET=1
+else
+    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
+    if [ $? 
-ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +if grep "$MESSAGE" $SERVER_LOG; then + echo -e "Found \"$MESSAGE\"" >> $CLIENT_LOG +else + echo -e "Not found \"$MESSAGE\"" >> $CLIENT_LOG + RET=1 +fi + +MESSAGE="INPUT0 device: cuda:2, INPUT1 device: cuda:0" +export MODEL_NAME='libtorch_instance_kind_err' +python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Model $MODEL_NAME FAILED. \n***" + cat $CLIENT_LOG + RET=1 +else + check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + cat $CLIENT_LOG + echo -e "\n***\n*** Test FAILED\n***" +fi + +exit $RET From 5e49dbb9d72af8b49d550fb2131ed09e4ba0e190 Mon Sep 17 00:00:00 2001 From: krishung5 Date: Thu, 18 May 2023 14:04:27 -0700 Subject: [PATCH 2/9] Remove unused item --- qa/L0_libtorch_instance_group_kind_model/client.py | 5 +---- qa/L0_libtorch_instance_group_kind_model/gen_models.py | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/qa/L0_libtorch_instance_group_kind_model/client.py b/qa/L0_libtorch_instance_group_kind_model/client.py index e882496e7a..6aeba4a201 100644 --- a/qa/L0_libtorch_instance_group_kind_model/client.py +++ b/qa/L0_libtorch_instance_group_kind_model/client.py @@ -30,7 +30,6 @@ sys.path.append("../common") -import torch import unittest import numpy as np import test_util as tu @@ -77,9 +76,7 @@ def test_infer(self): if model_name == "libtorch_instance_kind_err": with self.assertRaises(InferenceServerException) as ex: - results = triton_client.infer(model_name, - inputs, - outputs=outputs) + triton_client.infer(model_name, inputs, outputs=outputs) self.assertIn( "Expected all tensors to be on the same device, but found at least two devices", str(ex.exception)) diff --git a/qa/L0_libtorch_instance_group_kind_model/gen_models.py b/qa/L0_libtorch_instance_group_kind_model/gen_models.py index 500f8e6667..7c1be7912e 100755 --- a/qa/L0_libtorch_instance_group_kind_model/gen_models.py +++ b/qa/L0_libtorch_instance_group_kind_model/gen_models.py @@ -27,7 +27,6 @@ import torch import torch.nn as nn -from torch.nn.parallel import DataParallel class TestModel(nn.Module): From 1d2ec56906923db19b8bb9b66a6e46770801cd21 Mon Sep 17 00:00:00 2001 From: krishung5 Date: Mon, 5 Jun 2023 02:37:54 -0700 Subject: [PATCH 3/9] Update testing to verify the infer result --- .../client.py | 20 ++++++-------- .../gen_models.py | 17 +++++++++--- .../models/libtorch_multi_devices/1/model.pt | Bin 5558 -> 0 bytes .../test.sh | 25 +----------------- 4 files changed, 22 insertions(+), 40 deletions(-) delete mode 100644 qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/1/model.pt diff --git a/qa/L0_libtorch_instance_group_kind_model/client.py b/qa/L0_libtorch_instance_group_kind_model/client.py index 6aeba4a201..503fdf8b74 100644 --- a/qa/L0_libtorch_instance_group_kind_model/client.py +++ b/qa/L0_libtorch_instance_group_kind_model/client.py @@ -74,23 +74,19 @@ def test_infer(self): outputs.append( httpclient.InferRequestedOutput('OUTPUT__1', binary_data=True)) - if model_name == "libtorch_instance_kind_err": - with self.assertRaises(InferenceServerException) as ex: - triton_client.infer(model_name, inputs, outputs=outputs) - self.assertIn( - "Expected all tensors to be on the same device, 
but found at least two devices", - str(ex.exception)) - return - results = triton_client.infer(model_name, inputs, outputs=outputs) output0_data = results.as_numpy('OUTPUT__0') output1_data = results.as_numpy('OUTPUT__1') - # Only validate the shape, as the output will differ every time the - # model is compiled and used on different devices. - self.assertEqual(output0_data.shape, (1, 4)) - self.assertEqual(output1_data.shape, (1, 4)) + expected_output_0 = input0_data + input0_data + expected_output_1 = input1_data + input1_data + + self.assertEqual(output0_data.shape, (1, 16)) + self.assertEqual(output1_data.shape, (1, 16)) + + self.assertTrue(np.all(expected_output_0 == output0_data)) + self.assertTrue(np.all(expected_output_1 == output1_data)) if __name__ == '__main__': diff --git a/qa/L0_libtorch_instance_group_kind_model/gen_models.py b/qa/L0_libtorch_instance_group_kind_model/gen_models.py index 7c1be7912e..96d145f60a 100755 --- a/qa/L0_libtorch_instance_group_kind_model/gen_models.py +++ b/qa/L0_libtorch_instance_group_kind_model/gen_models.py @@ -29,14 +29,23 @@ import torch.nn as nn +class SumModule(nn.Module): + + def __init__(self): + super(SumModule, self).__init__() + + def forward(self, x): + return torch.sum(x, dim=1) + + class TestModel(nn.Module): def __init__(self, device1, device2): super(TestModel, self).__init__() self.device1 = device1 self.device2 = device2 - self.layers1 = nn.Sequential(nn.Linear(16, 4),).to(self.device1) - self.layers2 = nn.Sequential(nn.Linear(16, 4)).to(self.device2) + self.layers1 = SumModule().to(self.device1) + self.layers2 = SumModule().to(self.device2) def forward(self, INPUT0, INPUT1): INPUT0 = INPUT0.to(self.device1) @@ -44,8 +53,8 @@ def forward(self, INPUT0, INPUT1): print('INPUT0 device: {}, INPUT1 device: {}\n'.format( INPUT0.device, INPUT1.device)) - op0 = self.layers1(INPUT0 + INPUT0) - op1 = self.layers2(INPUT1 + INPUT1) + op0 = self.layers1(torch.stack([INPUT0, INPUT0], dim=1)) + op1 = self.layers2(torch.stack([INPUT1, INPUT1], dim=1)) return op0, op1 diff --git a/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/1/model.pt b/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/1/model.pt deleted file mode 100644 index 7fc331f64d56175e51934956d6f7ff2a6ce1f084..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5558 zcmbVQ2|QHWAHR%sLS##}C@C};yWE>4>)0ZNmNCX;FlG!hgCs3Ni;`uYJbN2io>Cz* z_emmp%6nxgOQpR$Dn+aQcW4<}Uj5H}&fL#ke&65k{J!V+`<-(ZV=|M4e@(|lX9o{N~skcrS;AG?TxH&2I&_;?4 z{Nh0sH{w_Zr`*FDf*&k{Q|Jm%kX{zp^Fj|C8yA2AGG|*g9eu^^k$(r~%E`fi>m3V+ zg*x!7x8vLrBXv0YVHK$2b)M@KgX8LKvV8N}z)+Mbd@Ip_%Wlwza{G%xQ;D_EUon30V0!@k^m-S^ z{J9Oxxa$W$wH@JFzs}-{-7E)Ldqm+X`+88iDAy`YP=#A{#0~PaQ-WTgbfJ#g^H$ef zaFFJF2gnmADwh6s_Yfd}I1uTcKk%mrzdsEfYy}a2deU<^C8~z7xqLiW7<32Z;azt1 z)t5%#c>PCsru=wV2V3kn`1y*k84r4ru-u3JY%o)p0ayLAh`W!W2>VuF0L5BExnhGG z;ecE$^z;c81j>%p8$8djGN%wBAifW@PM!xBZp>v=2G?4Z z1FkC+7hpk+mzp0j;YQM~%(G-6`wkEeQZW+gXdkI+5UOS1mU=>aFzxk5_jQRM(M1)Zs zYz=;y!L5QsJf1;|B$DxX^eR#m!;KfRXgaWDTl`{QURX0C2_e)FYA{_O#a98j$A?7W zH!>*{d?0}s7EOo@!joyVFuI_-uPE|1h{OsZk__o6Ulejjj9}7*_?~0w!oDKB=gc62 zr4e0Z@)d;;ONxv#q>FyB1T$$7hDyaC8H7n8MPZ0EDg&{D6p8jG(U~MFBZPohDaNys zERaG*wDSVUfG++c5DEf7h(xoP)I~Z2#p$r_J|B3MUsM6gb*2My7Uq+h8JDN7fGi;U8H}cis$pKPpfrPq)U1u7U{gtWo{i8nw^Xs3X?QN4f@v7bnlnuP-MH0syb`@{$}< zH@+?$ih%6Lc}+y7qWO2u3XXuT;h)UwJhyU z^6F`zEaN`{O0KouQ{U99)7o@d@y$Ta zpEdF=7*csY5!()J-S++|S zUQmrP=*`7AItA84K9l)ioo>REyHoUL+K zB!~B@Rx2^3R?$5F=*5f)RlnXTu|YKFl9OYDw_L*R0MkUG%gX_>!eNyQG;<3AQ`J!+ 
zB0tZoHQhx}zM%B3ro5vLsXf4mAMb?{SC; zLV_w7Kbfus0DsbfOVq2@a!K*6ogIxLyHt<bMwEaRQ)WWiHU>>qdWUoGx`noih3j+R&Tx{ zdi&5TKXa8KtG(wZxGm!cZ`%th|Fze$xHeFW@z{OO^Zr^_j&S=iFuH1#Va7QU+R@L- zmS<-9SIS3^lqeP`;tmyb)Lc&#S)Y7e@ukg?*_R*HB*F@*#_u_QZR~qelARUW8~>pw z$LZ~h$gXD>p1vgxml9T&ryoC~i+#LuJ;}UW=H)3)AZB*8$`W<;e_;QU;1Lc{pLy94bBUW&_3rS+Ui$~m*~ zhIHaOqgFQ!%7QxGUdarhVrCl09OsymQC;>>9mSh5H-<;W2ap+K^}`v11#G`LWArC8 zvObln|FIJ2f35_R2?3g!5N51T{_8ex{|R@Lp+wR+w6pV(yJaykRmcO-W0dLWMn5>U z()(@19F2i|V;w;SkwuOZcdisXc<8l#%_iL@i`pLWl9%R77HVs)B~!8Ou(*s`KmxhH zH0DEDR;RGkW}#UbPMj@h@%UXb+GYhe^Dr@UEV|#>XS39GP11<$p-Ab&rhC!5ifa-iuQb+$rss@=C8GtzIBW{RF?U5= zZ5#&-$I;8Z9FOPw^|A$yhup*-S(jOK{)8D`Gu6~OVbpGKQ_6?YFzmMK_jUhn85QSI zVQ=P1d4!ag(;pX_-%h|*z|#MN3jVlW^7Ev)?Z4@A-axMT>T6n8siI~~(V-Y08|5Zt z8Tm!-sx5BW!aL>{g;g)b9CiC$vwpO4_TiDiC)tü(Guj3yqySXNHNF#AL|4l#J z18+HjiXVHMHvqm|Yqmdp=eodVC%?lXIYga6yWqLU3T)=pDaoEECSO|E?WlO>_<{Ep ztJvFe_nWr~8zp)di}-0K&vNoJIvr)=*dKbZg55eSn2pUlb}}&hv@Nw=;(_08dleDE zSx`gks{O>+`+0X#g5N9@Z*(>YRO+J7+jk1O~)l2{2ai5dK)maA@%e#76 zURsspw;=WL*1GZ1ipPhkFvch^(&FsuC;rzlg%S5}aijkIJ26@I+>p&>`xSN98gm27 zB*)uHXJ?Cf7R{^PxS=LI^^&-+=MF7jkyTjTr)x{x9&W=oIQ$)dy7|NQk-7l+E}dGb z)kSA+0Xf3PyN9m`mUpslzZc-~{iNPm&JW?57k=MQ*7^YYC;6UQKTXD+$VZBqo?sYK zfBQ#ur94b)x7$7naB6W6KSt3IyAEB@Y-@eEqa)#F#@!^ngTuqz=n-g9neA46yOPas z|Ggh{LYW14RkoGsI=rxP={bL*uE;|F#G@jmRJ4kRz1gCQkzxrSRfz~|^kt*;m;0G7 z+R}=Qh=-y~d$Z>=Pp;k0&e!UWF|XDiFiAS>AfM4f84Gc`eyAvBuwV2lr-<@(=;v%@bZu_8bYM!XG-;0?__d z$MQ$ff}e>Z(5)SZ#E*yB@>v7m{ho7dF||z9Of+?$?@s?J#U<~Zx@)sdL)v=mIrpwS z#hX-IZP{@%Ze;u!?C{J22zaIY9m@t197NFO^OPI72DZW z<(;NJJG;F$xNFF|9eZFtX_;}^8fHVcQnEO#DtJ9~6#*XcH0J)wz`J^Omk zkH}sdy1R{?EOd8=AlE20J?WmZdw)5RE?ZC1!suL$_xnAmRwfTcdF(_ruGl1vFyr%)oo}B{$O8JG*`Y=;2vkPY z)Fw*zXZ-%Du51-tGP$-5*B4*ITe;W8X!C%q#N$Z!Z5t=vR zzw?8AkVQ4X>z+6P-sSU|4FBnZ1n9}_!33@@hv+|O(Akr^NC!(JThhH7XbbcMMllniHAIeg1Ou#+=lxF}CnC>;c?TSTB=X~Z IKlSbZ0N*xzk^lez diff --git a/qa/L0_libtorch_instance_group_kind_model/test.sh b/qa/L0_libtorch_instance_group_kind_model/test.sh index bd0d249e7a..d24e7b1dc2 100755 --- a/qa/L0_libtorch_instance_group_kind_model/test.sh +++ b/qa/L0_libtorch_instance_group_kind_model/test.sh @@ -57,6 +57,7 @@ RET=0 rm -f *.log *.txt +mkdir -p models/libtorch_multi_devices/1 mkdir -p models/libtorch_multi_gpu/1 cp models/libtorch_multi_devices/config.pbtxt models/libtorch_multi_gpu/. (cd models/libtorch_multi_gpu && \ @@ -72,14 +73,6 @@ if [ $? -ne 0 ]; then fi set -e -# Create the model that does not set instance_group_kind to 'KIND_MODEL' -mkdir -p models/libtorch_instance_kind_err/1 -cp models/libtorch_multi_devices/config.pbtxt models/libtorch_instance_kind_err/. -cp models/libtorch_multi_devices/1/model.pt models/libtorch_instance_kind_err/1/. -(cd models/libtorch_instance_kind_err && \ - sed -i "s/name: \"libtorch_multi_devices\"/name: \"libtorch_instance_kind_err\"/" config.pbtxt && \ - sed -i "s/kind: KIND_MODEL/kind: KIND_GPU/" config.pbtxt) - run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" @@ -135,22 +128,6 @@ else RET=1 fi -MESSAGE="INPUT0 device: cuda:2, INPUT1 device: cuda:0" -export MODEL_NAME='libtorch_instance_kind_err' -python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 -if [ $? -ne 0 ]; then - echo -e "\n***\n*** Model $MODEL_NAME FAILED. \n***" - cat $CLIENT_LOG - RET=1 -else - check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS - if [ $? 
-ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test Result Verification Failed\n***" - RET=1 - fi -fi - set -e kill $SERVER_PID From d674175918aba412b5c3f4cbce98979993a3e2a2 Mon Sep 17 00:00:00 2001 From: krishung5 Date: Mon, 5 Jun 2023 09:32:51 -0700 Subject: [PATCH 4/9] Add copyright --- .../libtorch_multi_devices/config.pbtxt | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/config.pbtxt b/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/config.pbtxt index a68828a7d2..2df1658e38 100755 --- a/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/config.pbtxt +++ b/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/config.pbtxt @@ -1,3 +1,29 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ name: "libtorch_multi_devices" platform: "pytorch_libtorch" max_batch_size: 8 From f7e84397bf35be9c4e88fe08837d777041efade4 Mon Sep 17 00:00:00 2001 From: krishung5 Date: Mon, 5 Jun 2023 09:40:03 -0700 Subject: [PATCH 5/9] Remove unused import --- qa/L0_libtorch_instance_group_kind_model/client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/qa/L0_libtorch_instance_group_kind_model/client.py b/qa/L0_libtorch_instance_group_kind_model/client.py index 503fdf8b74..6b4a87bddd 100644 --- a/qa/L0_libtorch_instance_group_kind_model/client.py +++ b/qa/L0_libtorch_instance_group_kind_model/client.py @@ -35,7 +35,6 @@ import test_util as tu import tritonclient.http as httpclient -from tritonclient.utils import InferenceServerException # By default, find tritonserver on "localhost", but can be overridden # with TRITONSERVER_IPADDR envvar From 824ecb4a6d785664ee8068620de6b03aa4382492 Mon Sep 17 00:00:00 2001 From: krishung5 Date: Wed, 7 Jun 2023 03:13:09 -0700 Subject: [PATCH 6/9] Update pip install --- qa/L0_libtorch_instance_group_kind_model/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/L0_libtorch_instance_group_kind_model/test.sh b/qa/L0_libtorch_instance_group_kind_model/test.sh index d24e7b1dc2..4673dea7b5 100755 --- a/qa/L0_libtorch_instance_group_kind_model/test.sh +++ b/qa/L0_libtorch_instance_group_kind_model/test.sh @@ -39,7 +39,7 @@ if [ ! -z "$TEST_REPO_ARCH" ]; then fi pip3 uninstall -y torch -pip3 install torch +pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository SERVER=/opt/tritonserver/bin/tritonserver From 2e6a4faf56f8cd9ad13c92cf6cd8ab760e67b449 Mon Sep 17 00:00:00 2001 From: krishung5 Date: Thu, 8 Jun 2023 15:44:15 -0700 Subject: [PATCH 7/9] Update the model to use the same add sub logic --- .../client.py | 4 +- .../gen_models.py | 39 ++++++++++------- .../config.pbtxt | 2 +- .../test.sh | 42 +++++++++++-------- 4 files changed, 52 insertions(+), 35 deletions(-) rename qa/L0_libtorch_instance_group_kind_model/models/{libtorch_multi_devices => libtorch_multi_device}/config.pbtxt (98%) diff --git a/qa/L0_libtorch_instance_group_kind_model/client.py b/qa/L0_libtorch_instance_group_kind_model/client.py index 6b4a87bddd..c3c8289f8a 100644 --- a/qa/L0_libtorch_instance_group_kind_model/client.py +++ b/qa/L0_libtorch_instance_group_kind_model/client.py @@ -78,8 +78,8 @@ def test_infer(self): output0_data = results.as_numpy('OUTPUT__0') output1_data = results.as_numpy('OUTPUT__1') - expected_output_0 = input0_data + input0_data - expected_output_1 = input1_data + input1_data + expected_output_0 = input0_data + input1_data + expected_output_1 = input0_data - input1_data self.assertEqual(output0_data.shape, (1, 16)) self.assertEqual(output1_data.shape, (1, 16)) diff --git a/qa/L0_libtorch_instance_group_kind_model/gen_models.py b/qa/L0_libtorch_instance_group_kind_model/gen_models.py index 96d145f60a..a1ca6efa91 100755 --- a/qa/L0_libtorch_instance_group_kind_model/gen_models.py +++ b/qa/L0_libtorch_instance_group_kind_model/gen_models.py @@ -30,36 +30,47 @@ class SumModule(nn.Module): - def __init__(self): super(SumModule, self).__init__() - def forward(self, x): - return torch.sum(x, dim=1) + def forward(self, INPUT0, INPUT1): + print('SumModule - INPUT0 device: {}, INPUT1 device: {}\n'.format( + INPUT0.device, INPUT1.device)) + return INPUT0 + INPUT1 + +class DiffModule(nn.Module): + def __init__(self): + super(DiffModule, self).__init__() + + def 
forward(self, INPUT0, INPUT1):
+        print('DiffModule - INPUT0 device: {}, INPUT1 device: {}\n'.format(
+            INPUT0.device, INPUT1.device))
+        return INPUT0 - INPUT1
 
 
 class TestModel(nn.Module):
 
-    def __init__(self, device1, device2):
+    def __init__(self, device0, device1):
         super(TestModel, self).__init__()
+        self.device0 = device0
         self.device1 = device1
-        self.device2 = device2
-        self.layers1 = SumModule().to(self.device1)
-        self.layers2 = SumModule().to(self.device2)
+
+        self.layers1 = SumModule().to(self.device0)
+        self.layers2 = DiffModule().to(self.device1)
 
     def forward(self, INPUT0, INPUT1):
-        INPUT0 = INPUT0.to(self.device1)
-        INPUT1 = INPUT1.to(self.device2)
-        print('INPUT0 device: {}, INPUT1 device: {}\n'.format(
-            INPUT0.device, INPUT1.device))
+        INPUT0_0 = INPUT0.to(self.device0)
+        INPUT0_1 = INPUT0.to(self.device1)
+        INPUT1_0 = INPUT1.to(self.device0)
+        INPUT1_1 = INPUT1.to(self.device1)
 
-        op0 = self.layers1(torch.stack([INPUT0, INPUT0], dim=1))
-        op1 = self.layers2(torch.stack([INPUT1, INPUT1], dim=1))
+        op0 = self.layers1(INPUT0_0, INPUT1_0)
+        op1 = self.layers2(INPUT0_1, INPUT1_1)
         return op0, op1
 
 
 devices = [("cuda:2", "cuda:0"), ("cpu", "cuda:3")]
-model_names = ["libtorch_multi_gpu", "libtorch_multi_devices"]
+model_names = ["libtorch_multi_gpu", "libtorch_multi_device"]
 
 for device_pair, model_name in zip(devices, model_names):
     model = TestModel(device_pair[0], device_pair[1])
diff --git a/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/config.pbtxt b/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_device/config.pbtxt
similarity index 98%
rename from qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/config.pbtxt
rename to qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_device/config.pbtxt
index 2df1658e38..bf8ca0d649 100755
--- a/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_devices/config.pbtxt
+++ b/qa/L0_libtorch_instance_group_kind_model/models/libtorch_multi_device/config.pbtxt
@@ -24,7 +24,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-name: "libtorch_multi_devices"
+name: "libtorch_multi_device"
 platform: "pytorch_libtorch"
 max_batch_size: 8
 
diff --git a/qa/L0_libtorch_instance_group_kind_model/test.sh b/qa/L0_libtorch_instance_group_kind_model/test.sh
index 4673dea7b5..c91095e8ed 100755
--- a/qa/L0_libtorch_instance_group_kind_model/test.sh
+++ b/qa/L0_libtorch_instance_group_kind_model/test.sh
@@ -57,11 +57,11 @@ RET=0
 
 rm -f *.log *.txt
 
-mkdir -p models/libtorch_multi_devices/1
+mkdir -p models/libtorch_multi_device/1
 mkdir -p models/libtorch_multi_gpu/1
-cp models/libtorch_multi_devices/config.pbtxt models/libtorch_multi_gpu/.
+cp models/libtorch_multi_device/config.pbtxt models/libtorch_multi_gpu/.
 (cd models/libtorch_multi_gpu && \
-    sed -i "s/name: \"libtorch_multi_devices\"/name: \"libtorch_multi_gpu\"/" config.pbtxt)
+    sed -i "s/name: \"libtorch_multi_device\"/name: \"libtorch_multi_gpu\"/" config.pbtxt)
 
 # Generate the models which are partitioned across multiple devices
 set +e
@@ -82,8 +82,7 @@
 
 set +e
 
-MESSAGE="INPUT0 device: cpu, INPUT1 device: cuda:3"
-export MODEL_NAME='libtorch_multi_devices'
+export MODEL_NAME='libtorch_multi_device'
 python3 $CLIENT_PY >> $CLIENT_LOG 2>&1
 if [ $? -ne 0 ]; then
     echo -e "\n***\n*** Model $MODEL_NAME FAILED. 
\n***" @@ -98,14 +97,17 @@ else fi fi -if grep "$MESSAGE" $SERVER_LOG; then - echo -e "Found \"$MESSAGE\"" >> $CLIENT_LOG -else - echo -e "Not found \"$MESSAGE\"" >> $CLIENT_LOG - RET=1 -fi +MESSAGES=("SumModule - INPUT0 device: cpu, INPUT1 device: cpu" + "DiffModule - INPUT0 device: cuda:3, INPUT1 device: cuda:3") +for MESSAGE in "${MESSAGES[@]}"; do + if grep -q "$MESSAGE" "$SERVER_LOG"; then + echo -e "Found \"$MESSAGE\"" >> "$CLIENT_LOG" + else + echo -e "Not found \"$MESSAGE\"" >> "$CLIENT_LOG" + RET=1 + fi +done -MESSAGE="INPUT0 device: cuda:2, INPUT1 device: cuda:0" export MODEL_NAME='libtorch_multi_gpu' python3 $CLIENT_PY >> $CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then @@ -121,12 +123,16 @@ else fi fi -if grep "$MESSAGE" $SERVER_LOG; then - echo -e "Found \"$MESSAGE\"" >> $CLIENT_LOG -else - echo -e "Not found \"$MESSAGE\"" >> $CLIENT_LOG - RET=1 -fi +MESSAGES=("SumModule - INPUT0 device: cuda:2, INPUT1 device: cuda:2" + "DiffModule - INPUT0 device: cuda:0, INPUT1 device: cuda:0") +for MESSAGE in "${MESSAGES[@]}"; do + if grep -q "$MESSAGE" "$SERVER_LOG"; then + echo -e "Found \"$MESSAGE\"" >> "$CLIENT_LOG" + else + echo -e "Not found \"$MESSAGE\"" >> "$CLIENT_LOG" + RET=1 + fi +done set -e From acb4752bd12f2b85f0651e971a6782fdd3698f07 Mon Sep 17 00:00:00 2001 From: krishung5 Date: Thu, 8 Jun 2023 18:21:08 -0700 Subject: [PATCH 8/9] Add torch multi-gpu and multi-device models to L0_io --- qa/L0_io/test.sh | 58 +++++++++++++------ .../gen_models.py | 26 +++++---- 2 files changed, 56 insertions(+), 28 deletions(-) diff --git a/qa/L0_io/test.sh b/qa/L0_io/test.sh index b8830d9ee4..9bce8a55ac 100755 --- a/qa/L0_io/test.sh +++ b/qa/L0_io/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -47,13 +47,11 @@ MODELSDIR=`pwd`/models DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository ENSEMBLEDIR=/data/inferenceserver/${REPO_VERSION}/qa_ensemble_model_repository/qa_model_repository -export CUDA_VISIBLE_DEVICES=0,1 - # Must explicitly set LD_LIBRARY_PATH so that IO_TEST_UTIL can find # libtritonserver.so. LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH -rm -f $CLIENT_LOG.* +rm -f $CLIENT_LOG* # PyTorch is required for the Python backend dlpack add sub models pip3 install torch==1.13.0+cu117 -f https://download.pytorch.org/whl/torch_stable.html @@ -148,23 +146,47 @@ cp -r $MODELSDIR/fan_graphdef_float32_float32_float32 $MODELSDIR/fan_${full} && cp -r $ENSEMBLEDIR/nop_TYPE_FP32_-1 $MODELSDIR/. && \ mkdir -p $MODELSDIR/nop_TYPE_FP32_-1/1 +# prepare libtorch multi-device and multi-gpu models +cp -r ../L0_libtorch_instance_group_kind_model/models/libtorch_multi_device $MODELSDIR/. +cp ../L0_libtorch_instance_group_kind_model/gen_models.py ./gen_libtorch_model.py +mkdir -p $MODELSDIR/libtorch_multi_gpu/1 +cp $MODELSDIR/libtorch_multi_device/config.pbtxt models/libtorch_multi_gpu/. +(cd $MODELSDIR/libtorch_multi_gpu && \ + sed -i "s/name: \"libtorch_multi_device\"/name: \"libtorch_multi_gpu\"/" config.pbtxt) + +set +e +python3 gen_libtorch_model.py >> $CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Error when generating libtorch models. 
\n***" + cat $CLIENT_LOG + RET=1 +fi +set -e + +TRIALS="graphdef savedmodel onnx libtorch plan python python_dlpack libtorch_multi_gpu libtorch_multi_device" + for input_device in -1 0 1; do for output_device in -1 0 1; do - for trial in graphdef savedmodel onnx libtorch plan python python_dlpack; do + for trial in ${TRIALS}; do # TensorRT Plan should only be deployed on GPU device model_devices="-1 0 1" && [[ "$trial" == "plan" ]] && model_devices="0 1" + full=${trial}_float32_float32_float32 && [[ "$trial" == "libtorch_multi"* ]] && full=${trial} + for model_device in $model_devices; do - full=${trial}_float32_float32_float32 full_log=$CLIENT_LOG.$full.$input_device.$output_device.$model_device host_policy=cpu if [ "$model_device" == "-1" ]; then - (cd $MODELSDIR/${full} && \ - sed -i "s/instance_group.*/instance_group [{ kind: KIND_CPU }]/" config.pbtxt) + if [[ "$trial" != "libtorch_multi"* ]]; then + (cd $MODELSDIR/${full} && \ + sed -i "s/instance_group.*/instance_group [{ kind: KIND_CPU }]/" config.pbtxt) + fi else host_policy=gpu_${model_device} - (cd $MODELSDIR/${full} && \ - sed -i "s/instance_group.*/instance_group [{ kind: KIND_GPU, gpus: [${model_device}] }]/" config.pbtxt) + if [[ "$trial" != "libtorch_multi"* ]]; then + (cd $MODELSDIR/${full} && \ + sed -i "s/instance_group.*/instance_group [{ kind: KIND_GPU, gpus: [${model_device}] }]/" config.pbtxt) + fi fi set +e @@ -196,14 +218,16 @@ for input_device in -1 0 1; do set -e # ensemble - set +e - $IO_TEST_UTIL -i $input_device -o $output_device -r $MODELSDIR -m fan_$full >>$full_log.ensemble 2>&1 - if [ $? -ne 0 ]; then - cat $full_log.ensemble - echo -e "\n***\n*** Test Failed\n***" - RET=1 + if [[ "$trial" != "libtorch_multi"* ]]; then + set +e + $IO_TEST_UTIL -i $input_device -o $output_device -r $MODELSDIR -m fan_$full >>$full_log.ensemble 2>&1 + if [ $? 
-ne 0 ]; then + cat $full_log.ensemble + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi + set -e fi - set -e done done diff --git a/qa/L0_libtorch_instance_group_kind_model/gen_models.py b/qa/L0_libtorch_instance_group_kind_model/gen_models.py index a1ca6efa91..7cd6c5687e 100755 --- a/qa/L0_libtorch_instance_group_kind_model/gen_models.py +++ b/qa/L0_libtorch_instance_group_kind_model/gen_models.py @@ -30,19 +30,28 @@ class SumModule(nn.Module): - def __init__(self): + + def __init__(self, device): super(SumModule, self).__init__() + self.device = device def forward(self, INPUT0, INPUT1): + INPUT0 = INPUT0.to(self.device) + INPUT1 = INPUT1.to(self.device) print('SumModule - INPUT0 device: {}, INPUT1 device: {}\n'.format( INPUT0.device, INPUT1.device)) return INPUT0 + INPUT1 + class DiffModule(nn.Module): - def __init__(self): + + def __init__(self, device): super(DiffModule, self).__init__() + self.device = device def forward(self, INPUT0, INPUT1): + INPUT0 = INPUT0.to(self.device) + INPUT1 = INPUT1.to(self.device) print('DiffModule - INPUT0 device: {}, INPUT1 device: {}\n'.format( INPUT0.device, INPUT1.device)) return INPUT0 - INPUT1 @@ -55,17 +64,12 @@ def __init__(self, device0, device1): self.device0 = device0 self.device1 = device1 - self.layers1 = SumModule().to(self.device0) - self.layers2 = DiffModule().to(self.device1) + self.layer1 = SumModule(self.device0) + self.layer2 = DiffModule(self.device1) def forward(self, INPUT0, INPUT1): - INPUT0_0 = INPUT0.to(self.device0) - INPUT0_1 = INPUT0.to(self.device1) - INPUT1_0 = INPUT1.to(self.device0) - INPUT1_1 = INPUT1.to(self.device1) - - op0 = self.layers1(INPUT0_0, INPUT1_0) - op1 = self.layers2(INPUT0_1, INPUT1_1) + op0 = self.layer1(INPUT0, INPUT1) + op1 = self.layer2(INPUT0, INPUT1) return op0, op1 From 872b2c9ddea993af242574e5a5f428b6a4b7a3e0 Mon Sep 17 00:00:00 2001 From: krishung5 Date: Fri, 9 Jun 2023 14:34:05 -0700 Subject: [PATCH 9/9] Fix up model version --- qa/L0_io/test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qa/L0_io/test.sh b/qa/L0_io/test.sh index 9bce8a55ac..3a9bb76b31 100755 --- a/qa/L0_io/test.sh +++ b/qa/L0_io/test.sh @@ -149,8 +149,9 @@ cp -r $ENSEMBLEDIR/nop_TYPE_FP32_-1 $MODELSDIR/. && \ # prepare libtorch multi-device and multi-gpu models cp -r ../L0_libtorch_instance_group_kind_model/models/libtorch_multi_device $MODELSDIR/. cp ../L0_libtorch_instance_group_kind_model/gen_models.py ./gen_libtorch_model.py +mkdir -p $MODELSDIR/libtorch_multi_device/1 mkdir -p $MODELSDIR/libtorch_multi_gpu/1 -cp $MODELSDIR/libtorch_multi_device/config.pbtxt models/libtorch_multi_gpu/. +cp $MODELSDIR/libtorch_multi_device/config.pbtxt $MODELSDIR/libtorch_multi_gpu/. (cd $MODELSDIR/libtorch_multi_gpu && \ sed -i "s/name: \"libtorch_multi_device\"/name: \"libtorch_multi_gpu\"/" config.pbtxt) @@ -164,7 +165,6 @@ fi set -e TRIALS="graphdef savedmodel onnx libtorch plan python python_dlpack libtorch_multi_gpu libtorch_multi_device" - for input_device in -1 0 1; do for output_device in -1 0 1; do for trial in ${TRIALS}; do
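
Note: the snippet below is an illustrative addition, not part of the patch series. It sketches a quick local sanity check of the TorchScript artifact that gen_models.py produces, bypassing tritonserver entirely. It assumes gen_models.py has already been run from the test directory and that the devices baked into the chosen model (e.g. "cpu" and "cuda:3" for libtorch_multi_device, per the final gen_models.py) are available on the machine; torch.jit.load will fail otherwise.

import torch

# Load the saved multi-device TorchScript model produced by gen_models.py.
model = torch.jit.load("models/libtorch_multi_device/1/model.pt")

# Same shapes/dtypes as client.py: two (1, 16) FP32 tensors.
input0 = torch.arange(0, 16, dtype=torch.float32).unsqueeze(0)
input1 = torch.arange(32, 48, dtype=torch.float32).unsqueeze(0)

# After PATCH 7, OUTPUT__0 = INPUT0 + INPUT1 and OUTPUT__1 = INPUT0 - INPUT1.
# Each output comes back on the device its submodule ran on, hence .cpu()
# before comparing against the CPU-side reference values.
output0, output1 = model(input0, input1)
assert torch.equal(output0.cpu(), input0 + input1)
assert torch.equal(output1.cpu(), input0 - input1)

Exact float equality is reasonable here because the model only performs elementwise add/subtract, which is exactly rounded in FP32 regardless of the device the submodule runs on.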