From 53f8a755dbbbfb01fe1ac9f1eb71d10df2d4c372 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Mon, 6 Nov 2023 00:06:20 -0500
Subject: [PATCH 1/9] Add testing for single buffer

---
 qa/L0_implicit_state/test.sh   |  2 ++
 qa/L0_sequence_batcher/test.sh | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/qa/L0_implicit_state/test.sh b/qa/L0_implicit_state/test.sh
index b5b5034991..c878fd3544 100755
--- a/qa/L0_implicit_state/test.sh
+++ b/qa/L0_implicit_state/test.sh
@@ -46,9 +46,11 @@ export BACKENDS
 export IMPLICIT_STATE=1
 INITIAL_STATE_ZERO=${INITIAL_STATE_ZERO:="0"}
 INITIAL_STATE_FILE=${INITIAL_STATE_FILE:="0"}
+SINGLE_STATE_BUFFER=${SINGLE_STATE_BUFFER:="0"}

 export INITIAL_STATE_ZERO
 export INITIAL_STATE_FILE
+export SINGLE_STATE_BUFFER

 MODELDIR=${MODELDIR:=`pwd`/models}
 TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"}
diff --git a/qa/L0_sequence_batcher/test.sh b/qa/L0_sequence_batcher/test.sh
index 0889a602e1..679df85b51 100755
--- a/qa/L0_sequence_batcher/test.sh
+++ b/qa/L0_sequence_batcher/test.sh
@@ -169,6 +169,10 @@ export INITIAL_STATE_FILE
 INITIAL_STATE_ZERO=${INITIAL_STATE_ZERO:="0"}
 export INITIAL_STATE_ZERO

+# If USE_SINGLE_BUFFER is not specified, set to 0
+USE_SINGLE_BUFFER=${USE_SINGLE_BUFFER:="0"}
+export USE_SINGLE_BUFFER
+
 # Setup non-variable-size model repositories. The same models are in each
 # repository but they are configured as:
 #   models0 - four instances with non-batching model
@@ -332,6 +336,20 @@ for MODEL in $MODELS; do
     (cd queue_delay_models/$(basename $MODEL)_full && \
         sed -i "s/$(basename $MODEL)/$(basename $MODEL)_full/" config.pbtxt && \
         sed -i "s/minimum_slot_utilization: 0/minimum_slot_utilization: 1/" config.pbtxt)
+
+    if [ "$USE_SINGLE_BUFFER" == "1" ] && [ "$IMPLICIT_STATE" == "1" ]; then
+      SED_REPLACE_PATTERN="N;N;N;N;N;/state.*dims:.*/a use_single_buffer: true"
+      (cd models1/$(basename $MODEL) && \
+          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+      (cd models2/$(basename $MODEL) && \
+          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+      (cd models4/$(basename $MODEL) && \
+          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+      (cd queue_delay_models/$(basename $MODEL)_full && \
+          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+      (cd queue_delay_models/$(basename $MODEL)_half && \
+          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+    fi
   else
     cp -r $MODEL queue_delay_models/$(basename $MODEL)_full && \
     (cd queue_delay_models/$(basename $MODEL)_full && \

From c23c58667932399184d667b4074fc0d0193ff58f Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Mon, 6 Nov 2023 10:32:45 -0500
Subject: [PATCH 2/9] Add testing for implicit state with buffer growth

---
 qa/L0_implicit_state/implicit_state.py        |  56 ++++++
 qa/L0_implicit_state/test.sh                  |   6 +-
 qa/L0_sequence_batcher/test.sh                |   2 +
 src/test/implicit_state/src/implicit_state.cc | 161 ++++++++++++++++--
 4 files changed, 209 insertions(+), 16 deletions(-)

diff --git a/qa/L0_implicit_state/implicit_state.py b/qa/L0_implicit_state/implicit_state.py
index 8d56a2c570..6ba24dfc0f 100755
--- a/qa/L0_implicit_state/implicit_state.py
+++ b/qa/L0_implicit_state/implicit_state.py
@@ -83,6 +83,62 @@ def test_wrong_implicit_state_name(self):
         err_str = str(e.exception).lower()
         self.assertIn("state 'undefined_state' is not a valid state name", err_str)

+    def test_implicit_state_single_buffer(self):
+        triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")
+        inputs = []
+        inputs.append(tritonhttpclient.InferInput("INPUT", [1], "INT32"))
+        inputs.append(tritonhttpclient.InferInput("TEST_CASE", [1], "INT32"))
+        inputs[0].set_data_from_numpy(np.random.randint(5, size=[1], dtype=np.int32))
+        inputs[1].set_data_from_numpy(np.asarray([2], dtype=np.int32))
+
+        triton_client.infer(
+            model_name="single_state_buffer",
+            inputs=inputs,
+            sequence_id=2,
+            sequence_start=True,
+            sequence_end=False,
+        )
+
+        triton_client.infer(
+            model_name="single_state_buffer",
+            inputs=inputs,
+            sequence_id=2,
+            sequence_start=False,
+            sequence_end=True,
+        )
+
+    def test_implicit_state_growable_memory(self):
+        triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")
+        inputs = []
+        inputs.append(tritonhttpclient.InferInput("INPUT", [1], "INT32"))
+        inputs.append(tritonhttpclient.InferInput("TEST_CASE", [1], "INT32"))
+        inputs[0].set_data_from_numpy(np.random.randint(5, size=[1], dtype=np.int32))
+        inputs[1].set_data_from_numpy(np.asarray([3], dtype=np.int32))
+
+        triton_client.infer(
+            model_name="growable_memory",
+            inputs=inputs,
+            sequence_id=2,
+            sequence_start=True,
+            sequence_end=False,
+        )
+
+        triton_client.infer(
+            model_name="growable_memory",
+            inputs=inputs,
+            sequence_id=2,
+            sequence_start=False,
+            sequence_end=False,
+        )
+
+        triton_client.infer(
+            model_name="growable_memory",
+            inputs=inputs,
+            sequence_id=2,
+            sequence_start=False,
+            sequence_end=True,
+        )
+
     def test_no_update(self):
         # Test implicit state without updating any state
         triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")
diff --git a/qa/L0_implicit_state/test.sh b/qa/L0_implicit_state/test.sh
index c878fd3544..39984406df 100755
--- a/qa/L0_implicit_state/test.sh
+++ b/qa/L0_implicit_state/test.sh
@@ -62,10 +62,14 @@ source ../common/util.sh
 cp ./libtriton_implicit_state.so models/no_implicit_state/
 cp ./libtriton_implicit_state.so models/no_state_update/
 cp ./libtriton_implicit_state.so models/wrong_internal_state/
+cp ./libtriton_implicit_state.so models/single_state_buffer/
+cp ./libtriton_implicit_state.so models/growable_memory/

 mkdir -p models/no_implicit_state/1/
 mkdir -p models/no_state_update/1/
 mkdir -p models/wrong_internal_state/1/
+mkdir -p models/single_state_buffer/1/
+mkdir -p models/growable_memory/1/

 for BACKEND in $BACKENDS; do
     dtype="int32"
@@ -94,7 +98,7 @@ done
 CLIENT_LOG=`pwd`/client.log
 SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR}"
 IMPLICIT_STATE_CLIENT='implicit_state.py'
-EXPECTED_TEST_NUM=5
+EXPECTED_TEST_NUM=6
 rm -rf $CLIENT_LOG

 run_server
diff --git a/qa/L0_sequence_batcher/test.sh b/qa/L0_sequence_batcher/test.sh
index 679df85b51..8878e94e1f 100755
--- a/qa/L0_sequence_batcher/test.sh
+++ b/qa/L0_sequence_batcher/test.sh
@@ -339,6 +339,8 @@ for MODEL in $MODELS; do

     if [ "$USE_SINGLE_BUFFER" == "1" ] && [ "$IMPLICIT_STATE" == "1" ]; then
       SED_REPLACE_PATTERN="N;N;N;N;N;/state.*dims:.*/a use_single_buffer: true"
+      (cd models0/$(basename $MODEL) && \
+          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
       (cd models1/$(basename $MODEL) && \
           sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
       (cd models2/$(basename $MODEL) && \
           sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
diff --git a/src/test/implicit_state/src/implicit_state.cc b/src/test/implicit_state/src/implicit_state.cc
index 74c3142dea..3a63dede10 100644
--- a/src/test/implicit_state/src/implicit_state.cc
+++ b/src/test/implicit_state/src/implicit_state.cc
@@ -48,8 +48,15 @@ namespace triton { namespace backend { namespace implicit {
 // for a non existent state or a model that doesn't have states section in
 // sequence batching.
 //
-// * STATE_UPDATE_FALSE = 3: Tests not calling the state update and expecting
+// * STATE_UPDATE_FALSE = 1: Tests not calling the state update and expecting
 // the implicit state to not be updated.
+//
+// * USE_SINGLE_STATE_BUFFER = 2: For this scenario we will be using the same
+// buffer for both input and output state. In total there will be 3 requests
+// sent in a sequence.
+//
+// * USE_GROWABLE_STATE_BUFFER = 3: In this test case we use growable state
+// buffer. currently, growable state buffer only supports CUDA memory.

 #define GUARDED_RESPOND_IF_ERROR(RESPONSES, IDX, REQUEST, X) \
   do { \
@@ -179,6 +186,10 @@ class ModelInstanceState : public BackendModelInstance {
   // Get the state of the model that corresponds to this instance.
   ModelState* StateForModel() const { return model_state_; }

+  void* state_ = nullptr;
+
+  // Index of the request in the sequence
+  uint32_t request_index_ = 0;

  private:
   ModelInstanceState(
@@ -365,14 +376,6 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
   RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(
       instance, reinterpret_cast<void*>(instance_state)));

-  // Because this backend just copies IN -> OUT and requires that
-  // input and output be in CPU memory, we fail if a GPU instances is
-  // requested.
-  RETURN_ERROR_IF_FALSE(
-      instance_state->Kind() == TRITONSERVER_INSTANCEGROUPKIND_CPU,
-      TRITONSERVER_ERROR_INVALID_ARG,
-      std::string("'implicit_state' backend only supports CPU instances"));
-
   return nullptr;  // success
 }

@@ -634,6 +637,12 @@ TRITONBACKEND_ModelInstanceExecute(
       continue;
     }

+    const float* lstart_buffer = reinterpret_cast<const float*>(start_buffer);
+    if (*lstart_buffer == 1) {
+      instance_state->request_index_ = 0;
+      instance_state->state_ = nullptr;
+    }
+
     const void* end_buffer = nullptr;
     GUARDED_RESPOND_IF_ERROR(
         responses, r, request,
@@ -768,7 +777,24 @@ TRITONBACKEND_ModelInstanceExecute(
             input_state, 0 /* input_buffer_count */, &input_state_buffer,
             &buffer_byte_size, &input_memory_type, &input_memory_type_id));
     if ((responses[r] == nullptr) ||
-        (input_memory_type == TRITONSERVER_MEMORY_GPU)) {
+        (test_case_buffer_int == 3 &&
+         input_memory_type != TRITONSERVER_MEMORY_GPU)) {
+      GUARDED_RESPOND_IF_ERROR(
+          responses, r, request,
+          TRITONSERVER_ErrorNew(
+              TRITONSERVER_ERROR_UNSUPPORTED,
+              "growable memory should always provide memory in GPU"));
+      LOG_MESSAGE(
+          TRITONSERVER_LOG_ERROR,
+          (std::string("request ") + std::to_string(r) +
+           ": failed to get input buffer in GPU memory, error "
+           "response sent")
+              .c_str());
+      continue;
+    } else if (
+        (responses[r] == nullptr) ||
+        (input_memory_type == TRITONSERVER_MEMORY_GPU &&
+         test_case_buffer_int != 3)) {
       GUARDED_RESPOND_IF_ERROR(
           responses, r, request,
           TRITONSERVER_ErrorNew(
@@ -783,9 +809,32 @@ TRITONBACKEND_ModelInstanceExecute(
       continue;
     }

-      const int32_t ipbuffer_state =
-          *reinterpret_cast<const int32_t*>(input_state_buffer);
-      ipbuffer_state_int = ipbuffer_state;
+      // When using single state buffer, input/output tensors should point to
+      // the buffer.
+      if ((test_case_buffer_int == 2 || test_case_buffer_int == 3) &&
+          instance_state->state_ != nullptr) {
+        if (input_state_buffer != instance_state->state_) {
+          GUARDED_RESPOND_IF_ERROR(
+              responses, r, request,
+              TRITONSERVER_ErrorNew(
+                  TRITONSERVER_ERROR_UNSUPPORTED,
+                  "Input and output state are using different buffers."));
+          LOG_MESSAGE(
+              TRITONSERVER_LOG_ERROR,
+              (std::string("request ") + std::to_string(r) +
+               ": input and output state are using different buffers, error "
+               "response sent")
+                  .c_str());
+          continue;
+        }
+      }
+
+      if (test_case_buffer_int == 2 || test_case_buffer_int == 1 ||
+          test_case_buffer_int == 0) {
+        const int32_t ipbuffer_state =
+            *reinterpret_cast<const int32_t*>(input_state_buffer);
+        ipbuffer_state_int = ipbuffer_state;
+      }
     }

     switch (test_case_buffer_int) {
@@ -867,7 +916,6 @@ TRITONBACKEND_ModelInstanceExecute(
                 response_state, reinterpret_cast<void**>(&buffer),
                 sizeof(int32_t), &actual_memory_type, &actual_memory_type_id));
-
         if ((responses[r] == nullptr) ||
             (actual_memory_type == TRITONSERVER_MEMORY_GPU)) {
           GUARDED_RESPOND_IF_ERROR(
@@ -975,7 +1023,90 @@ TRITONBACKEND_ModelInstanceExecute(
         }
         lbuffer = reinterpret_cast<int32_t*>(buffer);
         *lbuffer = ipbuffer_int + ipbuffer_state_int;
-      }
+      } break;
+      // USE_SINGLE_BUFFER
+      case 2: {
+        TRITONBACKEND_State* response_state;
+        std::vector<int64_t> shape{1};
+        GUARDED_RESPOND_IF_ERROR(
+            responses, r, request,
+            TRITONBACKEND_StateNew(
+                &response_state, request, "OUTPUT_STATE",
+                TRITONSERVER_TYPE_INT32, shape.data() /* data */,
+                shape.size() /* dim_count */));
+
+        if (responses[r] == nullptr) {
+          LOG_MESSAGE(
+              TRITONSERVER_LOG_ERROR,
+              (std::string("request ") + std::to_string(r) +
+               ": failed to create the output state 'OUTPUT_STATE', error "
+               "response sent")
+                  .c_str());
+          continue;
+        }
+        TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU;
+        int64_t actual_memory_type_id = 0;
+        char* buffer;
+
+        // Request an output buffer in CPU. This is only for testing purposes
+        // to make sure that CPU output buffers can be requested.
+        GUARDED_RESPOND_IF_ERROR(
+            responses, r, request,
+            TRITONBACKEND_StateBuffer(
+                response_state, reinterpret_cast<void**>(&buffer),
+                sizeof(int32_t), &actual_memory_type, &actual_memory_type_id));
+
+        instance_state->state_ = buffer;
+      } break;
+      case 3: {
+        TRITONBACKEND_State* response_state;
+        std::vector<int64_t> shape{1};
+        GUARDED_RESPOND_IF_ERROR(
+            responses, r, request,
+            TRITONBACKEND_StateNew(
+                &response_state, request, "OUTPUT_STATE",
+                TRITONSERVER_TYPE_INT32, shape.data() /* data */,
+                shape.size() /* dim_count */));
+
+        if (responses[r] == nullptr) {
+          LOG_MESSAGE(
+              TRITONSERVER_LOG_ERROR,
+              (std::string("request ") + std::to_string(r) +
+               ": failed to create the output state 'OUTPUT_STATE', error "
+               "response sent")
+                  .c_str());
+          continue;
+        }
+        TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_GPU;
+        int64_t actual_memory_type_id = 0;
+        char* buffer;
+
+        size_t block_size = sizeof(int32_t) * 1024 * 1024;
+
+        // Request an output buffer in GPU. This is only for testing purposes
+        // to make sure that GPU output buffers can be requested.
+        GUARDED_RESPOND_IF_ERROR(
+            responses, r, request,
+            TRITONBACKEND_StateBuffer(
+                response_state, reinterpret_cast<void**>(&buffer),
+                block_size * (instance_state->request_index_ + 1),
+                &actual_memory_type, &actual_memory_type_id));
+
+        // Only write the new data to the portion of the state buffer that
+        // has been grown.
+        cudaMemset(
+            buffer + block_size * (instance_state->request_index_),
+            instance_state->request_index_, block_size);
+
+        instance_state->state_ = buffer;
+      } break;
+    }
+
+    const float* lend_buffer = reinterpret_cast<const float*>(end_buffer);
+
+    if (*lend_buffer == 1) {
+      instance_state->request_index_ = 0;
+    } else {
+      instance_state->request_index_ += 1;
     }

     uint64_t exec_end_ns = 0;

From 2573503fbd25e86bd88f8d9b0f557cb77656a352 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Tue, 7 Nov 2023 00:11:31 -0500
Subject: [PATCH 3/9] Improve testing

---
 qa/L0_implicit_state/implicit_state.py        |  24 ++++-
 .../models/growable_memory/config.pbtxt       | 102 ++++++++++++++++++
 qa/L0_sequence_batcher/test.sh                |  31 +++---
 src/test/implicit_state/src/implicit_state.cc |  48 ++++++++-
 4 files changed, 183 insertions(+), 22 deletions(-)
 create mode 100644 qa/L0_implicit_state/models/growable_memory/config.pbtxt

diff --git a/qa/L0_implicit_state/implicit_state.py b/qa/L0_implicit_state/implicit_state.py
index 6ba24dfc0f..cab271600b 100755
--- a/qa/L0_implicit_state/implicit_state.py
+++ b/qa/L0_implicit_state/implicit_state.py
@@ -115,29 +115,47 @@ def test_implicit_state_growable_memory(self):
         inputs[0].set_data_from_numpy(np.random.randint(5, size=[1], dtype=np.int32))
         inputs[1].set_data_from_numpy(np.asarray([3], dtype=np.int32))

-        triton_client.infer(
+        output = triton_client.infer(
             model_name="growable_memory",
             inputs=inputs,
             sequence_id=2,
             sequence_start=True,
             sequence_end=False,
         )
+        output_state = output.as_numpy("OUTPUT_STATE")
+        expected_output_state = np.zeros(output_state.shape, dtype=np.int8)
+        np.testing.assert_equal(output_state, expected_output_state)

-        triton_client.infer(
+        output = triton_client.infer(
             model_name="growable_memory",
             inputs=inputs,
             sequence_id=2,
             sequence_start=False,
             sequence_end=False,
         )
+        output_state = output.as_numpy("OUTPUT_STATE")
+        expected_output_state = np.concatenate(
+            [expected_output_state, np.ones(expected_output_state.shape, dtype=np.int8)]
+        )
+        np.testing.assert_equal(output_state, expected_output_state)

-        triton_client.infer(
+        output = triton_client.infer(
             model_name="growable_memory",
             inputs=inputs,
             sequence_id=2,
             sequence_start=False,
             sequence_end=True,
         )
+        output_state = output.as_numpy("OUTPUT_STATE")
+        expected_output_state = np.concatenate(
+            [
+                expected_output_state,
+                np.full(
+                    (expected_output_state.shape[0] // 2,), dtype=np.int8, fill_value=2
+                ),
+            ]
+        )
+        np.testing.assert_equal(output_state, expected_output_state)

     def test_no_update(self):
         # Test implicit state without updating any state
diff --git a/qa/L0_implicit_state/models/growable_memory/config.pbtxt b/qa/L0_implicit_state/models/growable_memory/config.pbtxt
new file mode 100644
index 0000000000..2bbb955308
--- /dev/null
+++ b/qa/L0_implicit_state/models/growable_memory/config.pbtxt
@@ -0,0 +1,102 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "growable_memory"
+backend: "implicit_state"
+max_batch_size: 0
+sequence_batching {
+  control_input [
+    {
+      name: "START"
+      control [
+        {
+          kind: CONTROL_SEQUENCE_START
+          fp32_false_true: [ 0, 1 ]
+        }
+      ]
+    },
+    {
+      name: "READY"
+      control [
+        {
+          kind: CONTROL_SEQUENCE_READY
+          fp32_false_true: [ 0, 1 ]
+        }
+      ]
+    },
+    {
+      name: "END"
+      control [
+        {
+          kind: CONTROL_SEQUENCE_END
+          fp32_false_true: [ 0, 1 ]
+        }
+      ]
+    }
+  ]
+  state [
+    {
+      input_name: "INPUT_STATE"
+      output_name: "OUTPUT_STATE"
+      data_type: TYPE_INT8
+      dims: [1024, 1024]
+      use_single_buffer: true
+      use_growable_memory: true
+    }
+  ]
+}
+
+input [
+  {
+    name: "INPUT"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "TEST_CASE"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  }
+]
+
+output [
+  {
+    name: "OUTPUT"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "OUTPUT_STATE"
+    data_type: TYPE_INT8
+    dims: [ 1 ]
+  }
+]
+
+instance_group [
+  {
+    count: 1
+    kind : KIND_GPU
+  }
+]
diff --git a/qa/L0_sequence_batcher/test.sh b/qa/L0_sequence_batcher/test.sh
index 8878e94e1f..6aa2f06629 100755
--- a/qa/L0_sequence_batcher/test.sh
+++ b/qa/L0_sequence_batcher/test.sh
@@ -337,21 +337,22 @@ for MODEL in $MODELS; do
         sed -i "s/$(basename $MODEL)/$(basename $MODEL)_full/" config.pbtxt && \
         sed -i "s/minimum_slot_utilization: 0/minimum_slot_utilization: 1/" config.pbtxt)

-    if [ "$USE_SINGLE_BUFFER" == "1" ] && [ "$IMPLICIT_STATE" == "1" ]; then
-      SED_REPLACE_PATTERN="N;N;N;N;N;/state.*dims:.*/a use_single_buffer: true"
-      (cd models0/$(basename $MODEL) && \
-          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
-      (cd models1/$(basename $MODEL) && \
-          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
-      (cd models2/$(basename $MODEL) && \
-          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
-      (cd models4/$(basename $MODEL) && \
-          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
-      (cd queue_delay_models/$(basename $MODEL)_full && \
-          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
-      (cd queue_delay_models/$(basename $MODEL)_half && \
-          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
-    fi
+    # TODO: Enable single state buffer testing for sequence batcher
+    # if [ "$USE_SINGLE_BUFFER" == "1" ] && [ "$IMPLICIT_STATE" == "1" ]; then
+    #   SED_REPLACE_PATTERN="N;N;N;N;N;/state.*dims:.*/a use_single_buffer: true"
+    #   (cd models0/$(basename $MODEL) && \
+    #       sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+    #   (cd models1/$(basename $MODEL) && \
+    #       sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+    #   (cd models2/$(basename $MODEL) && \
+    #       sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+    #   (cd models4/$(basename $MODEL) && \
+    #       sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+    #   (cd queue_delay_models/$(basename $MODEL)_full && \
+    #       sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+    #   (cd queue_delay_models/$(basename $MODEL)_half && \
+    #       sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+    # fi
   else
     cp -r $MODEL queue_delay_models/$(basename $MODEL)_full && \
     (cd queue_delay_models/$(basename $MODEL)_full && \
diff --git a/src/test/implicit_state/src/implicit_state.cc b/src/test/implicit_state/src/implicit_state.cc
index 3a63dede10..3f6f4678e7 100644
--- a/src/test/implicit_state/src/implicit_state.cc
+++ b/src/test/implicit_state/src/implicit_state.cc
@@ -1060,12 +1060,17 @@ TRITONBACKEND_ModelInstanceExecute(
       } break;
       case 3: {
         TRITONBACKEND_State* response_state;
-        std::vector<int64_t> shape{1};
+        size_t block_size = sizeof(int8_t) * 1024 * 1024;
+        int64_t current_elements =
+            (instance_state->request_index_ + 1) * 1024 * 1024;
+        std::cout << "current elements are "
+                  << (instance_state->request_index_ + 1) << std::endl;
+        std::vector<int64_t> shape{current_elements};
         GUARDED_RESPOND_IF_ERROR(
             responses, r, request,
             TRITONBACKEND_StateNew(
                 &response_state, request, "OUTPUT_STATE",
-                TRITONSERVER_TYPE_INT32, shape.data() /* data */,
+                TRITONSERVER_TYPE_INT8, shape.data() /* data */,
                 shape.size() /* dim_count */));

         if (responses[r] == nullptr) {
@@ -1081,8 +1086,6 @@ TRITONBACKEND_ModelInstanceExecute(
         int64_t actual_memory_type_id = 0;
         char* buffer;

-        size_t block_size = sizeof(int32_t) * 1024 * 1024;
-
         // Request an output buffer in GPU. This is only for testing purposes
         // to make sure that GPU output buffers can be requested.
         GUARDED_RESPOND_IF_ERROR(
@@ -1098,6 +1101,43 @@ TRITONBACKEND_ModelInstanceExecute(
             buffer + block_size * (instance_state->request_index_),
             instance_state->request_index_, block_size);

+        TRITONBACKEND_Output* response_output;
+        GUARDED_RESPOND_IF_ERROR(
+            responses, r, request,
+            TRITONBACKEND_ResponseOutput(
+                responses[r], &response_output, "OUTPUT_STATE",
+                TRITONSERVER_TYPE_INT8, shape.data() /* data */,
+                shape.size() /* dim_count */));
+
+        actual_memory_type = TRITONSERVER_MEMORY_CPU;
+        actual_memory_type_id = 0;
+        char* output_buffer;
+        GUARDED_RESPOND_IF_ERROR(
+            responses, r, request,
+            TRITONBACKEND_OutputBuffer(
+                response_output, reinterpret_cast<void**>(&output_buffer),
+                block_size * (instance_state->request_index_ + 1),
+                &actual_memory_type, &actual_memory_type_id));
+        if ((responses[r] == nullptr) ||
+            (actual_memory_type != TRITONSERVER_MEMORY_CPU)) {
+          GUARDED_RESPOND_IF_ERROR(
+              responses, r, request,
+              TRITONSERVER_ErrorNew(
+                  TRITONSERVER_ERROR_UNSUPPORTED,
+                  "the backend can only handle CPU tensors"));
+          LOG_MESSAGE(
+              TRITONSERVER_LOG_ERROR,
+              (std::string("request ") + std::to_string(r) +
+               ": the backend can only handle CPU tensors, error "
+               "response sent")
+                  .c_str());
+          continue;
+        }
+
+        cudaMemcpy(
+            output_buffer, buffer,
+            block_size * (instance_state->request_index_ + 1),
+            cudaMemcpyDeviceToHost);
+
         instance_state->state_ = buffer;
       } break;
     }

From d051daed471b37c381ff5067d2dfeca24b352fc6 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Wed, 8 Nov 2023 22:10:43 -0500
Subject: [PATCH 4/9] Fix up

---
 src/test/implicit_state/src/implicit_state.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/test/implicit_state/src/implicit_state.cc b/src/test/implicit_state/src/implicit_state.cc
index 3f6f4678e7..7def94934c 100644
--- a/src/test/implicit_state/src/implicit_state.cc
+++ b/src/test/implicit_state/src/implicit_state.cc
@@ -56,7 +56,7 @@ namespace triton { namespace backend { namespace implicit {
 // sent in a sequence.
 //
 // * USE_GROWABLE_STATE_BUFFER = 3: In this test case we use growable state
-// buffer. currently, growable state buffer only supports CUDA memory.
+// buffer. Currently, growable state buffer only supports CUDA memory.

 #define GUARDED_RESPOND_IF_ERROR(RESPONSES, IDX, REQUEST, X) \
   do { \
@@ -754,7 +754,7 @@ TRITONBACKEND_ModelInstanceExecute(
           *reinterpret_cast<const int32_t*>(test_case_buffer);
       const int32_t ipbuffer_int =
           *reinterpret_cast<const int32_t*>(input_buffer);
-      int32_t ipbuffer_state_int;
+      int32_t ipbuffer_state_int = 0;

       if (test_case_buffer_int != 0) {
         TRITONBACKEND_Input* input_state = nullptr;

From 648b76eb2beeb58b8c6b24ee70cc52dcedfb70cb Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Wed, 8 Nov 2023 23:15:50 -0500
Subject: [PATCH 5/9] Add CUDA virtual address size flag

---
 src/command_line_parser.cc | 25 +++++++++++++++++++++++++
 src/command_line_parser.h  |  1 +
 2 files changed, 26 insertions(+)

diff --git a/src/command_line_parser.cc b/src/command_line_parser.cc
index 0907982d6b..6fa0fe31bb 100644
--- a/src/command_line_parser.cc
+++ b/src/command_line_parser.cc
@@ -347,6 +347,7 @@ enum TritonOptionId {
   OPTION_RATE_LIMIT_RESOURCE,
   OPTION_PINNED_MEMORY_POOL_BYTE_SIZE,
   OPTION_CUDA_MEMORY_POOL_BYTE_SIZE,
+  OPTION_CUDA_VIRTUAL_ADDRESS_SIZE,
   OPTION_RESPONSE_CACHE_BYTE_SIZE,
   OPTION_CACHE_CONFIG,
   OPTION_CACHE_DIR,
@@ -751,6 +752,18 @@ TritonParser::SetupOptions()
       "<GPU device ID>:<pool byte size>. This option can be used multiple "
       "times, but only once per GPU device. Subsequent uses will overwrite "
      "previous uses for the same GPU device. Default is 64 MB."});
+  memory_device_options_.push_back(
+      {OPTION_CUDA_VIRTUAL_ADDRESS_SIZE, "cuda-virtual-address-size",
+       "<integer>:<integer>",
+       "The total CUDA virtual address size that will be used for each "
+       "implicit state when growable memory is used. This value determines "
+       "the maximum size of each implicit state. The state size cannot go "
+       "beyond this value. The argument should be "
+       "2 integers separated by colons in the format "
+       "<GPU device ID>:<CUDA virtual address size>. This option can be used "
+       "multiple "
+       "times, but only once per GPU device. Subsequent uses will overwrite "
+       "previous uses for the same GPU device. Default is 1 GB."});
   memory_device_options_.push_back(
       {OPTION_MIN_SUPPORTED_COMPUTE_CAPABILITY,
        "min-supported-compute-capability", Option::ArgFloat,
@@ -996,6 +1009,14 @@ TritonServerParameters::BuildTritonServerOptions()
           loptions, cuda_pool.first, cuda_pool.second),
       "setting total CUDA memory byte size");
   }
+  for (const auto& cuda_virtual_address_size : cuda_virtual_address_size_) {
+    THROW_IF_ERR(
+        ParseException,
+        TRITONSERVER_ServerOptionsSetCudaVirtualAddressSize(
+            loptions, cuda_virtual_address_size.first,
+            cuda_virtual_address_size.second),
+        "setting total CUDA virtual address size");
+  }
   THROW_IF_ERR(
       ParseException,
       TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability(
@@ -1569,6 +1590,10 @@ TritonParser::Parse(int argc, char** argv)
         lparams.cuda_pools_.push_back(
             ParsePairOption<int, uint64_t>(optarg, ":"));
         break;
+      case OPTION_CUDA_VIRTUAL_ADDRESS_SIZE:
+        lparams.cuda_virtual_address_size_.push_back(
+            ParsePairOption<int, uint64_t>(optarg, ":"));
+        break;
       case OPTION_RESPONSE_CACHE_BYTE_SIZE: {
         cache_size_present = true;
         const auto byte_size = std::to_string(ParseOption<int64_t>(optarg));
diff --git a/src/command_line_parser.h b/src/command_line_parser.h
index 8a34babc98..ef562a3efb 100644
--- a/src/command_line_parser.h
+++ b/src/command_line_parser.h
@@ -145,6 +145,7 @@ struct TritonServerParameters {
   // memory pool configuration
   int64_t pinned_memory_pool_byte_size_{1 << 28};
   std::list<std::pair<int, uint64_t>> cuda_pools_;
+  std::list<std::pair<int, uint64_t>> cuda_virtual_address_size_;

   // [FIXME] this option is broken after backend separation: this should have
   // controlled backend copy behavior but not properly propagate to backend

From 653019a8561ba41d91e44da3316926a39009a5c4 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Thu, 9 Nov 2023 00:22:17 -0500
Subject: [PATCH 6/9] Add missing test files

---
 qa/L0_implicit_state/implicit_state.py        | 12 ++-
 .../models/single_state_buffer/config.pbtxt   | 96 +++++++++++++++++++
 qa/L0_implicit_state/test.sh                  |  2 +-
 3 files changed, 108 insertions(+), 2 deletions(-)
 create mode 100644 qa/L0_implicit_state/models/single_state_buffer/config.pbtxt

diff --git a/qa/L0_implicit_state/implicit_state.py b/qa/L0_implicit_state/implicit_state.py
index cab271600b..5167159717 100755
--- a/qa/L0_implicit_state/implicit_state.py
+++ b/qa/L0_implicit_state/implicit_state.py
@@ -144,7 +144,7 @@ def test_implicit_state_growable_memory(self):
             inputs=inputs,
             sequence_id=2,
             sequence_start=False,
-            sequence_end=True,
+            sequence_end=False,
         )
         output_state = output.as_numpy("OUTPUT_STATE")
         expected_output_state = np.concatenate(
@@ -157,6 +157,16 @@ def test_implicit_state_growable_memory(self):
         )
         np.testing.assert_equal(output_state, expected_output_state)

+        with self.assertRaises(InferenceServerException) as e:
+            triton_client.infer(
+                model_name="growable_memory",
+                inputs=inputs,
+                sequence_id=2,
+                sequence_start=False,
+                sequence_end=True,
+            )
+        print(e)
+
     def test_no_update(self):
         # Test implicit state without updating any state
         triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")
diff --git a/qa/L0_implicit_state/models/single_state_buffer/config.pbtxt b/qa/L0_implicit_state/models/single_state_buffer/config.pbtxt
new file mode 100644
index 0000000000..4751829aef
--- /dev/null
+++ b/qa/L0_implicit_state/models/single_state_buffer/config.pbtxt
@@ -0,0 +1,96 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "single_state_buffer"
+backend: "implicit_state"
+max_batch_size: 0
+sequence_batching {
+  control_input [
+    {
+      name: "START"
+      control [
+        {
+          kind: CONTROL_SEQUENCE_START
+          fp32_false_true: [ 0, 1 ]
+        }
+      ]
+    },
+    {
+      name: "READY"
+      control [
+        {
+          kind: CONTROL_SEQUENCE_READY
+          fp32_false_true: [ 0, 1 ]
+        }
+      ]
+    },
+    {
+      name: "END"
+      control [
+        {
+          kind: CONTROL_SEQUENCE_END
+          fp32_false_true: [ 0, 1 ]
+        }
+      ]
+    }
+  ]
+  state [
+    {
+      input_name: "INPUT_STATE"
+      output_name: "OUTPUT_STATE"
+      data_type: TYPE_INT32
+      dims: 1
+      use_single_buffer: true
+    }
+  ]
+}
+
+input [
+  {
+    name: "INPUT"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "TEST_CASE"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  }
+]
+
+output [
+  {
+    name: "OUTPUT"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  }
+]
+
+instance_group [
+  {
+    count: 1
+    kind : KIND_CPU
+  }
+]
diff --git a/qa/L0_implicit_state/test.sh b/qa/L0_implicit_state/test.sh
index 39984406df..973192ba03 100755
--- a/qa/L0_implicit_state/test.sh
+++ b/qa/L0_implicit_state/test.sh
@@ -96,7 +96,7 @@ for BACKEND in $BACKENDS; do
 done

 CLIENT_LOG=`pwd`/client.log
-SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR}"
+SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR} --cuda-virtual-address-size=0:$((1024*1024*4))"
 IMPLICIT_STATE_CLIENT='implicit_state.py'
 EXPECTED_TEST_NUM=6
 rm -rf $CLIENT_LOG

 run_server

From f4e69773d4a489b6d8008b11d24c41b827ffe0ce Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Thu, 9 Nov 2023 13:08:14 -0500
Subject: [PATCH 7/9] Parameter rename

---
 qa/L0_implicit_state/models/growable_memory/config.pbtxt     | 2 +-
 qa/L0_implicit_state/models/single_state_buffer/config.pbtxt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/qa/L0_implicit_state/models/growable_memory/config.pbtxt b/qa/L0_implicit_state/models/growable_memory/config.pbtxt
index 2bbb955308..630e47ba19 100644
--- a/qa/L0_implicit_state/models/growable_memory/config.pbtxt
+++ b/qa/L0_implicit_state/models/growable_memory/config.pbtxt
@@ -62,7 +62,7 @@
sequence_batching { output_name: "OUTPUT_STATE" data_type: TYPE_INT8 dims: [1024, 1024] - use_single_buffer: true + use_same_buffer_for_input_output: true use_growable_memory: true } ] diff --git a/qa/L0_implicit_state/models/single_state_buffer/config.pbtxt b/qa/L0_implicit_state/models/single_state_buffer/config.pbtxt index 4751829aef..c14962585c 100644 --- a/qa/L0_implicit_state/models/single_state_buffer/config.pbtxt +++ b/qa/L0_implicit_state/models/single_state_buffer/config.pbtxt @@ -62,7 +62,7 @@ sequence_batching { output_name: "OUTPUT_STATE" data_type: TYPE_INT32 dims: 1 - use_single_buffer: true + use_same_buffer_for_input_output: true } ] } From 4b3f7620295f26ca5c97084d7d6671a25c6f5e1c Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 13 Nov 2023 01:15:50 -0500 Subject: [PATCH 8/9] Test fixes --- qa/L0_implicit_state/implicit_state.py | 10 ---------- qa/L0_implicit_state/test.sh | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/qa/L0_implicit_state/implicit_state.py b/qa/L0_implicit_state/implicit_state.py index 5167159717..2cdf7ff2e0 100755 --- a/qa/L0_implicit_state/implicit_state.py +++ b/qa/L0_implicit_state/implicit_state.py @@ -157,16 +157,6 @@ def test_implicit_state_growable_memory(self): ) np.testing.assert_equal(output_state, expected_output_state) - with self.assertRaises(InferenceServerException) as e: - triton_client.infer( - model_name="growable_memory", - inputs=inputs, - sequence_id=2, - sequence_start=False, - sequence_end=True, - ) - print(e) - def test_no_update(self): # Test implicit state without updating any state triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") diff --git a/qa/L0_implicit_state/test.sh b/qa/L0_implicit_state/test.sh index 973192ba03..0722d29be1 100755 --- a/qa/L0_implicit_state/test.sh +++ b/qa/L0_implicit_state/test.sh @@ -98,7 +98,7 @@ done CLIENT_LOG=`pwd`/client.log SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR} --cuda-virtual-address-size=0:$((1024*1024*4))" IMPLICIT_STATE_CLIENT='implicit_state.py' -EXPECTED_TEST_NUM=6 +EXPECTED_TEST_NUM=7 rm -rf $CLIENT_LOG run_server From 95800062bbf0a5a4191742befbd5a600795ac358 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Tue, 14 Nov 2023 23:27:40 -0500 Subject: [PATCH 9/9] Only build implicit state backend for GPU=ON --- src/test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 6dcd3f8f09..a833976e13 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -102,9 +102,9 @@ add_subdirectory(repoagent/relocation_repoagent repoagent/relocation_repoagent) add_subdirectory(distributed_addsub distributed_addsub) add_subdirectory(dyna_sequence dyna_sequence) add_subdirectory(generative_sequence generative_sequence) -add_subdirectory(implicit_state implicit_state) add_subdirectory(query_backend query_backend) if(${TRITON_ENABLE_GPU}) add_subdirectory(sequence sequence) + add_subdirectory(implicit_state implicit_state) endif()
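For anyone trying this series locally, the pieces combine roughly as follows — a minimal sketch, assuming a GPU-enabled tritonserver build and the model repository assembled by qa/L0_implicit_state/test.sh; the 4 MB virtual address size simply mirrors the SERVER_ARGS value added in PATCH 6:

# Reserve a 4 MB CUDA virtual address range per implicit state on GPU 0
# via PATCH 5's new flag; growable-memory states may grow up to that bound
# (the flag's default is 1 GB per state).
tritonserver --model-repository=`pwd`/models \
    --cuda-virtual-address-size=0:$((1024*1024*4)) &

# Run the client tests, which exercise the new single_state_buffer and
# growable_memory models (TEST_CASE values 2 and 3 in the test backend).
python implicit_state.py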