From 53f8a755dbbbfb01fe1ac9f1eb71d10df2d4c372 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Mon, 6 Nov 2023 00:06:20 -0500
Subject: [PATCH 1/9] Add testing for single buffer

---
 qa/L0_implicit_state/test.sh   |  2 ++
 qa/L0_sequence_batcher/test.sh | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/qa/L0_implicit_state/test.sh b/qa/L0_implicit_state/test.sh
index b5b5034991..c878fd3544 100755
--- a/qa/L0_implicit_state/test.sh
+++ b/qa/L0_implicit_state/test.sh
@@ -46,9 +46,11 @@ export BACKENDS
 export IMPLICIT_STATE=1
 INITIAL_STATE_ZERO=${INITIAL_STATE_ZERO:="0"}
 INITIAL_STATE_FILE=${INITIAL_STATE_FILE:="0"}
+SINGLE_STATE_BUFFER=${SINGLE_STATE_BUFFER:="0"}

 export INITIAL_STATE_ZERO
 export INITIAL_STATE_FILE
+export SINGLE_STATE_BUFFER

 MODELDIR=${MODELDIR:=`pwd`/models}
 TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"}
diff --git a/qa/L0_sequence_batcher/test.sh b/qa/L0_sequence_batcher/test.sh
index 0889a602e1..679df85b51 100755
--- a/qa/L0_sequence_batcher/test.sh
+++ b/qa/L0_sequence_batcher/test.sh
@@ -169,6 +169,10 @@ export INITIAL_STATE_FILE
 INITIAL_STATE_ZERO=${INITIAL_STATE_ZERO:="0"}
 export INITIAL_STATE_ZERO

+# If USE_SINGLE_BUFFER is not specified, set to 0
+USE_SINGLE_BUFFER=${USE_SINGLE_BUFFER:="0"}
+export USE_SINGLE_BUFFER
+
 # Setup non-variable-size model repositories. The same models are in each
 # repository but they are configured as:
 #   models0 - four instances with non-batching model
@@ -332,6 +336,20 @@ for MODEL in $MODELS; do
     (cd queue_delay_models/$(basename $MODEL)_full && \
         sed -i "s/$(basename $MODEL)/$(basename $MODEL)_full/" config.pbtxt && \
         sed -i "s/minimum_slot_utilization: 0/minimum_slot_utilization: 1/" config.pbtxt)
+
+    if [ "$USE_SINGLE_BUFFER" == "1" ] && [ "$IMPLICIT_STATE" == "1" ]; then
+      SED_REPLACE_PATTERN="N;N;N;N;N;/state.*dims:.*/a use_single_buffer: true"
+      (cd models1/$(basename $MODEL) && \
+          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+      (cd models2/$(basename $MODEL) && \
+          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+      (cd models4/$(basename $MODEL) && \
+          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+      (cd queue_delay_models/$(basename $MODEL)_full && \
+          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+      (cd queue_delay_models/$(basename $MODEL)_half && \
+          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+    fi
   else
     cp -r $MODEL queue_delay_models/$(basename $MODEL)_full && \
     (cd queue_delay_models/$(basename $MODEL)_full && \

From c23c58667932399184d667b4074fc0d0193ff58f Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Mon, 6 Nov 2023 10:32:45 -0500
Subject: [PATCH 2/9] Add testing for implicit state with buffer growth

---
 qa/L0_implicit_state/implicit_state.py        |  56 ++++++
 qa/L0_implicit_state/test.sh                  |   6 +-
 qa/L0_sequence_batcher/test.sh                |   2 +
 src/test/implicit_state/src/implicit_state.cc | 161 ++++++++++++++++--
 4 files changed, 209 insertions(+), 16 deletions(-)

diff --git a/qa/L0_implicit_state/implicit_state.py b/qa/L0_implicit_state/implicit_state.py
index 8d56a2c570..6ba24dfc0f 100755
--- a/qa/L0_implicit_state/implicit_state.py
+++ b/qa/L0_implicit_state/implicit_state.py
@@ -83,6 +83,62 @@ def test_wrong_implicit_state_name(self):
         err_str = str(e.exception).lower()
         self.assertIn("state 'undefined_state' is not a valid state name", err_str)

+    def test_implicit_state_single_buffer(self):
+        triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")
+        inputs = []
+        inputs.append(tritonhttpclient.InferInput("INPUT", [1], "INT32"))
+        inputs.append(tritonhttpclient.InferInput("TEST_CASE", [1], "INT32"))
+        inputs[0].set_data_from_numpy(np.random.randint(5, size=[1], dtype=np.int32))
+        inputs[1].set_data_from_numpy(np.asarray([2], dtype=np.int32))
+
+        triton_client.infer(
+            model_name="single_state_buffer",
+            inputs=inputs,
+            sequence_id=2,
+            sequence_start=True,
+            sequence_end=False,
+        )
+
+        triton_client.infer(
+            model_name="single_state_buffer",
+            inputs=inputs,
+            sequence_id=2,
+            sequence_start=False,
+            sequence_end=True,
+        )
+
+    def test_implicit_state_growable_memory(self):
+        triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")
+        inputs = []
+        inputs.append(tritonhttpclient.InferInput("INPUT", [1], "INT32"))
+        inputs.append(tritonhttpclient.InferInput("TEST_CASE", [1], "INT32"))
+        inputs[0].set_data_from_numpy(np.random.randint(5, size=[1], dtype=np.int32))
+        inputs[1].set_data_from_numpy(np.asarray([3], dtype=np.int32))
+
+        triton_client.infer(
+            model_name="growable_memory",
+            inputs=inputs,
+            sequence_id=2,
+            sequence_start=True,
+            sequence_end=False,
+        )
+
+        triton_client.infer(
+            model_name="growable_memory",
+            inputs=inputs,
+            sequence_id=2,
+            sequence_start=False,
+            sequence_end=False,
+        )
+
+        triton_client.infer(
+            model_name="growable_memory",
+            inputs=inputs,
+            sequence_id=2,
+            sequence_start=False,
+            sequence_end=True,
+        )
+
     def test_no_update(self):
         # Test implicit state without updating any state
         triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")
diff --git a/qa/L0_implicit_state/test.sh b/qa/L0_implicit_state/test.sh
index c878fd3544..39984406df 100755
--- a/qa/L0_implicit_state/test.sh
+++ b/qa/L0_implicit_state/test.sh
@@ -62,10 +62,14 @@ source ../common/util.sh
 cp ./libtriton_implicit_state.so models/no_implicit_state/
 cp ./libtriton_implicit_state.so models/no_state_update/
 cp ./libtriton_implicit_state.so models/wrong_internal_state/
+cp ./libtriton_implicit_state.so models/single_state_buffer/
+cp ./libtriton_implicit_state.so models/growable_memory/

 mkdir -p models/no_implicit_state/1/
 mkdir -p models/no_state_update/1/
 mkdir -p models/wrong_internal_state/1/
+mkdir -p models/single_state_buffer/1/
+mkdir -p models/growable_memory/1/

 for BACKEND in $BACKENDS; do
     dtype="int32"
@@ -94,7 +98,7 @@ done
 CLIENT_LOG=`pwd`/client.log
 SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR}"
 IMPLICIT_STATE_CLIENT='implicit_state.py'
-EXPECTED_TEST_NUM=5
+EXPECTED_TEST_NUM=6
 rm -rf $CLIENT_LOG

 run_server
diff --git a/qa/L0_sequence_batcher/test.sh b/qa/L0_sequence_batcher/test.sh
index 679df85b51..8878e94e1f 100755
--- a/qa/L0_sequence_batcher/test.sh
+++ b/qa/L0_sequence_batcher/test.sh
@@ -339,6 +339,8 @@ for MODEL in $MODELS; do

     if [ "$USE_SINGLE_BUFFER" == "1" ] && [ "$IMPLICIT_STATE" == "1" ]; then
       SED_REPLACE_PATTERN="N;N;N;N;N;/state.*dims:.*/a use_single_buffer: true"
+      (cd models0/$(basename $MODEL) && \
+          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
       (cd models1/$(basename $MODEL) && \
           sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
       (cd models2/$(basename $MODEL) && \
           sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
diff --git a/src/test/implicit_state/src/implicit_state.cc b/src/test/implicit_state/src/implicit_state.cc
index 74c3142dea..3a63dede10 100644
--- a/src/test/implicit_state/src/implicit_state.cc
+++ b/src/test/implicit_state/src/implicit_state.cc
@@ -48,8 +48,15 @@ namespace triton { namespace backend { namespace implicit {
 // for a non existent state or a model that doesn't have states section in
 // sequence batching.
 //
-// * STATE_UPDATE_FALSE = 3: Tests not calling the state update and expecting
+// * STATE_UPDATE_FALSE = 1: Tests not calling the state update and expecting
 // the implicit state to not be updated.
+//
+// * USE_SINGLE_STATE_BUFFER = 2: For this scenario we will be using the same
+// buffer for both input and output state. In total there will be 3 requests
+// sent in a sequence.
+//
+// * USE_GROWABLE_STATE_BUFFER = 3: In this test case we use growable state
+// buffer. currently, growable state buffer only supports CUDA memory.

 #define GUARDED_RESPOND_IF_ERROR(RESPONSES, IDX, REQUEST, X) \
   do { \
@@ -179,6 +186,10 @@ class ModelInstanceState : public BackendModelInstance {
   // Get the state of the model that corresponds to this instance.
   ModelState* StateForModel() const { return model_state_; }

+  void* state_ = nullptr;
+
+  // Index of the request in the sequence
+  uint32_t request_index_ = 0;

  private:
   ModelInstanceState(
@@ -365,14 +376,6 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
   RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(
       instance, reinterpret_cast<void*>(instance_state)));

-  // Because this backend just copies IN -> OUT and requires that
-  // input and output be in CPU memory, we fail if a GPU instances is
-  // requested.
-  RETURN_ERROR_IF_FALSE(
-      instance_state->Kind() == TRITONSERVER_INSTANCEGROUPKIND_CPU,
-      TRITONSERVER_ERROR_INVALID_ARG,
-      std::string("'implicit_state' backend only supports CPU instances"));
-
   return nullptr;  // success
 }

@@ -634,6 +637,12 @@ TRITONBACKEND_ModelInstanceExecute(
       continue;
     }

+    const float* lstart_buffer = reinterpret_cast<const float*>(start_buffer);
+    if (*lstart_buffer == 1) {
+      instance_state->request_index_ = 0;
+      instance_state->state_ = nullptr;
+    }
+
     const void* end_buffer = nullptr;
     GUARDED_RESPOND_IF_ERROR(
         responses, r, request,
@@ -768,7 +777,24 @@ TRITONBACKEND_ModelInstanceExecute(
             input_state, 0 /* input_buffer_count */, &input_state_buffer,
             &buffer_byte_size, &input_memory_type, &input_memory_type_id));
     if ((responses[r] == nullptr) ||
-        (input_memory_type == TRITONSERVER_MEMORY_GPU)) {
+        (test_case_buffer_int == 3 &&
+         input_memory_type != TRITONSERVER_MEMORY_GPU)) {
+      GUARDED_RESPOND_IF_ERROR(
+          responses, r, request,
+          TRITONSERVER_ErrorNew(
+              TRITONSERVER_ERROR_UNSUPPORTED,
+              "growable memory should always provide memory in GPU"));
+      LOG_MESSAGE(
+          TRITONSERVER_LOG_ERROR,
+          (std::string("request ") + std::to_string(r) +
+           ": failed to get input buffer in GPU memory, error "
+           "response sent")
+              .c_str());
+      continue;
+    } else if (
+        (responses[r] == nullptr) ||
+        (input_memory_type == TRITONSERVER_MEMORY_GPU &&
+         test_case_buffer_int != 3)) {
       GUARDED_RESPOND_IF_ERROR(
           responses, r, request,
           TRITONSERVER_ErrorNew(
@@ -783,9 +809,32 @@ TRITONBACKEND_ModelInstanceExecute(
       continue;
     }

-      const int32_t ipbuffer_state =
-          *reinterpret_cast<const int32_t*>(input_state_buffer);
-      ipbuffer_state_int = ipbuffer_state;
+      // When using single state buffer, input/output tensors should point to
+      // the buffer.
+      if ((test_case_buffer_int == 2 || test_case_buffer_int == 3) &&
+          instance_state->state_ != nullptr) {
+        if (input_state_buffer != instance_state->state_) {
+          GUARDED_RESPOND_IF_ERROR(
+              responses, r, request,
+              TRITONSERVER_ErrorNew(
+                  TRITONSERVER_ERROR_UNSUPPORTED,
+                  "Input and output state are using different buffers."));
+          LOG_MESSAGE(
+              TRITONSERVER_LOG_ERROR,
+              (std::string("request ") + std::to_string(r) +
+               ": input and output state are using different buffers, error "
+               "response sent")
+                  .c_str());
+          continue;
+        }
+      }
+
+      if (test_case_buffer_int == 2 || test_case_buffer_int == 1 ||
+          test_case_buffer_int == 0) {
+        const int32_t ipbuffer_state =
+            *reinterpret_cast<const int32_t*>(input_state_buffer);
+        ipbuffer_state_int = ipbuffer_state;
+      }
     }

     switch (test_case_buffer_int) {
@@ -867,7 +916,6 @@ TRITONBACKEND_ModelInstanceExecute(
                 response_state, reinterpret_cast<void**>(&buffer),
                 sizeof(int32_t), &actual_memory_type, &actual_memory_type_id));
-
         if ((responses[r] == nullptr) ||
             (actual_memory_type == TRITONSERVER_MEMORY_GPU)) {
           GUARDED_RESPOND_IF_ERROR(
@@ -975,7 +1023,90 @@ TRITONBACKEND_ModelInstanceExecute(
         }
         lbuffer = reinterpret_cast<int32_t*>(buffer);
         *lbuffer = ipbuffer_int + ipbuffer_state_int;
-      }
+      } break;
+      // USE_SINGLE_BUFFER
+      case 2: {
+        TRITONBACKEND_State* response_state;
+        std::vector<int64_t> shape{1};
+        GUARDED_RESPOND_IF_ERROR(
+            responses, r, request,
+            TRITONBACKEND_StateNew(
+                &response_state, request, "OUTPUT_STATE",
+                TRITONSERVER_TYPE_INT32, shape.data() /* data */,
+                shape.size() /* dim_count */));
+
+        if (responses[r] == nullptr) {
+          LOG_MESSAGE(
+              TRITONSERVER_LOG_ERROR,
+              (std::string("request ") + std::to_string(r) +
+               ": failed to create the output state 'OUTPUT_STATE', error "
+               "response sent")
+                  .c_str());
+          continue;
+        }
+        TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_CPU;
+        int64_t actual_memory_type_id = 0;
+        char* buffer;
+
+        // Request an output buffer in CPU. This is only for testing purposes
+        // to make sure that CPU output buffers can be requested.
+        GUARDED_RESPOND_IF_ERROR(
+            responses, r, request,
+            TRITONBACKEND_StateBuffer(
+                response_state, reinterpret_cast<void**>(&buffer),
+                sizeof(int32_t), &actual_memory_type, &actual_memory_type_id));
+
+        instance_state->state_ = buffer;
+      } break;
+      case 3: {
+        TRITONBACKEND_State* response_state;
+        std::vector<int64_t> shape{1};
+        GUARDED_RESPOND_IF_ERROR(
+            responses, r, request,
+            TRITONBACKEND_StateNew(
+                &response_state, request, "OUTPUT_STATE",
+                TRITONSERVER_TYPE_INT32, shape.data() /* data */,
+                shape.size() /* dim_count */));
+
+        if (responses[r] == nullptr) {
+          LOG_MESSAGE(
+              TRITONSERVER_LOG_ERROR,
+              (std::string("request ") + std::to_string(r) +
+               ": failed to create the output state 'OUTPUT_STATE', error "
+               "response sent")
+                  .c_str());
+          continue;
+        }
+        TRITONSERVER_MemoryType actual_memory_type = TRITONSERVER_MEMORY_GPU;
+        int64_t actual_memory_type_id = 0;
+        char* buffer;
+
+        size_t block_size = sizeof(int32_t) * 1024 * 1024;
+
+        // Request an output buffer in GPU. This is only for testing purposes
+        // to make sure that GPU output buffers can be requested.
+        GUARDED_RESPOND_IF_ERROR(
+            responses, r, request,
+            TRITONBACKEND_StateBuffer(
+                response_state, reinterpret_cast<void**>(&buffer),
+                block_size * (instance_state->request_index_ + 1),
+                &actual_memory_type, &actual_memory_type_id));
+
+        // Only write the new data to the portion of the state buffer that
+        // has been grown.
+        cudaMemset(
+            buffer + block_size * (instance_state->request_index_),
+            instance_state->request_index_, block_size);
+
+        instance_state->state_ = buffer;
+      } break;
+    }
+
+    const float* lend_buffer = reinterpret_cast<const float*>(end_buffer);
+
+    if (*lend_buffer == 1) {
+      instance_state->request_index_ = 0;
+    } else {
+      instance_state->request_index_ += 1;
     }

     uint64_t exec_end_ns = 0;

From 2573503fbd25e86bd88f8d9b0f557cb77656a352 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Tue, 7 Nov 2023 00:11:31 -0500
Subject: [PATCH 3/9] Improve testing

---
 qa/L0_implicit_state/implicit_state.py        |  24 ++++-
 .../models/growable_memory/config.pbtxt       | 102 ++++++++++++++++++
 qa/L0_sequence_batcher/test.sh                |  31 +++---
 src/test/implicit_state/src/implicit_state.cc |  48 ++++++++-
 4 files changed, 183 insertions(+), 22 deletions(-)
 create mode 100644 qa/L0_implicit_state/models/growable_memory/config.pbtxt

diff --git a/qa/L0_implicit_state/implicit_state.py b/qa/L0_implicit_state/implicit_state.py
index 6ba24dfc0f..cab271600b 100755
--- a/qa/L0_implicit_state/implicit_state.py
+++ b/qa/L0_implicit_state/implicit_state.py
@@ -115,29 +115,47 @@ def test_implicit_state_growable_memory(self):
         inputs[0].set_data_from_numpy(np.random.randint(5, size=[1], dtype=np.int32))
         inputs[1].set_data_from_numpy(np.asarray([3], dtype=np.int32))

-        triton_client.infer(
+        output = triton_client.infer(
             model_name="growable_memory",
             inputs=inputs,
             sequence_id=2,
             sequence_start=True,
             sequence_end=False,
         )
+        output_state = output.as_numpy("OUTPUT_STATE")
+        expected_output_state = np.zeros(output_state.shape, dtype=np.int8)
+        np.testing.assert_equal(output_state, expected_output_state)

-        triton_client.infer(
+        output = triton_client.infer(
             model_name="growable_memory",
             inputs=inputs,
             sequence_id=2,
             sequence_start=False,
             sequence_end=False,
         )
+        output_state = output.as_numpy("OUTPUT_STATE")
+        expected_output_state = np.concatenate(
+            [expected_output_state, np.ones(expected_output_state.shape, dtype=np.int8)]
+        )
+        np.testing.assert_equal(output_state, expected_output_state)

-        triton_client.infer(
+        output = triton_client.infer(
             model_name="growable_memory",
             inputs=inputs,
             sequence_id=2,
             sequence_start=False,
             sequence_end=True,
         )
+        output_state = output.as_numpy("OUTPUT_STATE")
+        expected_output_state = np.concatenate(
+            [
+                expected_output_state,
+                np.full(
+                    (expected_output_state.shape[0] // 2,), dtype=np.int8, fill_value=2
+                ),
+            ]
+        )
+        np.testing.assert_equal(output_state, expected_output_state)

     def test_no_update(self):
         # Test implicit state without updating any state
diff --git a/qa/L0_implicit_state/models/growable_memory/config.pbtxt b/qa/L0_implicit_state/models/growable_memory/config.pbtxt
new file mode 100644
index 0000000000..2bbb955308
--- /dev/null
+++ b/qa/L0_implicit_state/models/growable_memory/config.pbtxt
@@ -0,0 +1,102 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "growable_memory"
+backend: "implicit_state"
+max_batch_size: 0
+sequence_batching {
+  control_input [
+    {
+      name: "START"
+      control [
+        {
+          kind: CONTROL_SEQUENCE_START
+          fp32_false_true: [ 0, 1 ]
+        }
+      ]
+    },
+    {
+      name: "READY"
+      control [
+        {
+          kind: CONTROL_SEQUENCE_READY
+          fp32_false_true: [ 0, 1 ]
+        }
+      ]
+    },
+    {
+      name: "END"
+      control [
+        {
+          kind: CONTROL_SEQUENCE_END
+          fp32_false_true: [ 0, 1 ]
+        }
+      ]
+    }
+  ]
+  state [
+    {
+      input_name: "INPUT_STATE"
+      output_name: "OUTPUT_STATE"
+      data_type: TYPE_INT8
+      dims: [1024, 1024]
+      use_single_buffer: true
+      use_growable_memory: true
+    }
+  ]
+}
+
+input [
+  {
+    name: "INPUT"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "TEST_CASE"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  }
+]
+
+output [
+  {
+    name: "OUTPUT"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "OUTPUT_STATE"
+    data_type: TYPE_INT8
+    dims: [ 1 ]
+  }
+]
+
+instance_group [
+  {
+    count: 1
+    kind : KIND_GPU
+  }
+]
diff --git a/qa/L0_sequence_batcher/test.sh b/qa/L0_sequence_batcher/test.sh
index 8878e94e1f..6aa2f06629 100755
--- a/qa/L0_sequence_batcher/test.sh
+++ b/qa/L0_sequence_batcher/test.sh
@@ -337,21 +337,22 @@ for MODEL in $MODELS; do
         sed -i "s/$(basename $MODEL)/$(basename $MODEL)_full/" config.pbtxt && \
         sed -i "s/minimum_slot_utilization: 0/minimum_slot_utilization: 1/" config.pbtxt)

-    if [ "$USE_SINGLE_BUFFER" == "1" ] && [ "$IMPLICIT_STATE" == "1" ]; then
-      SED_REPLACE_PATTERN="N;N;N;N;N;/state.*dims:.*/a use_single_buffer: true"
-      (cd models0/$(basename $MODEL) && \
-          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
-      (cd models1/$(basename $MODEL) && \
-          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
-      (cd models2/$(basename $MODEL) && \
-          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
-      (cd models4/$(basename $MODEL) && \
-          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
-      (cd queue_delay_models/$(basename $MODEL)_full && \
-          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
-      (cd queue_delay_models/$(basename $MODEL)_half && \
-          sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
-    fi
+    # TODO: Enable single state buffer testing for sequence batcher
+    # if [ "$USE_SINGLE_BUFFER" == "1" ] && [ "$IMPLICIT_STATE" == "1" ]; then
+    #   SED_REPLACE_PATTERN="N;N;N;N;N;/state.*dims:.*/a use_single_buffer: true"
+    #   (cd models0/$(basename $MODEL) && \
+    #       sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+    #   (cd models1/$(basename $MODEL) && \
+    #       sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+    #   (cd models2/$(basename $MODEL) && \
+    #       sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+    #   (cd models4/$(basename $MODEL) && \
+    #       sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+    #   (cd queue_delay_models/$(basename $MODEL)_full && \
+    #       sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+    #   (cd queue_delay_models/$(basename $MODEL)_half && \
+    #       sed -i "$SED_REPLACE_PATTERN" config.pbtxt)
+    # fi
   else
     cp -r $MODEL queue_delay_models/$(basename $MODEL)_full && \
     (cd queue_delay_models/$(basename $MODEL)_full && \
diff --git a/src/test/implicit_state/src/implicit_state.cc b/src/test/implicit_state/src/implicit_state.cc
index 3a63dede10..3f6f4678e7 100644
--- a/src/test/implicit_state/src/implicit_state.cc
+++ b/src/test/implicit_state/src/implicit_state.cc
@@ -1060,12 +1060,17 @@ TRITONBACKEND_ModelInstanceExecute(
       } break;
       case 3: {
         TRITONBACKEND_State* response_state;
-        std::vector<int64_t> shape{1};
+        size_t block_size = sizeof(int8_t) * 1024 * 1024;
+        int64_t current_elements =
+            (instance_state->request_index_ + 1) * 1024 * 1024;
+        std::cout << "current elements are "
+                  << (instance_state->request_index_ + 1) << std::endl;
+        std::vector<int64_t> shape{current_elements};
         GUARDED_RESPOND_IF_ERROR(
             responses, r, request,
             TRITONBACKEND_StateNew(
                 &response_state, request, "OUTPUT_STATE",
-                TRITONSERVER_TYPE_INT32, shape.data() /* data */,
+                TRITONSERVER_TYPE_INT8, shape.data() /* data */,
                 shape.size() /* dim_count */));

         if (responses[r] == nullptr) {
@@ -1081,8 +1086,6 @@ TRITONBACKEND_ModelInstanceExecute(
         int64_t actual_memory_type_id = 0;
         char* buffer;

-        size_t block_size = sizeof(int32_t) * 1024 * 1024;
-
         // Request an output buffer in GPU. This is only for testing purposes
         // to make sure that GPU output buffers can be requested.
         GUARDED_RESPOND_IF_ERROR(
@@ -1098,6 +1101,43 @@ TRITONBACKEND_ModelInstanceExecute(
             buffer + block_size * (instance_state->request_index_),
             instance_state->request_index_, block_size);

+        TRITONBACKEND_Output* response_output;
+        GUARDED_RESPOND_IF_ERROR(
+            responses, r, request,
+            TRITONBACKEND_ResponseOutput(
+                responses[r], &response_output, "OUTPUT_STATE",
+                TRITONSERVER_TYPE_INT8, shape.data() /* data */,
+                shape.size() /* dim_count */));
+
+        actual_memory_type = TRITONSERVER_MEMORY_CPU;
+        actual_memory_type_id = 0;
+        char* output_buffer;
+        GUARDED_RESPOND_IF_ERROR(
+            responses, r, request,
+            TRITONBACKEND_OutputBuffer(
+                response_output, reinterpret_cast<void**>(&output_buffer),
+                block_size * (instance_state->request_index_ + 1),
+                &actual_memory_type, &actual_memory_type_id));
+        if ((responses[r] == nullptr) ||
+            (actual_memory_type != TRITONSERVER_MEMORY_CPU)) {
+          GUARDED_RESPOND_IF_ERROR(
+              responses, r, request,
+              TRITONSERVER_ErrorNew(
+                  TRITONSERVER_ERROR_UNSUPPORTED,
+                  "the backend can only handle CPU tensors"));
+          LOG_MESSAGE(
+              TRITONSERVER_LOG_ERROR,
+              (std::string("request ") + std::to_string(r) +
+               ": the backend can only handle CPU tensors, error "
+               "response sent")
+                  .c_str());
+          continue;
+        }
+
+        cudaMemcpy(
+            output_buffer, buffer,
+            block_size * (instance_state->request_index_ + 1),
+            cudaMemcpyDeviceToHost);
+
         instance_state->state_ = buffer;
       } break;
     }

From d051daed471b37c381ff5067d2dfeca24b352fc6 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Wed, 8 Nov 2023 22:10:43 -0500
Subject: [PATCH 4/9] Fix up

---
 src/test/implicit_state/src/implicit_state.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/test/implicit_state/src/implicit_state.cc b/src/test/implicit_state/src/implicit_state.cc
index 3f6f4678e7..7def94934c 100644
--- a/src/test/implicit_state/src/implicit_state.cc
+++ b/src/test/implicit_state/src/implicit_state.cc
@@ -56,7 +56,7 @@ namespace triton { namespace backend { namespace implicit {
 // sent in a sequence.
 //
 // * USE_GROWABLE_STATE_BUFFER = 3: In this test case we use growable state
-// buffer. currently, growable state buffer only supports CUDA memory.
+// buffer. Currently, growable state buffer only supports CUDA memory.

 #define GUARDED_RESPOND_IF_ERROR(RESPONSES, IDX, REQUEST, X) \
   do { \
@@ -754,7 +754,7 @@ TRITONBACKEND_ModelInstanceExecute(
           *reinterpret_cast<const int32_t*>(test_case_buffer);
       const int32_t ipbuffer_int =
           *reinterpret_cast<const int32_t*>(input_buffer);
-      int32_t ipbuffer_state_int;
+      int32_t ipbuffer_state_int = 0;

       if (test_case_buffer_int != 0) {
         TRITONBACKEND_Input* input_state = nullptr;

From 648b76eb2beeb58b8c6b24ee70cc52dcedfb70cb Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Wed, 8 Nov 2023 23:15:50 -0500
Subject: [PATCH 5/9] Add CUDA virtual address size flag

---
 src/command_line_parser.cc | 25 +++++++++++++++++++++++++
 src/command_line_parser.h  |  1 +
 2 files changed, 26 insertions(+)

diff --git a/src/command_line_parser.cc b/src/command_line_parser.cc
index 0907982d6b..6fa0fe31bb 100644
--- a/src/command_line_parser.cc
+++ b/src/command_line_parser.cc
@@ -347,6 +347,7 @@ enum TritonOptionId {
   OPTION_RATE_LIMIT_RESOURCE,
   OPTION_PINNED_MEMORY_POOL_BYTE_SIZE,
   OPTION_CUDA_MEMORY_POOL_BYTE_SIZE,
+  OPTION_CUDA_VIRTUAL_ADDRESS_SIZE,
   OPTION_RESPONSE_CACHE_BYTE_SIZE,
   OPTION_CACHE_CONFIG,
   OPTION_CACHE_DIR,
@@ -751,6 +752,18 @@ TritonParser::SetupOptions()
       "<GPU device ID>:<pool byte size>. This option can be used multiple "
       "times, but only once per GPU device. Subsequent uses will overwrite "
      "previous uses for the same GPU device. Default is 64 MB."});
+  memory_device_options_.push_back(
+      {OPTION_CUDA_VIRTUAL_ADDRESS_SIZE, "cuda-virtual-address-size",
+       "<integer>:<integer>",
+       "The total CUDA virtual address size that will be used for each "
+       "implicit state when growable memory is used. This value determines "
+       "the maximum size of each implicit state. The state size cannot go "
+       "beyond this value. The argument should be "
+       "2 integers separated by colons in the format "
+       "<GPU device ID>:<CUDA virtual address size>. This option can be used "
+       "multiple "
+       "times, but only once per GPU device. Subsequent uses will overwrite "
+       "previous uses for the same GPU device. Default is 1 GB."});
   memory_device_options_.push_back(
       {OPTION_MIN_SUPPORTED_COMPUTE_CAPABILITY,
        "min-supported-compute-capability", Option::ArgFloat,
@@ -996,6 +1009,14 @@ TritonServerParameters::BuildTritonServerOptions()
           loptions, cuda_pool.first, cuda_pool.second),
       "setting total CUDA memory byte size");
   }
+  for (const auto& cuda_virtual_address_size : cuda_virtual_address_size_) {
+    THROW_IF_ERR(
+        ParseException,
+        TRITONSERVER_ServerOptionsSetCudaVirtualAddressSize(
+            loptions, cuda_virtual_address_size.first,
+            cuda_virtual_address_size.second),
+        "setting total CUDA virtual address size");
+  }
   THROW_IF_ERR(
       ParseException,
       TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability(
@@ -1569,6 +1590,10 @@ TritonParser::Parse(int argc, char** argv)
         lparams.cuda_pools_.push_back(
             ParsePairOption<int, uint64_t>(optarg, ":"));
         break;
+      case OPTION_CUDA_VIRTUAL_ADDRESS_SIZE:
+        lparams.cuda_virtual_address_size_.push_back(
+            ParsePairOption<int, uint64_t>(optarg, ":"));
+        break;
       case OPTION_RESPONSE_CACHE_BYTE_SIZE: {
         cache_size_present = true;
         const auto byte_size = std::to_string(ParseOption<int64_t>(optarg));
diff --git a/src/command_line_parser.h b/src/command_line_parser.h
index 8a34babc98..ef562a3efb 100644
--- a/src/command_line_parser.h
+++ b/src/command_line_parser.h
@@ -145,6 +145,7 @@ struct TritonServerParameters {
   // memory pool configuration
   int64_t pinned_memory_pool_byte_size_{1 << 28};
   std::list<std::pair<int, uint64_t>> cuda_pools_;
+  std::list<std::pair<int, uint64_t>> cuda_virtual_address_size_;

   // [FIXME] this option is broken after backend separation: this should have
   // controlled backend copy behavior but not properly propagate to backend

From 653019a8561ba41d91e44da3316926a39009a5c4 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Thu, 9 Nov 2023 00:22:17 -0500
Subject: [PATCH 6/9] Add missing test files

---
 qa/L0_implicit_state/implicit_state.py        | 12 ++-
 .../models/single_state_buffer/config.pbtxt   | 96 +++++++++++++++++++
 qa/L0_implicit_state/test.sh                  |  2 +-
 3 files changed, 108 insertions(+), 2 deletions(-)
 create mode 100644 qa/L0_implicit_state/models/single_state_buffer/config.pbtxt

diff --git a/qa/L0_implicit_state/implicit_state.py b/qa/L0_implicit_state/implicit_state.py
index cab271600b..5167159717 100755
--- a/qa/L0_implicit_state/implicit_state.py
+++ b/qa/L0_implicit_state/implicit_state.py
@@ -144,7 +144,7 @@ def test_implicit_state_growable_memory(self):
             inputs=inputs,
             sequence_id=2,
             sequence_start=False,
-            sequence_end=True,
+            sequence_end=False,
         )
         output_state = output.as_numpy("OUTPUT_STATE")
         expected_output_state = np.concatenate(
@@ -157,6 +157,16 @@ def test_implicit_state_growable_memory(self):
         )
         np.testing.assert_equal(output_state, expected_output_state)

+        with self.assertRaises(InferenceServerException) as e:
+            triton_client.infer(
+                model_name="growable_memory",
+                inputs=inputs,
+                sequence_id=2,
+                sequence_start=False,
+                sequence_end=True,
+            )
+        print(e)
+
     def test_no_update(self):
         # Test implicit state without updating any state
         triton_client = tritonhttpclient.InferenceServerClient("localhost:8000")
diff --git a/qa/L0_implicit_state/models/single_state_buffer/config.pbtxt b/qa/L0_implicit_state/models/single_state_buffer/config.pbtxt
new file mode 100644
index 0000000000..4751829aef
--- /dev/null
+++ b/qa/L0_implicit_state/models/single_state_buffer/config.pbtxt
@@ -0,0 +1,96 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "single_state_buffer"
+backend: "implicit_state"
+max_batch_size: 0
+sequence_batching {
+  control_input [
+    {
+      name: "START"
+      control [
+        {
+          kind: CONTROL_SEQUENCE_START
+          fp32_false_true: [ 0, 1 ]
+        }
+      ]
+    },
+    {
+      name: "READY"
+      control [
+        {
+          kind: CONTROL_SEQUENCE_READY
+          fp32_false_true: [ 0, 1 ]
+        }
+      ]
+    },
+    {
+      name: "END"
+      control [
+        {
+          kind: CONTROL_SEQUENCE_END
+          fp32_false_true: [ 0, 1 ]
+        }
+      ]
+    }
+  ]
+  state [
+    {
+      input_name: "INPUT_STATE"
+      output_name: "OUTPUT_STATE"
+      data_type: TYPE_INT32
+      dims: 1
+      use_single_buffer: true
+    }
+  ]
+}
+
+input [
+  {
+    name: "INPUT"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "TEST_CASE"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  }
+]
+
+output [
+  {
+    name: "OUTPUT"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  }
+]
+
+instance_group [
+  {
+    count: 1
+    kind : KIND_CPU
+  }
+]
diff --git a/qa/L0_implicit_state/test.sh b/qa/L0_implicit_state/test.sh
index 39984406df..973192ba03 100755
--- a/qa/L0_implicit_state/test.sh
+++ b/qa/L0_implicit_state/test.sh
@@ -96,7 +96,7 @@ for BACKEND in $BACKENDS; do
 done

 CLIENT_LOG=`pwd`/client.log
-SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR}"
+SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR} --cuda-virtual-address-size=0:$((1024*1024*4))"
 IMPLICIT_STATE_CLIENT='implicit_state.py'
 EXPECTED_TEST_NUM=6
 rm -rf $CLIENT_LOG

 run_server

From f4e69773d4a489b6d8008b11d24c41b827ffe0ce Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Thu, 9 Nov 2023 13:08:14 -0500
Subject: [PATCH 7/9] Parameter rename

---
 qa/L0_implicit_state/models/growable_memory/config.pbtxt     | 2 +-
 qa/L0_implicit_state/models/single_state_buffer/config.pbtxt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/qa/L0_implicit_state/models/growable_memory/config.pbtxt b/qa/L0_implicit_state/models/growable_memory/config.pbtxt
index 2bbb955308..630e47ba19 100644
--- a/qa/L0_implicit_state/models/growable_memory/config.pbtxt
+++ b/qa/L0_implicit_state/models/growable_memory/config.pbtxt
@@ -62,7 +62,7 @@
sequence_batching { output_name: "OUTPUT_STATE" data_type: TYPE_INT8 dims: [1024, 1024] - use_single_buffer: true + use_same_buffer_for_input_output: true use_growable_memory: true } ] diff --git a/qa/L0_implicit_state/models/single_state_buffer/config.pbtxt b/qa/L0_implicit_state/models/single_state_buffer/config.pbtxt index 4751829aef..c14962585c 100644 --- a/qa/L0_implicit_state/models/single_state_buffer/config.pbtxt +++ b/qa/L0_implicit_state/models/single_state_buffer/config.pbtxt @@ -62,7 +62,7 @@ sequence_batching { output_name: "OUTPUT_STATE" data_type: TYPE_INT32 dims: 1 - use_single_buffer: true + use_same_buffer_for_input_output: true } ] } From 4b3f7620295f26ca5c97084d7d6671a25c6f5e1c Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Mon, 13 Nov 2023 01:15:50 -0500 Subject: [PATCH 8/9] Test fixes --- qa/L0_implicit_state/implicit_state.py | 10 ---------- qa/L0_implicit_state/test.sh | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/qa/L0_implicit_state/implicit_state.py b/qa/L0_implicit_state/implicit_state.py index 5167159717..2cdf7ff2e0 100755 --- a/qa/L0_implicit_state/implicit_state.py +++ b/qa/L0_implicit_state/implicit_state.py @@ -157,16 +157,6 @@ def test_implicit_state_growable_memory(self): ) np.testing.assert_equal(output_state, expected_output_state) - with self.assertRaises(InferenceServerException) as e: - triton_client.infer( - model_name="growable_memory", - inputs=inputs, - sequence_id=2, - sequence_start=False, - sequence_end=True, - ) - print(e) - def test_no_update(self): # Test implicit state without updating any state triton_client = tritonhttpclient.InferenceServerClient("localhost:8000") diff --git a/qa/L0_implicit_state/test.sh b/qa/L0_implicit_state/test.sh index 973192ba03..0722d29be1 100755 --- a/qa/L0_implicit_state/test.sh +++ b/qa/L0_implicit_state/test.sh @@ -98,7 +98,7 @@ done CLIENT_LOG=`pwd`/client.log SERVER_ARGS="--backend-directory=${BACKEND_DIR} --model-repository=${MODELDIR} --cuda-virtual-address-size=0:$((1024*1024*4))" IMPLICIT_STATE_CLIENT='implicit_state.py' -EXPECTED_TEST_NUM=6 +EXPECTED_TEST_NUM=7 rm -rf $CLIENT_LOG run_server From 95800062bbf0a5a4191742befbd5a600795ac358 Mon Sep 17 00:00:00 2001 From: Iman Tabrizian Date: Tue, 14 Nov 2023 23:27:40 -0500 Subject: [PATCH 9/9] Only build implicit state backend for GPU=ON --- src/test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 6dcd3f8f09..a833976e13 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -102,9 +102,9 @@ add_subdirectory(repoagent/relocation_repoagent repoagent/relocation_repoagent) add_subdirectory(distributed_addsub distributed_addsub) add_subdirectory(dyna_sequence dyna_sequence) add_subdirectory(generative_sequence generative_sequence) -add_subdirectory(implicit_state implicit_state) add_subdirectory(query_backend query_backend) if(${TRITON_ENABLE_GPU}) add_subdirectory(sequence sequence) + add_subdirectory(implicit_state implicit_state) endif()
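For anyone trying this series locally, the pieces combine roughly as follows — a minimal sketch, assuming a GPU-enabled tritonserver build and the model repository assembled by qa/L0_implicit_state/test.sh; the 4 MB virtual address size simply mirrors the SERVER_ARGS value added in PATCH 6:

# Reserve a 4 MB CUDA virtual address range per implicit state on GPU 0
# via PATCH 5's new flag; growable-memory states may grow up to that bound
# (the flag's default is 1 GB per state).
tritonserver --model-repository=`pwd`/models \
    --cuda-virtual-address-size=0:$((1024*1024*4)) &

# Run the client tests, which exercise the new single_state_buffer and
# growable_memory models (TEST_CASE values 2 and 3 in the test backend).
python implicit_state.py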