triton-inference-server · rmccorm4 · Sep 26, 2023 · Sep 26, 2023 · Sep 29, 2023 · Sep 30, 2023
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@
 __pycache__
 tmp
 *.log
+test_results*.txt
diff --git a/qa/L0_http/llm_test.py b/qa/L0_http/llm_test.py
@@ -0,0 +1,135 @@
+#!/usr/bin/python3
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+
+sys.path.append("../common")
+
+import json
+import unittest
+
+import requests
+import test_util as tu
+
+
+class HttpTest(tu.TestResultCollector):
+    # HTTP tests for the "infer" and "generate_stream" model routes. All
+    # requests are sent to localhost:8000.
+
+    def _get_infer_url(self, model_name, route):
+        """Return the URL of `route` for the model named `model_name`."""
+        return f"http://localhost:8000/v2/models/{model_name}/{route}"
+
+    def _simple_infer(self, model_name, inputs, expected_outputs):
+        """POST `inputs` to the infer route and check the returned outputs.
+
+        Raises requests.HTTPError on a 4xx/5xx response. Fails the test if the
+        response's "model_name" or "outputs" differ from what is expected.
+        """
+        headers = {"Content-Type": "application/json"}
+        url = self._get_infer_url(model_name, "infer")
+        r = requests.post(url, data=json.dumps(inputs), headers=headers)
+        r.raise_for_status()
+
+        content = r.json()
+        print(content)
+
+        self.assertEqual(content["model_name"], model_name)
+        self.assertIn("outputs", content)
+        self.assertEqual(content["outputs"], expected_outputs)
+
+    def _simple_generate_stream(self, model_name, inputs, expected_outputs):
+        """POST `inputs` to the generate_stream route and read the SSE reply.
+
+        Checks the HTTP status and the Content-Type header, then collects the
+        "TEXT" field of every event and prints the collected tokens.
+        `expected_outputs` is accepted but not compared against yet.
+
+        Raises requests.HTTPError on a 4xx/5xx response.
+        """
+        import sseclient
+
+        headers = {"Accept": "text/event-stream"}
+        url = self._get_infer_url(model_name, "generate_stream")
+        # stream=True used to indicate response can be iterated over. The
+        # context manager closes the streamed response so its connection is
+        # released even if one of the checks below fails.
+        with requests.post(
+            url, data=json.dumps(inputs), headers=headers, stream=True
+        ) as r:
+            # Report HTTP errors directly instead of as a header or JSON
+            # parsing failure further down.
+            r.raise_for_status()
+
+            # Validate SSE format
+            print(r.headers)
+            self.assertIn("Content-Type", r.headers)
+            # FIXME: Clarify correct header here.
+            # self.assertEqual(r.headers['Content-Type'], 'text/event-stream')
+            self.assertEqual(
+                r.headers["Content-Type"], "text/event-stream; charset=utf-8"
+            )
+
+            # SSE format (data: []) is hard to parse, use helper library for
+            # simplicity
+            client = sseclient.SSEClient(r)
+            tokens = []
+            for i, event in enumerate(client.events()):
+                # End of event stream
+                if event.data == "[DONE]":
+                    continue
+
+                # Parse event data, join events into a single response
+                data = json.loads(event.data)
+                print(f"Event {i}:", data)
+                if "TEXT" not in data:
+                    print("FIXME: EXPECTED OUTPUT FIELD NOT FOUND")
+                else:
+                    tokens.append(data["TEXT"])
+        print("TOKENS:", tokens)
+
+    # Sends a single BYTES input through the infer route and expects it back
+    # unchanged as OUTPUT0.
+    def test_infer(self):
+        model_name = "onnx_zero_1_object"
+        parameters = {}
+
+        # Setup text-based input
+        input0_data = ["hello"]
+        input0 = {
+            "name": "INPUT0",
+            "datatype": "BYTES",
+            "shape": [1, 1],
+            "data": input0_data,
+        }
+        inputs = {"inputs": [input0], "parameters": parameters}
+        # Identity model, output should match input
+        expected_outputs = [
+            {
+                "name": "OUTPUT0",
+                "datatype": "BYTES",
+                "shape": [1, 1],
+                "data": input0_data,
+            }
+        ]
+        self._simple_infer(model_name, inputs, expected_outputs)
+
+    # def test_generate(self):
+    #    pass
+
+    # Sends a prompt through the generate_stream route. Only the HTTP status
+    # and the response header are asserted; the streamed tokens are printed.
+    def test_generate_stream(self):
+        # TODO: vllm
+        model_name = "onnx_zero_1_object"
+        parameters = {}
+        # Setup text-based input
+        input0_data = ["hello"]
+        inputs = {"prompt": input0_data, "stream": True, "parameters": parameters}
+        # Identity model, output should match input
+        expected_outputs = [
+            {
+                "name": "OUTPUT0",
+                "datatype": "BYTES",
+                "shape": [1, 1],
+                "data": input0_data,
+            }
+        ]
+        self._simple_generate_stream(model_name, inputs, expected_outputs)
+
+
+# Run every test in this module when the file is executed as a script.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/qa/L0_http/test.sh b/qa/L0_http/test.sh
@@ -629,6 +629,66 @@ set -e
 kill $SERVER_PID
 wait $SERVER_PID
 
+### LLM REST API Endpoint Tests ###
+
+# Helper library to parse SSE events
+# https://github.com/mpetazzoni/sseclient
+pip install sseclient-py
+
+# Setup model repository
+# -f: do not abort under "set -e" when the directory is already empty
+rm -rf ${MODELDIR}/*
+# TODO: Replace identity model with vllm model
+MODEL_NAME="onnx_zero_1_object"
+# Directory name matches the model name used in every request URL below
+cp -r $DATADIR/qa_identity_model_repository/${MODEL_NAME} ${MODELDIR}/${MODEL_NAME}
+
+SERVER_ARGS="--model-repository=${MODELDIR}"
+SERVER_LOG="./inference_server_llm_test.log"
+CLIENT_LOG="./llm_test.log"
+run_server
+if [ "$SERVER_PID" == "0" ]; then
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    cat $SERVER_LOG
+    exit 1
+fi
+
+## Curl Tests
+# Test that direct curl infer request returns success, more detailed checking
+# will be done via python requests in unit tests.
+# TODO: Use /generate and /generate_stream routes instead.
+# assert_curl_success reads the status from "code", so the value printed by
+# "-w %{http_code}" is captured into it. "set +e" lets a failed request be
+# reported through RET instead of aborting the script.
+set +e
+code=$(curl -s -w "%{http_code}" -o ./curl.out -X POST localhost:8000/v2/models/${MODEL_NAME}/infer -d '{"inputs": [{"name":"INPUT0","datatype":"BYTES","shape":[1,1],"data":["hello"]}]}')
+assert_curl_success "Unexpected infer failure"
+
+# TODO: /generate
+#code=$(curl -s -w "%{http_code}" -o ./curl.out -X POST localhost:8000/v2/models/${MODEL_NAME}/generate -d '{"prompt": "hello", "stream": false}')
+#assert_curl_success "Unexpected generate failure"
+
+code=$(curl -s -w "%{http_code}" -o ./curl.out -X POST localhost:8000/v2/models/${MODEL_NAME}/generate_stream -d '{"prompt": "hello", "stream": true}')
+assert_curl_success "Unexpected generate_stream failure"
+set -e
+
+## Python Unit Tests
+TEST_RESULT_FILE='test_results_llm.txt'
+PYTHON_TEST=llm_test.py
+EXPECTED_NUM_TESTS=2
+set +e
+python3 $PYTHON_TEST >$CLIENT_LOG 2>&1
+if [ $? -ne 0 ]; then
+    cat $CLIENT_LOG
+    RET=1
+else
+    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
+    if [ $? -ne 0 ]; then
+        cat $CLIENT_LOG
+        echo -e "\n***\n*** Test Result Verification Failed\n***"
+        RET=1
+    fi
+fi
+set -e
+
+kill $SERVER_PID
+wait $SERVER_PID
+
+###
+
 if [ $RET -eq 0 ]; then
     echo -e "\n***\n*** Test Passed\n***"
 else

diff --git a/qa/L0_trace/test.sh b/qa/L0_trace/test.sh
@@ -81,24 +81,6 @@ cp -r $DATADIR/$MODELBASE $MODELSDIR/simple && \
 RET=0
 
 # Helpers =======================================
-function assert_curl_success {
-  message="${1}"
-  if [ "$code" != "200" ]; then
-    cat ./curl.out
-    echo -e "\n***\n*** ${message} : line ${BASH_LINENO}\n***"
-    RET=1
-  fi
-}
-
-function assert_curl_failure {
-  message="${1}"
-  if [ "$code" != "400" ]; then
-    cat ./curl.out
-    echo -e "\n***\n*** ${message} : line ${BASH_LINENO}\n***"
-    RET=1
-  fi
-}
-
 function get_global_trace_setting {
   rm -f ./curl.out
   set +e

diff --git a/qa/common/util.sh b/qa/common/util.sh
@@ -508,3 +508,25 @@ remove_array_outliers() {
 
     arr=("${arr[@]:$start_index:$end_index}")
 }
+
+# Curl helpers:
+# 1. Assumes http return code is returned in "code" variable.
+# 2. Assumes output written to "curl.out"
+# Marks the test run as failed (RET=1) unless "code" is "200".
+# On failure, prints ./curl.out followed by the given message and the line
+# number of the call site.
+#   $1: message to print on failure
+function assert_curl_success {
+  # local: keep this shared helper from clobbering a caller's "message"
+  local message="${1}"
+  if [ "$code" != "200" ]; then
+    cat ./curl.out
+    echo -e "\n***\n*** ${message} : line ${BASH_LINENO}\n***"
+    RET=1
+  fi
+}
+
+# Marks the test run as failed (RET=1) unless "code" is exactly "400"; any
+# other status, including other error codes, counts as unexpected.
+# On failure, prints ./curl.out followed by the given message and the line
+# number of the call site.
+#   $1: message to print on failure
+function assert_curl_failure {
+  # local: keep this shared helper from clobbering a caller's "message"
+  local message="${1}"
+  if [ "$code" != "400" ]; then
+    cat ./curl.out
+    echo -e "\n***\n*** ${message} : line ${BASH_LINENO}\n***"
+    RET=1
+  fi
+}
+
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,3 +5,4 @@ @@
     __pycache__
     tmp
     *.log
+    test_results*.txt