triton-inference-server · AndyDai-nv · Aug 23, 2024 · Aug 7, 2024 · Aug 7, 2024 · Aug 8, 2024
diff --git a/genai-perf/docs/goodput_tutorial.md b/genai-perf/docs/goodput_tutorial.md
@@ -0,0 +1,103 @@
+<!--
+Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+ * Neither the name of NVIDIA CORPORATION nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+-->
+
+# Tutorials
+
+## Profile GPT2 running on Triton + vLLM <a id="triton-vllm"></a>
+
+### Run GPT2 on Triton Inference Server using vLLM
+
+<details>
+<summary>See instructions</summary>
+
+Run Triton Inference Server with vLLM backend container:
+
+```bash
+export RELEASE="yy.mm" # e.g. export RELEASE="24.06"
+
+
+docker run -it --net=host --gpus=1 --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/tritonserver:${RELEASE}-vllm-python-py3
+
+# Install Triton CLI (~5 min):
+pip install "git+https://github.com/triton-inference-server/[email protected]"
+
+# Download model:
+triton import -m gpt2 --backend vllm
+
+# Run server:
+triton start
+```
+
+</details>
+
+### Run GenAI-Perf
+
+Run GenAI-Perf from Triton Inference Server SDK container:
+
+```bash
+export RELEASE="yy.mm" # e.g. export RELEASE="24.06"
+
+docker run -it --net=host --gpus=1 nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
+
+# Run GenAI-Perf in the container:
+genai-perf profile \
+  -m gpt2 \
+  --service-kind triton \
+  --backend vllm \
+  --num-prompts 100 \
+  --random-seed 123 \
+  --synthetic-input-tokens-mean 200 \
+  --synthetic-input-tokens-stddev 0 \
+  --streaming \
+  --output-tokens-mean 100 \
+  --output-tokens-stddev 0 \
+  --output-tokens-mean-deterministic \
+  --tokenizer hf-internal-testing/llama-tokenizer \
+  --concurrency 4 \
+  --measurement-interval 800 \
+  --profile-export-file my_profile_export.json \
+  --url localhost:8001 \
+  --goodput time_to_first_tokens:10 inter_token_latencies:1.6
+```
+
+Example output:
+
+```
+                                   LLM Metrics                                    
+┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┓
+┃                Statistic ┃    avg ┃    min ┃    max ┃    p99 ┃    p90 ┃    p75 ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━┩
+│ Time to first token (ms) │   5.60 │   4.00 │  13.03 │  12.69 │   7.02 │   5.04 │
+│ Inter token latency (ms) │   1.63 │   1.38 │   2.28 │   2.23 │   1.82 │   1.63 │
+│     Request latency (ms) │ 184.84 │ 173.10 │ 237.81 │ 237.75 │ 228.81 │ 180.72 │
+│   Output sequence length │ 111.43 │ 103.00 │ 126.00 │ 124.55 │ 115.10 │ 114.00 │
+│    Input sequence length │ 200.00 │ 200.00 │ 200.00 │ 200.00 │ 200.00 │ 200.00 │
+└──────────────────────────┴────────┴────────┴────────┴────────┴────────┴────────┘
+Output token throughput (per sec): 1205.26
+Request goodput (per sec): 7.21
+Request throughput (per sec): 10.82
+```
+
diff --git a/genai-perf/genai_perf/export_data/console_exporter.py b/genai-perf/genai_perf/export_data/console_exporter.py
@@ -66,6 +66,15 @@ def export(self) -> None:
         # System metrics are printed after the table
         for metric in self._metrics.system_metrics:
             line = metric.name.replace("_", " ").capitalize()
+            if metric.name == "request_goodput":
+                if not self._args.goodput:
+                    continue
+                value = self._stats[metric.name]["avg"]
+                if value is None:
+                    value = "N/A"
+                    line += f" ({metric.unit}): {value}"
+                    print(line)
+                    continue
             value = self._stats[metric.name]["avg"]
             line += f" ({metric.unit}): {value:.2f}"
             print(line)

diff --git a/genai-perf/genai_perf/export_data/csv_exporter.py b/genai-perf/genai_perf/export_data/csv_exporter.py
@@ -94,6 +94,14 @@ def _write_system_metrics(self, csv_writer) -> None:
         for metric in self._metrics.system_metrics:
             metric_str = metric.name.replace("_", " ").title()
             metric_str += f" ({metric.unit})"
+            if metric.name == "request_goodput":
+                if not self._args.goodput:
+                    continue
+                value = self._stats[metric.name]["avg"]
+                if value is None:
+                    value = "N/A"
+                    csv_writer.writerow([metric_str, f"{value}"])
+                    continue
             value = self._stats[metric.name]["avg"]
             csv_writer.writerow([metric_str, f"{value:.2f}"])
 

diff --git a/genai-perf/genai_perf/goodput_calculator/__init__.py b/genai-perf/genai_perf/goodput_calculator/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from genai_perf.goodput_calculator.llm_goodput_calculator import LLMGoodputCalculator
+from genai_perf.goodput_calculator.goodput_calculator import GoodputCalculator
diff --git a/genai-perf/genai_perf/goodput_calculator/goodput_calculator.py b/genai-perf/genai_perf/goodput_calculator/goodput_calculator.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional
+
+from genai_perf.metrics import Metrics
+
+class GoodputCalculator(ABC):
+    """A base class to calculate goodput according to SLOs."""
+
+    MS_TO_NS_CONVERSION = 1e6
+
+    def __init__(
+            self,
+            goodput_constraints: Dict[str, float],
+            metric: Metrics,
+            benchmark_duration: float,
+    ) -> None:
+        self._goodput_constraints = goodput_constraints
+        self._benchmark_duration = benchmark_duration
+        self._metric = metric
+        self._goodput = "N/A"
+
+    def compute(self) -> None:
+        """
+        Compute the goodput result.
+
+        The GoodputCalculator class sets valid SLOs from users' input, aggregates
+        request metric values, counts the number of good requests, and calculates
+        the final goodput.
+        """
+        self._set_valid_slos()
+        self._combine_requests_metric_values()
+        good_count = self._count_good_reqs()
+        self._compute_goodput(good_count)
+
+    @abstractmethod
+    def _set_valid_slos(self) -> None:
+        """
+        Check users' Service Level Objectives (SLOs) inputs. 
+        Set the valid ones while logging the invalid ones. 
+        """
+        pass
+
+    @abstractmethod
+    def _combine_requests_metric_values(self) -> None:
+        """
+        Combine values from the metrics that match with the valid SLOs at a
+        per request level.  
+        """
+        pass
+
+    @abstractmethod
+    def _count_good_reqs(self) -> Optional[int]:
+        """Count the number of good requests according to SLOs."""
+        pass
+
+    @abstractmethod
+    def _compute_goodput(self, good_count) -> None:
+        """Compute the goodput."""
+        pass
+
+    @property
+    def goodput(self) -> List[float]:
+        return self._goodput
+