Skip to content

Commit 9a0cb17

Browse files
authored
Added Prometheus Server to LPG (#857)
* first commit
* separated templates
* remove empty file
* fmt
1 parent 87d11d5 commit 9a0cb17

File tree

4 files changed: +35 −2 lines changed

benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py

+13
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import requests
1414
import time
1515
from typing import AsyncGenerator, List, Optional, Tuple, Dict
16+
from prometheus_client import start_http_server, Histogram
1617

1718
import google.auth
1819
import google.auth.transport.requests
@@ -27,7 +28,12 @@
2728
MIN_SEQ_LEN = 4
2829
CLIENT_TIMEOUT_SEC = 3 * 60 * 60
2930
NEW_TEXT_KEY = "\nOutput:\n"
31+
PROMETHEUS_PORT = 9090
3032

33+
# Prometheus Metrics
34+
prompt_length_metric = Histogram("LatencyProfileGenerator:prompt_length", "Input prompt length", buckets=[2**i for i in range(1, 16)])
35+
response_length_metric = Histogram("LatencyProfileGenerator:response_length", "Response length", buckets=[2**i for i in range(1, 16)])
36+
tpot_metric = Histogram('LatencyProfileGenerator:time_per_output_token', 'Time per output token per request')
3137

3238
def sample_requests(
3339
dataset_path: str,
@@ -264,6 +270,10 @@ async def send_request(
264270

265271
# (prompt len, output len, latency, success)
266272
request_latency = (prompt_len, output_len, (request_end_time - request_start_time))
273+
tpot_metric.observe((request_end_time - request_start_time) / output_len)
274+
prompt_length_metric.observe(prompt_len)
275+
response_length_metric.observe(output_len)
276+
267277
return request_latency, None
268278

269279
async def benchmark(
@@ -589,6 +599,9 @@ async def main(args: argparse.Namespace):
589599
else args.endpoint
590600
)
591601

602+
print(f"Starting Prometheus Server on port {PROMETHEUS_PORT}")
603+
start_http_server(PROMETHEUS_PORT)
604+
592605
api_url = f"http://{args.host}:{args.port}/{endpoint}"
593606
tokenizer = AutoTokenizer.from_pretrained(
594607
args.tokenizer, trust_remote_code=args.trust_remote_code

benchmarks/benchmark/tools/profile-generator/container/requirements.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,5 @@ aioprometheus[starlette]
3434
pynvml == 11.5.0
3535
accelerate
3636
aiohttp
37-
google-auth
37+
google-auth
38+
prometheus_client >= 0.21.0

benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf

+8-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ locals {
2323
? "${path.module}/manifest-templates"
2424
: pathexpand(var.templates_path)
2525
)
26-
latency-profile-generator-template = "${path.module}/manifest-templates/latency-profile-generator.yaml.tpl"
26+
latency-profile-generator-template = "${path.module}/manifest-templates/latency-profile-generator.yaml.tpl"
27+
latency-profile-generator-podmonitoring-template = "${path.module}/manifest-templates/latency-profile-generator-podmonitoring.yaml.tpl"
2728
hugging_face_token_secret = (
2829
var.hugging_face_secret == null || var.hugging_face_secret_version == null
2930
? null
@@ -68,4 +69,10 @@ resource "kubernetes_manifest" "latency-profile-generator" {
6869
save_aggregated_result = var.save_aggregated_result
6970
models = var.models
7071
}))
72+
}
73+
74+
resource "kubernetes_manifest" "latency-profile-generator-podmonitoring" {
75+
manifest = yamldecode(templatefile(local.latency-profile-generator-podmonitoring-template, {
76+
namespace = var.namespace
77+
}))
7178
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
apiVersion: monitoring.googleapis.com/v1
2+
kind: PodMonitoring
3+
metadata:
4+
name: "lpg-driver-podmonitoring"
5+
namespace: ${namespace}
6+
spec:
7+
selector:
8+
matchLabels:
9+
name: latency-profile-generator
10+
endpoints:
11+
- port: 9090
12+
interval: 15s

0 commit comments

Comments (0)