|
10 | 10 | from datetime import datetime
|
11 | 11 | import json
|
12 | 12 | import random
|
| 13 | +import requests |
13 | 14 | import time
|
14 | 15 | from typing import AsyncGenerator, List, Tuple
|
15 | 16 |
|
| 17 | +import google.auth |
| 18 | +import google.auth.transport.requests |
| 19 | + |
16 | 20 | import aiohttp
|
17 | 21 | import numpy as np
|
18 | 22 | from transformers import AutoTokenizer
|
@@ -302,6 +306,60 @@ def save_json_results(args: argparse.Namespace, benchmark_result):
|
302 | 306 | with open(file_name, "w", encoding="utf-8") as outfile:
|
303 | 307 | json.dump(final_json, outfile)
|
304 | 308 |
|
def metrics_to_scrape(backend: str) -> List[str]:
  """Return the server-side metric names to scrape for ``backend``.

  Args:
    backend: Name of the serving backend (e.g. "vllm" or "jetstream").

  Returns:
    The metric names known for that backend, or an empty list when the
    backend has no metrics registered here.
  """
  # Built per call so every caller receives its own fresh list objects.
  known_backends = (
      ("vllm", ["vllm:gpu_cache_usage_perc", "vllm:num_requests_waiting"]),
      ("jetstream", ["jetstream_slots_used_percentage", "jetstream_prefill_backlog_size"]),
  )
  for backend_name, metric_names in known_backends:
    if backend == backend_name:
      return metric_names
  return []
| 316 | + |
def _build_metric_queries(metric: str, backend: str, duration: float) -> dict:
  """Return the PromQL aggregation queries for one metric, keyed by stat name."""
  # A "[0s]" range is not a valid PromQL window, so clamp very short runs to 1s.
  window = "%.0fs" % max(1.0, duration)
  # The backend's podmonitoring spec is assumed to be named "$BACKEND-podmonitoring".
  selector = "%s{job='%s-podmonitoring'}[%s]" % (metric, backend, window)
  return {
      "Mean": "avg_over_time(%s)" % selector,
      "Median": "quantile_over_time(0.5, %s)" % selector,
      "Min": "min_over_time(%s)" % selector,
      "Max": "max_over_time(%s)" % selector,
      "P90": "quantile_over_time(0.9, %s)" % selector,
      "P99": "quantile_over_time(0.99, %s)" % selector,
  }


def _fetch_instant_value(url: str, token: str, query: str):
  """Run one instant query and return its first sample value, or None on any failure."""
  request_timeout_s = 30  # Never let a stalled connection hang the benchmark.
  try:
    http_response = requests.get(
        url=url,
        headers={'Authorization': 'Bearer ' + token},
        params={'query': query},
        timeout=request_timeout_s,
    )
  except requests.RequestException as err:
    print("HTTP Error: %s" % (err))
    return None

  # Error responses are not guaranteed to carry a JSON body.
  try:
    payload = http_response.json()
  except ValueError:
    payload = None

  if not http_response.ok:
    print("HTTP Error: %s" % (payload if payload is not None else http_response.text))
    return None
  if not isinstance(payload, dict):
    print("HTTP Error: %s" % (http_response.text))
    return None
  if payload.get("status") != "success":
    print("Cloud Monitoring PromQL Error: %s" % (payload.get("error")))
    return None

  # A successful query may still match no series (e.g. metric not exported yet).
  samples = (payload.get("data") or {}).get("result") or []
  if not samples:
    print("Cloud Monitoring PromQL Error: no data returned for query %s" % (query))
    return None
  return samples[0]["value"][1]


def print_metrics(metrics: List[str], duration: float, backend: str):
  """Query Cloud Monitoring for summary statistics of server metrics and print them.

  For each metric, the Mean, Median, Min, Max, P90 and P99 over the last
  ``duration`` seconds are requested through the Cloud Monitoring PromQL
  endpoint. Queries that fail or return no data are reported on stdout and
  omitted from the result; they do not raise.

  Args:
    metrics: Metric names to query.
    duration: Length of the look-back window in seconds.
    backend: Backend name; its podmonitoring job is assumed to be named
      "<backend>-podmonitoring".

  Returns:
    A dict mapping each metric name to a dict of {statistic name: value},
    where each value is taken verbatim from the query response.
  """
  # Nothing to scrape: skip credential discovery entirely.
  if not metrics:
    return {}

  # Creates a credentials object from the default service account file
  # Assumes that script has appropriate default credentials set up, ref:
  # https://googleapis.dev/python/google-auth/latest/user-guide.html#application-default-credentials
  credentials, project_id = google.auth.default()
  # Prepare an authentication request - helps format the request auth token
  auth_req = google.auth.transport.requests.Request()

  url = 'https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/query' % (project_id)

  all_metric_results = {}

  for metric in metrics:
    print("Metric Name: %s" % (metric))
    metric_results = {}
    for query_name, query in _build_metric_queries(metric, backend, duration).items():
      # Refresh only when the token is missing or expired instead of on every query.
      if not credentials.valid:
        credentials.refresh(auth_req)

      value = _fetch_instant_value(url, credentials.token, query)
      if value is not None:
        metric_results[query_name] = value
        print("%s: %s" % (query_name, value))
    all_metric_results[metric] = metric_results
  return all_metric_results
| 362 | + |
305 | 363 |
|
306 | 364 | def main(args: argparse.Namespace):
|
307 | 365 | print(args)
|
@@ -420,6 +478,10 @@ def main(args: argparse.Namespace):
|
420 | 478 | )
|
421 | 479 | benchmark_result['avg_output_len'] = avg_output_len
|
422 | 480 |
|
| 481 | + if args.scrape_server_metrics: |
| 482 | + server_metrics = print_metrics(metrics_to_scrape(args.backend), benchmark_time, args.backend) |
| 483 | + benchmark_result['server_metrics'] = server_metrics |
| 484 | + |
423 | 485 | if args.save_json_results:
|
424 | 486 | save_json_results(args, benchmark_result)
|
425 | 487 |
|
@@ -545,5 +607,10 @@ def main(args: argparse.Namespace):
|
545 | 607 | " the form of a string."
|
546 | 608 | ),
|
547 | 609 | )
|
| 610 | + parser.add_argument( |
| 611 | + "--scrape-server-metrics", |
| 612 | + action="store_true", |
| 613 | + help="Whether to scrape server metrics.", |
| 614 | + ) |
548 | 615 | cmd_args = parser.parse_args()
|
549 | 616 | main(cmd_args)
|
0 commit comments