import argparse
import asyncio
+from datetime import datetime
import json
import random
import time
@@ -266,6 +267,42 @@ async def benchmark(
    await asyncio.gather(*tasks)


+def save_json_results(args: argparse.Namespace, benchmark_result):
+    # dimensions values are strings
+    dimensions_json = {}
+    # metrics values are numerical
+    metrics_json = {}
+
+    # Setup
+    current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
+    dimensions_json["date"] = current_dt
+    dimensions_json["backend"] = args.backend
+    dimensions_json["model_id"] = args.model
+    dimensions_json["tokenizer_id"] = args.tokenizer
+    if args.additional_metadata_metrics_to_save is not None:
+        dimensions_json = {
+            **dimensions_json,
+            **json.loads(args.additional_metadata_metrics_to_save),
+        }
+    metrics_json["num_prompts"] = args.num_prompts
+
+    # Traffic
+    metrics_json["request_rate"] = args.request_rate
+    metrics_json = {**metrics_json, **benchmark_result}
+
+    final_json = {}
+    final_json["metrics"] = metrics_json
+    final_json["dimensions"] = dimensions_json
+
+    # Save to file
+    base_model_id = args.model.split("/")[-1]
+    file_name = (
+        f"{args.backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
+    )
+    with open(file_name, "w", encoding="utf-8") as outfile:
+        json.dump(final_json, outfile)
+
+
def main(args: argparse.Namespace):
    print(args)
    random.seed(args.seed)
@@ -305,24 +342,32 @@ def main(args: argparse.Namespace):
            args.model,
        )
    )
+    benchmark_result = {}
    benchmark_end_time = time.time()
    benchmark_time = benchmark_end_time - benchmark_start_time
    print(f"Total time: {benchmark_time:.2f} s")
    print(f"Requests/min: {60 * args.num_prompts / benchmark_time:.2f}")
+    benchmark_result['benchmark_time'] = benchmark_time

    total_output_tokens = np.sum([output_len for _, output_len, _ in
                                  REQUEST_LATENCY])
    output_tokens_per_min = 60 * total_output_tokens / benchmark_time
    print(f"Output_tokens/min: {output_tokens_per_min:.2f}")
+    benchmark_result['total_output_tokens'] = int(total_output_tokens)
+    benchmark_result['output_tokens_per_min'] = output_tokens_per_min

    total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in
                                 REQUEST_LATENCY])
    input_tokens_per_min = 60 * total_input_tokens / benchmark_time
    print(f"Input_tokens/min: {input_tokens_per_min:.2f}")
+    benchmark_result['total_input_tokens'] = int(total_input_tokens)
+    benchmark_result['input_tokens_per_min'] = input_tokens_per_min

    total_tokens = total_input_tokens + total_output_tokens
    tokens_per_min = 60 * total_tokens / benchmark_time
    print(f"Tokens/min: {tokens_per_min:.2f}")
+    benchmark_result['total_tokens'] = int(total_tokens)
+    benchmark_result['tokens_per_min'] = tokens_per_min

    if args.machine_cost:
        print(
@@ -336,6 +381,7 @@ def main(args: argparse.Namespace):
        "Average seconds/request (includes waiting time on server):"
        f" {avg_latency:.2f}"
    )
+    benchmark_result['avg_latency'] = avg_latency

    avg_per_token_latency = np.mean([
        latency / (prompt_len + output_len)
@@ -345,6 +391,7 @@ def main(args: argparse.Namespace):
        "Average milliseconds/token (includes waiting time on server):"
        f" {1000 * avg_per_token_latency:.2f}"
    )
+    benchmark_result['avg_per_token_latency'] = avg_per_token_latency

    avg_per_output_token_latency = np.mean(
        [latency / output_len for _, output_len, latency in REQUEST_LATENCY]
@@ -353,6 +400,7 @@ def main(args: argparse.Namespace):
        "Average milliseconds/output_token (includes waiting time on server):"
        f" {1000 * avg_per_output_token_latency:.2f}"
    )
+    benchmark_result['avg_per_output_token_latency'] = avg_per_output_token_latency

    avg_input_len = np.mean(
        [prompt_len for prompt_len, _, _ in REQUEST_LATENCY]
@@ -361,6 +409,7 @@ def main(args: argparse.Namespace):
        "Average input length:"
        f" {avg_input_len:.2f}"
    )
+    benchmark_result['avg_input_len'] = avg_input_len

    avg_output_len = np.mean(
        [output_len for _, output_len, _ in REQUEST_LATENCY]
@@ -369,6 +418,10 @@ def main(args: argparse.Namespace):
        "Average output length:"
        f" {avg_output_len:.2f}"
    )
+    benchmark_result['avg_output_len'] = avg_output_len
+
+    if args.save_json_results:
+        save_json_results(args, benchmark_result)


if __name__ == "__main__":
@@ -479,6 +532,18 @@ def main(args: argparse.Namespace):
            " and max_output_length."
        ),
    )
+    parser.add_argument(
+        "--save-json-results",
+        action="store_true",
+        help="Whether to save benchmark results to a json file.",
+    )
+    parser.add_argument(
+        "--additional-metadata-metrics-to-save",
+        type=str,
+        help=(
+            "Additional metadata about the workload. Should be a dictionary in"
+            " the form of a string."
+        ),
+    )
    cmd_args = parser.parse_args()
    main(cmd_args)
-
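
For context, a minimal sketch (not part of this change) of how a results file written by `save_json_results()` might be consumed. The `dimensions`/`metrics` keys and the `<backend>-<request_rate>qps-<model>-<timestamp>.json` filename pattern come from the diff above; the glob pattern and the specific fields printed are illustrative only. The metadata flag would typically be passed as a JSON string, e.g. `--additional-metadata-metrics-to-save '{"cluster": "my-test-cluster"}'` (that key name is made up for the example).

```python
# Sketch: load files written by save_json_results() and summarize a few values.
import glob
import json

for path in sorted(glob.glob("*qps-*.json")):
    with open(path, encoding="utf-8") as f:
        result = json.load(f)
    dims = result["dimensions"]    # string-valued metadata: date, backend, model_id, ...
    metrics = result["metrics"]    # numerical results: benchmark_time, tokens_per_min, ...
    print(
        f"{dims['backend']} ({dims['model_id']}) @ {metrics['request_rate']} QPS: "
        f"{metrics['tokens_per_min']:.2f} tokens/min, "
        f"avg latency {metrics['avg_latency']:.2f}s"
    )
```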