Commit be2c99f

vertex-sdk-bot authored and copybara-github committed
feat: Track the output path for metrics_table in experiments metadata; if an output bucket is specified but no file name is given, generate a unique file name
PiperOrigin-RevId: 742523374
1 parent cd8ecfc commit be2c99f
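
For context, a minimal usage sketch of the behavior this commit adds. The project, bucket, and experiment names below are placeholders, not values from this change: when an output bucket is set on the EvalTask but no output_file_name is passed, evaluate() now generates a timestamped eval_results_<...>.csv name, uploads the results under that prefix, and records the resulting path in the experiment run's metadata.

# Hypothetical example (placeholder project/bucket/experiment); sketches the new default-file-name behavior.
import pandas as pd
import vertexai
from vertexai.evaluation import EvalTask

vertexai.init(
    project="my-project",             # placeholder
    location="us-central1",
    experiment="my-eval-experiment",  # placeholder
)

eval_task = EvalTask(
    dataset=pd.DataFrame({"response": ["hello"], "reference": ["hello"]}),
    metrics=["exact_match"],
    output_uri_prefix="gs://my-bucket",  # bucket given, no output_file_name
)

# With no output_file_name argument, evaluate() generates
# "eval_results_<timestamp>.csv", uploads the results under
# gs://my-bucket/..., and logs the full path as the "output_file"
# parameter of the experiment run.
result = eval_task.evaluate()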

File tree

5 files changed: +115 −65 lines

tests/unit/vertexai/test_evaluation.py

+24 −8
@@ -2086,18 +2086,34 @@ def test_upload_results(self, mock_storage_blob_from_string):
             mock.ANY,
         )
 
-    def test_upload_results_with_default_file_name(self, mock_storage_blob_from_string):
+    def test_upload_results_with_default_output_file_name(
+        self, mock_storage_blob_from_string
+    ):
+        mock_metric_results = _MOCK_EXACT_MATCH_RESULT
         with mock.patch.object(
             aiplatform_utils, "timestamped_unique_name"
         ) as mock_timestamped_unique_name:
-            mock_timestamped_unique_name.return_value = "2025-02-10-12-00-00-12345"
-            evaluation.utils.upload_evaluation_results(
-                MOCK_EVAL_RESULT,
-                _TEST_BUCKET,
-            )
-
+            with mock.patch.object(
+                target=gapic_evaluation_services.EvaluationServiceClient,
+                attribute="evaluate_instances",
+                side_effect=mock_metric_results,
+            ):
+                mock_timestamped_unique_name.return_value = "2025-02-10-12-00-00-12345"
+                eval_dataset = pd.DataFrame(
+                    {
+                        "response": ["test", "text"],
+                        "reference": ["test", "ref"],
+                    }
+                )
+                test_metrics = ["exact_match"]
+                test_eval_task = EvalTask(
+                    dataset=eval_dataset,
+                    metrics=test_metrics,
+                    output_uri_prefix=_TEST_BUCKET,
+                )
+                _ = test_eval_task.evaluate()
         mock_storage_blob_from_string.assert_any_call(
-            uri="gs://test-bucket/eval_results_2025-02-10-12-00-00-12345/eval_results_2025-02-10-12-00-00-12345.csv",
+            uri="gs://test-bucket/eval_results_2025-02-10-12-00-00-12345/summary_metrics.json",
             client=mock.ANY,
         )

vertexai/evaluation/eval_task.py

+32 −13
@@ -22,12 +22,13 @@
 from google.api_core import exceptions
 import vertexai
 from google.cloud.aiplatform import base
+from google.cloud.aiplatform import utils
 from google.cloud.aiplatform.metadata import metadata
 from vertexai import generative_models
 from vertexai.evaluation import _base as eval_base
 from vertexai.evaluation import _evaluation
 from vertexai.evaluation import constants
-from vertexai.evaluation import utils
+from vertexai.evaluation import utils as eval_utils
 from vertexai.evaluation.metrics import (
     _base as metrics_base,
 )
@@ -289,10 +290,10 @@ def __init__(
                 evaluation results.
         """
         self._raw_dataset = dataset
-        self._dataset = utils.load_dataset(dataset)
+        self._dataset = eval_utils.load_dataset(dataset)
         self._metrics = metrics
         self._experiment = experiment
-        self._metric_column_mapping = utils.initialize_metric_column_mapping(
+        self._metric_column_mapping = eval_utils.initialize_metric_column_mapping(
             metric_column_mapping, self._dataset
         )
         self.output_uri_prefix = output_uri_prefix
@@ -320,6 +321,7 @@ def _evaluate_with_experiment(
         experiment_run_name: Optional[str] = None,
         evaluation_service_qps: Optional[float] = None,
         retry_timeout: float = 120.0,
+        output_file_name: Optional[str] = None,
     ) -> EvalResult:
         """Runs an evaluation for the EvalTask with an experiment.
 
@@ -336,13 +338,19 @@
             evaluation_service_qps: The custom QPS limit for the evaluation service.
             retry_timeout: How long to keep retrying the evaluation requests for
                 the whole evaluation dataset, in seconds.
+            output_file_name: The file name with csv suffix to store the output
+                metrics_table to be tracked in the experiment run.
 
         Returns:
             The evaluation result.
         """
         self._validate_experiment_run()
         with vertexai.preview.start_run(experiment_run_name):
-            self._log_eval_experiment_param(model, prompt_template)
+            self._log_eval_experiment_param(
+                model=model,
+                prompt_template=prompt_template,
+                output_file_name=output_file_name,
+            )
             eval_result = _evaluation.evaluate(
                 dataset=self._dataset,
                 metrics=self._metrics,
@@ -413,6 +421,8 @@ def evaluate(
                 "`vertexai.init(experiment='experiment_name')`for logging this"
                 " evaluation run."
             )
+        if self.output_uri_prefix and not output_file_name:
+            output_file_name = f"eval_results_{utils.timestamped_unique_name()}.csv"
         self._verify_and_set_response_column_name(
             response_column_name=response_column_name,
             metric_column_mapping_key=constants.Dataset.MODEL_RESPONSE_COLUMN,
@@ -433,6 +443,7 @@ def evaluate(
                 experiment_run_name=experiment_run_name,
                 evaluation_service_qps=evaluation_service_qps,
                 retry_timeout=retry_timeout,
+                output_file_name=output_file_name,
             )
             metadata._experiment_tracker.set_experiment(
                 experiment=global_experiment_name,
@@ -449,6 +460,7 @@ def evaluate(
                 experiment_run_name=experiment_run_name,
                 evaluation_service_qps=evaluation_service_qps,
                 retry_timeout=retry_timeout,
+                output_file_name=output_file_name,
             )
             metadata._experiment_tracker.reset()
         elif not self._experiment and global_experiment_name:
@@ -458,6 +470,7 @@ def evaluate(
                 experiment_run_name=experiment_run_name,
                 evaluation_service_qps=evaluation_service_qps,
                 retry_timeout=retry_timeout,
+                output_file_name=output_file_name,
             )
         else:
             eval_result = _evaluation.evaluate(
@@ -490,7 +503,7 @@ def evaluate(
         if isinstance(self._raw_dataset, str):
             dataset_uri = self._raw_dataset
 
-        utils.upload_evaluation_results(
+        eval_utils.upload_evaluation_results(
             eval_result,
             self.output_uri_prefix,
             output_file_name,
@@ -513,35 +526,41 @@ def _log_eval_experiment_param(
         self,
         model: Optional[Union[GenerativeModel, Callable[[str], str]]] = None,
         prompt_template: Optional[str] = None,
+        output_file_name: Optional[str] = None,
     ) -> None:
         """Logs variable input parameters of an evaluation to an experiment run."""
-        model_metadata = {}
+        eval_metadata = {}
 
         if prompt_template is not None:
-            model_metadata.update({"prompt_template": prompt_template})
+            eval_metadata.update({"prompt_template": prompt_template})
 
         if isinstance(model, GenerativeModel):
-            model_metadata.update(
+            eval_metadata.update(
                 {
                     "model_name": model._model_name,
                 }
             )
 
             if model._generation_config and isinstance(model._generation_config, dict):
-                model_metadata.update(**model._generation_config)
+                eval_metadata.update(**model._generation_config)
 
             if model._safety_settings and isinstance(model._safety_settings, dict):
                 safety_settings = model._safety_settings
                 safety_settings_as_str = {
                     category.name: threshold.name
                     for category, threshold in safety_settings.items()
                 }
-                model_metadata.update(safety_settings_as_str)
+                eval_metadata.update(safety_settings_as_str)
+
+        if self.output_uri_prefix and output_file_name:
+            eval_metadata.update(
+                {"output_file": self.output_uri_prefix + "/" + output_file_name}
+            )
 
-        if model_metadata:
-            _LOGGER.info(f"Logging Eval Experiment metadata: {model_metadata}")
+        if eval_metadata:
+            _LOGGER.info(f"Logging Eval Experiment metadata: {eval_metadata}")
             try:
-                vertexai.preview.log_params(model_metadata)
+                vertexai.preview.log_params(eval_metadata)
             except (ValueError, TypeError) as e:
                 _LOGGER.warning(f"Experiment metadata logging failed: {str(e)}")
 
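
A short sketch of what the new metadata logging amounts to, assuming the default generated name (the bucket is a placeholder; the timestamp is the value used in the unit test above):

# Placeholder values; mirrors the new branch in _log_eval_experiment_param.
output_uri_prefix = "gs://my-bucket"
output_file_name = "eval_results_2025-02-10-12-00-00-12345.csv"

eval_metadata = {"output_file": output_uri_prefix + "/" + output_file_name}
# Passed to vertexai.preview.log_params(eval_metadata), so the experiment run
# records output_file = "gs://my-bucket/eval_results_2025-02-10-12-00-00-12345.csv".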

vertexai/evaluation/utils.py

+20 −21
@@ -35,8 +35,8 @@
     evaluation_service as gapic_evaluation_services,
 )
 from vertexai.evaluation import _base as eval_base
+from vertexai.evaluation.metrics import _base as metrics_base
 from vertexai.evaluation.metrics import (
-    _base as metrics_base,
     metric_prompt_template as metric_prompt_template_base,
 )
 
@@ -359,26 +359,25 @@ def upload_evaluation_results(
     if eval_result.metrics_table is None:
         return
     if destination_uri_prefix.startswith(_GCS_PREFIX):
-        if not file_name:
-            file_name = f"eval_results_{utils.timestamped_unique_name()}.csv"
-        base_name, extension = os.path.splitext(file_name)
-        file_type = extension.lower()[1:]
-        output_folder = destination_uri_prefix + "/" + base_name
-        metrics_table_path = output_folder + "/" + file_name
-        _upload_pandas_df_to_gcs(
-            eval_result.metrics_table, metrics_table_path, file_type
-        )
-        _upload_evaluation_summary_to_gcs(
-            eval_result.summary_metrics,
-            output_folder + "/summary_metrics.json",
-            candidate_model_name,
-            baseline_model_name,
-            dataset_uri,
-            metrics,
-        )
-        _ipython_utils.display_gen_ai_evaluation_results_button(
-            metrics_table_path.split(_GCS_PREFIX)[1]
-        )
+        if file_name:
+            base_name, extension = os.path.splitext(file_name)
+            file_type = extension.lower()[1:]
+            output_folder = destination_uri_prefix + "/" + base_name
+            metrics_table_path = output_folder + "/" + file_name
+            _upload_pandas_df_to_gcs(
+                eval_result.metrics_table, metrics_table_path, file_type
+            )
+            _upload_evaluation_summary_to_gcs(
+                eval_result.summary_metrics,
+                output_folder + "/summary_metrics.json",
+                candidate_model_name,
+                baseline_model_name,
+                dataset_uri,
+                metrics,
+            )
+            _ipython_utils.display_gen_ai_evaluation_results_button(
+                metrics_table_path.split(_GCS_PREFIX)[1]
+            )
     else:
         raise ValueError(
             f"Unsupported destination URI: {destination_uri_prefix}."

vertexai/preview/evaluation/eval_task.py

+20 −3
@@ -24,6 +24,7 @@
 from google.api_core import exceptions
 import vertexai
 from google.cloud.aiplatform import base
+from google.cloud.aiplatform import utils
 from google.cloud.aiplatform.metadata import metadata
 from vertexai import generative_models
 from vertexai.preview import reasoning_engines
@@ -336,6 +337,7 @@ def _evaluate_with_experiment(
         experiment_run_name: Optional[str] = None,
         evaluation_service_qps: Optional[float] = None,
         retry_timeout: float = 120.0,
+        output_file_name: Optional[str] = None,
     ) -> EvalResult:
         """Runs an evaluation for the EvalTask with an experiment.
 
@@ -355,14 +357,19 @@
             evaluation_service_qps: The custom QPS limit for the evaluation service.
             retry_timeout: How long to keep retrying the evaluation requests for
                 the whole evaluation dataset, in seconds.
+            output_path: The file name with csv suffix to store the output
+                metrics_table to be tracked in the experiment run.
 
         Returns:
             The evaluation result.
         """
         self._validate_experiment_run()
         with vertexai.preview.start_run(experiment_run_name):
             self._log_eval_experiment_param(
-                model=model, runnable=runnable, prompt_template=prompt_template
+                model=model,
+                runnable=runnable,
+                prompt_template=prompt_template,
+                output_file_name=output_file_name,
             )
             eval_result = _evaluation.evaluate(
                 dataset=self._dataset,
@@ -451,7 +458,8 @@ def evaluate(
             response_column_name=baseline_model_response_column_name,
             metric_column_mapping_key=constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
         )
-
+        if self.output_uri_prefix and not output_file_name:
+            output_file_name = f"eval_results_{utils.timestamped_unique_name()}.csv"
         experiment_run_name = experiment_run_name or f"{uuid.uuid4()}"
         if self._experiment and global_experiment_name:
             metadata._experiment_tracker.set_experiment(  # pylint: disable=protected-access
@@ -464,6 +472,7 @@ def evaluate(
                 experiment_run_name=experiment_run_name,
                 evaluation_service_qps=evaluation_service_qps,
                 retry_timeout=retry_timeout,
+                output_file_name=output_file_name,
             )
             metadata._experiment_tracker.set_experiment(  # pylint: disable=protected-access
                 experiment=global_experiment_name, backing_tensorboard=False
@@ -479,6 +488,7 @@ def evaluate(
                 experiment_run_name=experiment_run_name,
                 evaluation_service_qps=evaluation_service_qps,
                 retry_timeout=retry_timeout,
+                output_file_name=output_file_name,
             )
             metadata._experiment_tracker.reset()  # pylint: disable=protected-access
         elif not self._experiment and global_experiment_name:
@@ -489,6 +499,7 @@ def evaluate(
                 experiment_run_name=experiment_run_name,
                 evaluation_service_qps=evaluation_service_qps,
                 retry_timeout=retry_timeout,
+                output_file_name=output_file_name,
             )
         else:
             eval_result = _evaluation.evaluate(
@@ -503,7 +514,7 @@ def evaluate(
             autorater_config=self._autorater_config,
         )
         eval_utils.upload_evaluation_results(
-            eval_result.metrics_table, self.output_uri_prefix, output_file_name
+            eval_result, self.output_uri_prefix, output_file_name
         )
         return eval_result
 
@@ -522,6 +533,7 @@ def _log_eval_experiment_param(
         model: _ModelType = None,
         runnable: _RunnableType = None,
         prompt_template: Optional[str] = None,
+        output_file_name: Optional[str] = None,
     ) -> None:
         """Logs variable input parameters of an evaluation to an experiment run."""
         eval_metadata = {}
@@ -568,6 +580,11 @@ def _log_eval_experiment_param(
             }  # pylint: disable=protected-access
         )
 
+        if self.output_uri_prefix and output_file_name:
+            eval_metadata.update(
+                {"output_file": self.output_uri_prefix + "/" + output_file_name}
+            )
+
         if eval_metadata:
             _LOGGER.info(
                 f"Logging Eval experiment evaluation metadata: {eval_metadata}"

vertexai/preview/evaluation/utils.py

+19 −20
@@ -368,26 +368,25 @@ def upload_evaluation_results(
     if eval_result.metrics_table is None:
         return
     if destination_uri_prefix.startswith(_GCS_PREFIX):
-        if not file_name:
-            file_name = f"eval_results_{utils.timestamped_unique_name()}.csv"
-        base_name, extension = os.path.splitext(file_name)
-        file_type = extension.lower()[1:]
-        output_folder = destination_uri_prefix + "/" + base_name
-        metrics_table_path = output_folder + "/" + file_name
-        _upload_pandas_df_to_gcs(
-            eval_result.metrics_table, metrics_table_path, file_type
-        )
-        _upload_evaluation_summary_to_gcs(
-            eval_result.summary_metrics,
-            output_folder + "/summary_metrics.json",
-            candidate_model_name,
-            baseline_model_name,
-            dataset_uri,
-            metrics,
-        )
-        _ipython_utils.display_gen_ai_evaluation_results_button(
-            metrics_table_path.split(_GCS_PREFIX)[1]
-        )
+        if file_name:
+            base_name, extension = os.path.splitext(file_name)
+            file_type = extension.lower()[1:]
+            output_folder = destination_uri_prefix + "/" + base_name
+            metrics_table_path = output_folder + "/" + file_name
+            _upload_pandas_df_to_gcs(
+                eval_result.metrics_table, metrics_table_path, file_type
+            )
+            _upload_evaluation_summary_to_gcs(
+                eval_result.summary_metrics,
+                output_folder + "/summary_metrics.json",
+                candidate_model_name,
+                baseline_model_name,
+                dataset_uri,
+                metrics,
+            )
+            _ipython_utils.display_gen_ai_evaluation_results_button(
+                metrics_table_path.split(_GCS_PREFIX)[1]
+            )
     else:
         raise ValueError(
             f"Unsupported destination URI: {destination_uri_prefix}."
