
Commit e8897e7

jsondai authored and copybara-github committed
feat: GenAI SDK client - Add automatic candidate naming and creation timestamp to evaluation dataset metadata
PiperOrigin-RevId: 773786705
1 parent f8f66f1 commit e8897e7

3 files changed: +182 −1 lines changed
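In short: run_inference now tags its output EvaluationDataset with a candidate_name derived from the model argument (and a gcs_source when results are written to GCS), and _execute_evaluation stamps the result metadata with deduplicated candidate names plus a UTC creation timestamp. A minimal sketch of the inference side, assuming the vertexai.Client entry point used by this SDK; the project, location, and prompt data below are placeholders, not part of the commit:

# Sketch only: assumes the GenAI SDK client from this repo; the project and
# location values and the prompt DataFrame are illustrative placeholders.
import pandas as pd
import vertexai

client = vertexai.Client(project="my-project", location="us-central1")
prompts = pd.DataFrame({"prompt": ["Say hello."]})

# A string model id is used verbatim as the candidate name...
result = client.evals.run_inference(model="gemini-pro", src=prompts)
assert result.candidate_name == "gemini-pro"

# ...while a plain function contributes its __name__.
def my_model_fn(contents):
    return "canned response"

result = client.evals.run_inference(model=my_model_fn, src=prompts)
assert result.candidate_name == "my_model_fn"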

tests/unit/vertexai/genai/test_evals.py

Lines changed: 136 additions & 0 deletions
@@ -154,6 +154,49 @@ def test_inference_with_string_model_success(
                 }
             ),
         )
+        assert inference_result.candidate_name == "gemini-pro"
+        assert inference_result.gcs_source is None
+
+    @mock.patch.object(_evals_utils, "EvalDatasetLoader")
+    def test_inference_with_callable_model_sets_candidate_name(
+        self, mock_eval_dataset_loader
+    ):
+        mock_df = pd.DataFrame({"prompt": ["test prompt"]})
+        mock_eval_dataset_loader.return_value.load.return_value = mock_df.to_dict(
+            orient="records"
+        )
+
+        def my_model_fn(contents):
+            return "callable response"
+
+        inference_result = self.client.evals.run_inference(
+            model=my_model_fn,
+            src=mock_df,
+        )
+        assert inference_result.candidate_name == "my_model_fn"
+        assert inference_result.gcs_source is None
+
+    @mock.patch.object(_evals_utils, "EvalDatasetLoader")
+    def test_inference_with_lambda_model_candidate_name_is_none(
+        self, mock_eval_dataset_loader
+    ):
+        mock_df = pd.DataFrame({"prompt": ["test prompt"]})
+        mock_eval_dataset_loader.return_value.load.return_value = mock_df.to_dict(
+            orient="records"
+        )
+
+        inference_result = self.client.evals.run_inference(
+            model=lambda x: "lambda response",  # pylint: disable=unnecessary-lambda
+            src=mock_df,
+        )
+        # Lambdas may or may not have a __name__ depending on Python version/env,
+        # but it's typically '<lambda>' if it exists.
+        # The code under test uses getattr(model, "__name__", None).
+        assert (
+            inference_result.candidate_name == "<lambda>"
+            or inference_result.candidate_name is None
+        )
+        assert inference_result.gcs_source is None

     @mock.patch.object(_evals_utils, "EvalDatasetLoader")
     def test_inference_with_callable_model_success(self, mock_eval_dataset_loader):
@@ -179,6 +222,8 @@ def mock_model_fn(contents):
                 }
             ),
         )
+        assert inference_result.candidate_name == "mock_model_fn"
+        assert inference_result.gcs_source is None

     @mock.patch.object(_evals_common, "Models")
     @mock.patch.object(_evals_utils, "EvalDatasetLoader")
@@ -224,6 +269,8 @@ def test_inference_with_prompt_template(
                 }
             ),
         )
+        assert inference_result.candidate_name == "gemini-pro"
+        assert inference_result.gcs_source is None

     @mock.patch.object(_evals_common, "Models")
     @mock.patch.object(_evals_utils, "EvalDatasetLoader")
@@ -273,6 +320,10 @@ def test_inference_with_gcs_destination(
         pd.testing.assert_frame_equal(
             inference_result.eval_dataset_df, expected_df_to_save
         )
+        assert inference_result.candidate_name == "gemini-pro"
+        assert inference_result.gcs_source == vertexai_genai_types.GcsSource(
+            uris=[gcs_dest_path]
+        )

     @mock.patch.object(_evals_common, "Models")
     @mock.patch.object(_evals_utils, "EvalDatasetLoader")
@@ -322,6 +373,8 @@ def test_inference_with_local_destination(
             }
         )
         pd.testing.assert_frame_equal(inference_result.eval_dataset_df, expected_df)
+        assert inference_result.candidate_name == "gemini-pro"
+        assert inference_result.gcs_source is None

     @mock.patch.object(_evals_common, "Models")
     @mock.patch.object(_evals_utils, "EvalDatasetLoader")
@@ -405,6 +458,8 @@ def test_inference_from_request_column_save_locally(
             expected_records, key=lambda x: x["request"]
         )
         os.remove(local_dest_path)
+        assert inference_result.candidate_name == "gemini-pro"
+        assert inference_result.gcs_source is None

     @mock.patch.object(_evals_common, "Models")
     def test_inference_from_local_jsonl_file(self, mock_models):
@@ -478,6 +533,8 @@ def test_inference_from_local_jsonl_file(self, mock_models):
             any_order=True,
         )
         os.remove(local_src_path)
+        assert inference_result.candidate_name == "gemini-pro"
+        assert inference_result.gcs_source is None

     @mock.patch.object(_evals_common, "Models")
     def test_inference_from_local_csv_file(self, mock_models):
@@ -548,6 +605,8 @@ def test_inference_from_local_csv_file(self, mock_models):
             any_order=True,
         )
         os.remove(local_src_path)
+        assert inference_result.candidate_name == "gemini-pro"
+        assert inference_result.gcs_source is None

     @mock.patch.object(_evals_common, "Models")
     @mock.patch.object(_evals_utils, "EvalDatasetLoader")
@@ -719,6 +778,8 @@ def mock_generate_content_logic(*args, **kwargs):
             expected_df.sort_values(by="id").reset_index(drop=True),
             check_dtype=False,
         )
+        assert inference_result.candidate_name == "gemini-pro"
+        assert inference_result.gcs_source is None

     @mock.patch.object(_evals_common, "Models")
     @mock.patch.object(_evals_utils, "EvalDatasetLoader")
@@ -794,6 +855,8 @@ def test_inference_with_multimodal_content(
                 }
             ),
         )
+        assert inference_result.candidate_name == "gemini-pro"
+        assert inference_result.gcs_source is None


 class TestMetricPromptBuilder:
@@ -3295,3 +3358,76 @@ def test_execute_evaluation_multiple_datasets(
         assert summary_metric.mean_score == 1.0

         assert mock_eval_dependencies["mock_evaluate_instances"].call_count == 2
+
+    def test_execute_evaluation_deduplicates_candidate_names(
+        self, mock_api_client_fixture, mock_eval_dependencies
+    ):
+        """Tests that duplicate candidate names are indexed."""
+        dataset1 = vertexai_genai_types.EvaluationDataset(
+            eval_dataset_df=pd.DataFrame(
+                [{"prompt": "p1", "response": "r1", "reference": "ref1"}]
+            ),
+            candidate_name="gemini-pro",
+        )
+        dataset2 = vertexai_genai_types.EvaluationDataset(
+            eval_dataset_df=pd.DataFrame(
+                [{"prompt": "p1", "response": "r2", "reference": "ref1"}]
+            ),
+            candidate_name="gemini-flash",
+        )
+        dataset3 = vertexai_genai_types.EvaluationDataset(
+            eval_dataset_df=pd.DataFrame(
+                [{"prompt": "p1", "response": "r3", "reference": "ref1"}]
+            ),
+            candidate_name="gemini-pro",
+        )
+
+        mock_eval_dependencies[
+            "mock_evaluate_instances"
+        ].return_value = vertexai_genai_types.EvaluateInstancesResponse(
+            exact_match_results=vertexai_genai_types.ExactMatchResults(
+                exact_match_metric_values=[
+                    vertexai_genai_types.ExactMatchMetricValue(score=1.0)
+                ]
+            )
+        )
+
+        result = _evals_common._execute_evaluation(
+            api_client=mock_api_client_fixture,
+            dataset=[dataset1, dataset2, dataset3],
+            metrics=[vertexai_genai_types.Metric(name="exact_match")],
+        )
+
+        assert result.metadata.candidate_names == [
+            "gemini-pro #1",
+            "gemini-flash",
+            "gemini-pro #2",
+        ]
+
+    @mock.patch("vertexai._genai._evals_common.datetime")
+    def test_execute_evaluation_adds_creation_timestamp(
+        self, mock_datetime, mock_api_client_fixture, mock_eval_dependencies
+    ):
+        """Tests that creation_timestamp is added to the result metadata."""
+        import datetime
+
+        mock_now = datetime.datetime(
+            2025, 6, 18, 12, 0, 0, tzinfo=datetime.timezone.utc
+        )
+        mock_datetime.datetime.now.return_value = mock_now
+
+        dataset = vertexai_genai_types.EvaluationDataset(
+            eval_dataset_df=pd.DataFrame(
+                [{"prompt": "p", "response": "r", "reference": "r"}]
+            )
+        )
+        metric = vertexai_genai_types.Metric(name="exact_match")
+
+        result = _evals_common._execute_evaluation(
+            api_client=mock_api_client_fixture,
+            dataset=dataset,
+            metrics=[metric],
+        )
+
+        assert result.metadata is not None
+        assert result.metadata.creation_timestamp == mock_now
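A side note on the timestamp test: mock.patch("vertexai._genai._evals_common.datetime") works because _evals_common now does `import datetime` (see the next file), so the patch replaces that module's `datetime` attribute rather than the stdlib itself. The same pattern in isolation; the throwaway module below is a hypothetical stand-in, not SDK code:

import datetime
import types as pytypes
from unittest import mock

# A throwaway module that did `import datetime`, mirroring _evals_common.
mod = pytypes.ModuleType("mod")
mod.datetime = datetime
mod.stamp = lambda: mod.datetime.datetime.now(mod.datetime.timezone.utc)

fixed = datetime.datetime(2025, 6, 18, 12, 0, 0, tzinfo=datetime.timezone.utc)
with mock.patch.object(mod, "datetime") as fake_dt:
    fake_dt.datetime.now.return_value = fixed
    # Inside the patch, code in the module sees the frozen clock.
    assert mod.stamp() == fixed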

vertexai/_genai/_evals_common.py

Lines changed: 39 additions & 1 deletion
@@ -13,7 +13,9 @@
 # limitations under the License.
 #
 """Common utilities for evals."""
+import collections
 import concurrent.futures
+import datetime
 import json
 import logging
 import os
@@ -475,6 +477,17 @@ def _execute_inference(
     end_time = time.time()
     logger.info("Inference completed in %.2f seconds.", end_time - start_time)

+    candidate_name = None
+    if isinstance(model, str):
+        candidate_name = model
+    elif callable(model):
+        candidate_name = getattr(model, "__name__", None)
+
+    evaluation_dataset = types.EvaluationDataset(
+        eval_dataset_df=results_df,
+        candidate_name=candidate_name,
+    )
+
     if dest:
         file_name = "inference_results.jsonl"
         full_dest_path = dest
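The name resolution above is the whole mechanism: a string model id is used verbatim, and anything callable contributes getattr(model, "__name__", None), so plain functions yield their own name while callables without a __name__ (e.g. instances defining __call__) fall back to None. The same logic in isolation; resolve_candidate_name is a hypothetical helper mirroring the inline code, not part of the SDK:

from typing import Callable, Optional, Union

def resolve_candidate_name(model: Union[str, Callable]) -> Optional[str]:
    # Mirrors the candidate_name logic added to _execute_inference above.
    if isinstance(model, str):
        return model
    if callable(model):
        return getattr(model, "__name__", None)
    return None

def my_model_fn(contents):
    return "response"

class StatelessCallable:
    def __call__(self, contents):
        return "response"

assert resolve_candidate_name("gemini-pro") == "gemini-pro"
assert resolve_candidate_name(my_model_fn) == "my_model_fn"
assert resolve_candidate_name(lambda x: x) == "<lambda>"
# Instances have no __name__ attribute, so the getattr default kicks in.
assert resolve_candidate_name(StatelessCallable()) is None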
@@ -500,13 +513,14 @@ def _execute_inference(
                 file_type="jsonl",
             )
             logger.info("Results saved to GCS: %s", full_dest_path)
+            evaluation_dataset.gcs_source = types.GcsSource(uris=[full_dest_path])
         else:
             results_df.to_json(full_dest_path, orient="records", lines=True)
             logger.info("Results saved locally to: %s", full_dest_path)
     except Exception as e:  # pylint: disable=broad-exception-caught
         logger.error("Failed to save results to %s. Error: %s", full_dest_path, e)

-    return types.EvaluationDataset(eval_dataset_df=results_df)
+    return evaluation_dataset


 def _get_dataset_source(
@@ -690,6 +704,19 @@ def _execute_evaluation(
             f"Unsupported dataset type: {type(dataset)}. Must be an"
             " EvaluationDataset or a list of EvaluationDataset."
         )
+    original_candidate_names = [
+        ds.candidate_name or f"candidate_{i+1}" for i, ds in enumerate(dataset_list)
+    ]
+    name_counts = collections.Counter(original_candidate_names)
+    deduped_candidate_names = []
+    current_name_counts = collections.defaultdict(int)
+
+    for name in original_candidate_names:
+        if name_counts[name] > 1:
+            current_name_counts[name] += 1
+            deduped_candidate_names.append(f"{name} #{current_name_counts[name]}")
+        else:
+            deduped_candidate_names.append(name)

     loader = _evals_utils.EvalDatasetLoader(api_client=api_client)
     processed_eval_dataset, num_response_candidates = _resolve_dataset_inputs(
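The dedup above is two passes: the first counts every resolved name, the second appends " #k" only to names that occur more than once, so unique names pass through untouched. The same logic, runnable in isolation; dedupe_names is a hypothetical helper mirroring the inline code:

import collections
from typing import List, Optional

def dedupe_names(names: List[Optional[str]]) -> List[str]:
    # First pass: substitute positional defaults, then count occurrences.
    resolved = [n or f"candidate_{i+1}" for i, n in enumerate(names)]
    counts = collections.Counter(resolved)
    seen = collections.defaultdict(int)
    deduped = []
    # Second pass: index only the names that collide.
    for name in resolved:
        if counts[name] > 1:
            seen[name] += 1
            deduped.append(f"{name} #{seen[name]}")
        else:
            deduped.append(name)
    return deduped

# Matches the expectation in the new unit test.
assert dedupe_names(["gemini-pro", "gemini-flash", "gemini-pro"]) == [
    "gemini-pro #1",
    "gemini-flash",
    "gemini-pro #2",
]
# Unnamed datasets get distinct positional defaults, so no indexing is needed.
assert dedupe_names([None, None]) == ["candidate_1", "candidate_2"]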
@@ -714,6 +741,17 @@ def _execute_evaluation(
     logger.info("Evaluation took: %f seconds", t2 - t1)

     evaluation_result.evaluation_dataset = dataset_list
+
+    if not evaluation_result.metadata:
+        evaluation_result.metadata = types.EvaluationRunMetadata()
+
+    evaluation_result.metadata.creation_timestamp = datetime.datetime.now(
+        datetime.timezone.utc
+    )
+
+    if deduped_candidate_names:
+        evaluation_result.metadata.candidate_names = deduped_candidate_names
+
     logger.info("Evaluation run completed.")

     if dest:
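Note that the timestamp is timezone-aware: datetime.datetime.now(datetime.timezone.utc) returns an aware UTC datetime, unlike the naive datetime.datetime.utcnow(). A quick illustration of the difference:

import datetime

aware = datetime.datetime.now(datetime.timezone.utc)
naive = datetime.datetime.utcnow()  # naive; deprecated since Python 3.12

assert aware.tzinfo is datetime.timezone.utc
assert naive.tzinfo is None
print(aware.isoformat())  # serializes with an explicit "+00:00" offset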

vertexai/_genai/types.py

Lines changed: 7 additions & 0 deletions
@@ -2064,6 +2064,10 @@ class EvaluationDataset(_common.BaseModel):
         default=None,
         description="""The evaluation dataset in the form of a Pandas DataFrame.""",
     )
+    candidate_name: Optional[str] = Field(
+        default=None,
+        description="""The name of the candidate model or agent for this evaluation dataset.""",
+    )
     gcs_source: Optional[GcsSource] = Field(
         default=None,
         description="""The GCS source for the evaluation dataset.""",
@@ -2089,6 +2093,9 @@ class EvaluationDatasetDict(TypedDict, total=False):
     eval_dataset_df: Optional[pd.DataFrame]
     """The evaluation dataset in the form of a Pandas DataFrame."""

+    candidate_name: Optional[str]
+    """The name of the candidate model or agent for this evaluation dataset."""
+
     gcs_source: Optional[GcsSourceDict]
     """The GCS source for the evaluation dataset."""
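Since EvaluationDataset is a pydantic-style _common.BaseModel, the new field is optional and defaults to None. Constructing a dataset with an explicit candidate name then looks like the usage in the new tests; the private import path below assumes the module layout in this commit:

import pandas as pd
from vertexai._genai import types as vertexai_genai_types

dataset = vertexai_genai_types.EvaluationDataset(
    eval_dataset_df=pd.DataFrame(
        [{"prompt": "p1", "response": "r1", "reference": "ref1"}]
    ),
    candidate_name="gemini-pro",
)
assert dataset.candidate_name == "gemini-pro"
assert dataset.gcs_source is None  # only set when inference results land in GCS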
