
Commit 41cd5a8

vertex-sdk-bot authored and copybara-github committed
feat: GenAI Evaluation: Release GenAI Evaluation SDK Agent Evaluation features to vertexai.preview module.
PiperOrigin-RevId: 698077630
1 parent df2c650 commit 41cd5a8

18 files changed, +5234 −18 lines changed
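
Based on the new tests and preview modules in this commit, the following is a minimal, hedged sketch of how the released agent-evaluation surface appears to be used. The project/location, dataset values, and the `ToyAgent` placeholder are illustrative only; the tests exercise `evaluate()` against an autospec mock of `reasoning_engines.Queryable`, so whether an arbitrary duck-typed object is accepted as `runnable` is an assumption, and actually running this requires a real GCP project and credentials.

import pandas as pd

import vertexai
from vertexai.preview import evaluation as evaluation_preview


class ToyAgent:
    """Hypothetical stand-in for a vertexai.preview.reasoning_engines.Queryable agent."""

    def query(self, **kwargs):
        # Response shape mirrors _MOCK_RUNNABLE_INFERENCE_RESPONSE in the tests below.
        return {
            "input": kwargs.get("input"),
            "output": "test_output",
            "intermediate_steps": [
                [{"kwargs": {"tool": "test_tool1"}, "tool_output": "test_tool_output"}],
            ],
        }


vertexai.init(project="my-project", location="us-central1")  # illustrative project/location

# Prompts plus reference tool-call trajectories, as in the new test fixture.
agent_dataset = pd.DataFrame(
    {
        "prompt": ["test_input1", "test_input2"],
        "reference_trajectory": [
            [{"tool_name": "test_tool1"}, {"tool_name": "test_tool2"}],
            [{"tool_name": "test_tool3"}, {"tool_name": "test_tool4"}],
        ],
    }
)

eval_task = evaluation_preview.eval_task.EvalTask(
    dataset=agent_dataset,
    metrics=["trajectory_exact_match"],  # pointwise metrics such as COHERENCE are exercised the same way
)
result = eval_task.evaluate(runnable=ToyAgent())  # responses/predicted trajectories come from querying the runnable
print(result.summary_metrics)  # e.g. row_count, trajectory_exact_match/mean, trajectory_exact_match/std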

tests/unit/vertexai/test_evaluation.py

+178 −8
@@ -32,6 +32,12 @@
 from google.cloud.aiplatform_v1.types import (
     evaluation_service as gapic_evaluation_service_types,
 )
+from google.cloud.aiplatform_v1beta1.services import (
+    evaluation_service as gapic_evaluation_services_preview,
+)
+from google.cloud.aiplatform_v1beta1.types import (
+    evaluation_service as gapic_evaluation_service_types_preview,
+)
 from vertexai import evaluation
 from vertexai import generative_models
 from vertexai.evaluation import _base as eval_base
@@ -45,13 +51,19 @@
 )
 from vertexai.evaluation.metrics import pairwise_metric
 from vertexai.evaluation.metrics import pointwise_metric
+from vertexai.preview import evaluation as evaluation_preview
+from vertexai.preview import reasoning_engines
 import numpy as np
 import pandas as pd
 import pytest


 EvalTask = eval_task.EvalTask
+EvalTaskPreview = evaluation_preview.eval_task.EvalTask
 Pointwise = metric_prompt_template_examples.MetricPromptTemplateExamples.Pointwise
+PointwisePreview = (
+    evaluation_preview.metrics.metric_prompt_template_examples.MetricPromptTemplateExamples.Pointwise
+)
 Pairwise = metric_prompt_template_examples.MetricPromptTemplateExamples.Pairwise

 _TEST_PROJECT = "test-project"
@@ -142,6 +154,15 @@
         "instruction": ["test", "instruction"],
     }
 )
+_TEST_AGENT_EVAL_DATASET_WITHOUT_RESPONSE = pd.DataFrame(
+    {
+        "prompt": ["test_input1", "test_input2"],
+        "reference_trajectory": [
+            [{"tool_name": "test_tool1"}, {"tool_name": "test_tool2"}],
+            [{"tool_name": "test_tool3"}, {"tool_name": "test_tool4"}],
+        ],
+    },
+)
 _TEST_EVAL_DATASET_ALL_INCLUDED = pd.DataFrame(
     {
         "prompt": ["test_prompt", "text_prompt"],
@@ -300,6 +321,25 @@
 {response}
 """

+_MOCK_RUNNABLE_INFERENCE_RESPONSE = [
+    {
+        "input": "test_input",
+        "output": "test_output",
+        "intermediate_steps": [
+            [{"kwargs": {"tool": "test_tool1"}, "tool_output": "test_tool_output"}],
+            [{"kwargs": {"tool": "test_tool2"}, "tool_output": "test_tool_output"}],
+        ],
+    },
+    {
+        "input": "test_input",
+        "output": "test_output",
+        "intermediate_steps": [
+            [{"kwargs": {"tool": "test_tool2"}, "tool_output": "test_tool_output"}],
+            [{"kwargs": {"tool": "test_tool3"}, "tool_output": "test_tool_output"}],
+        ],
+    },
+]
+
 _MOCK_EXACT_MATCH_RESULT = (
     gapic_evaluation_service_types.EvaluateInstancesResponse(
         exact_match_results=gapic_evaluation_service_types.ExactMatchResults(
@@ -316,6 +356,26 @@
         )
     ),
 )
+_MOCK_TRAJECTORY_EXACT_MATCH_RESULT = (
+    gapic_evaluation_service_types_preview.EvaluateInstancesResponse(
+        trajectory_exact_match_results=gapic_evaluation_service_types_preview.TrajectoryExactMatchResults(
+            trajectory_exact_match_metric_values=[
+                gapic_evaluation_service_types_preview.TrajectoryExactMatchMetricValue(
+                    score=1.0
+                ),
+            ]
+        )
+    ),
+    gapic_evaluation_service_types_preview.EvaluateInstancesResponse(
+        trajectory_exact_match_results=gapic_evaluation_service_types_preview.TrajectoryExactMatchResults(
+            trajectory_exact_match_metric_values=[
+                gapic_evaluation_service_types_preview.TrajectoryExactMatchMetricValue(
+                    score=0.0
+                ),
+            ]
+        )
+    ),
+)
 _MOCK_POINTWISE_RESULT = (
     gapic_evaluation_service_types.EvaluateInstancesResponse(
         pointwise_metric_result=gapic_evaluation_service_types.PointwiseMetricResult(
@@ -354,6 +414,18 @@
         )
     ),
 )
+_MOCK_COHERENCE_RESULT = (
+    gapic_evaluation_service_types_preview.EvaluateInstancesResponse(
+        pointwise_metric_result=gapic_evaluation_service_types_preview.PointwiseMetricResult(
+            score=5, explanation="explanation"
+        )
+    ),
+    gapic_evaluation_service_types_preview.EvaluateInstancesResponse(
+        pointwise_metric_result=gapic_evaluation_service_types_preview.PointwiseMetricResult(
+            score=4, explanation="explanation"
+        )
+    ),
+)
 _MOCK_PAIRWISE_SUMMARIZATION_QUALITY_RESULT = (
     gapic_evaluation_service_types.EvaluateInstancesResponse(
         pairwise_metric_result=gapic_evaluation_service_types.PairwiseMetricResult(
@@ -1177,6 +1249,106 @@ def test_eval_result_experiment_run_logging(self):
         )


+@pytest.mark.usefixtures("google_auth_mock")
+class TestAgentEvaluation:
+    def setup_method(self):
+        vertexai.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+
+    def teardown_method(self):
+        initializer.global_pool.shutdown(wait=True)
+
+    @pytest.mark.parametrize("api_transport", ["grpc", "rest"])
+    def test_runnable_response_eval_with_runnable_inference(self, api_transport):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+            api_transport=api_transport,
+        )
+        mock_runnable = mock.create_autospec(reasoning_engines.Queryable, instance=True)
+        mock_runnable.query.return_value = _MOCK_RUNNABLE_INFERENCE_RESPONSE
+
+        test_metrics = [PointwisePreview.COHERENCE]
+        test_eval_task = EvalTaskPreview(
+            dataset=_TEST_AGENT_EVAL_DATASET_WITHOUT_RESPONSE, metrics=test_metrics
+        )
+        mock_metric_results = _MOCK_COHERENCE_RESULT
+        with mock.patch.object(
+            target=gapic_evaluation_services_preview.EvaluationServiceClient,
+            attribute="evaluate_instances",
+            side_effect=mock_metric_results,
+        ):
+            test_result = test_eval_task.evaluate(
+                runnable=mock_runnable,
+                prompt_template="test prompt template",
+            )
+
+        assert test_result.summary_metrics["row_count"] == 2
+        assert test_result.summary_metrics["coherence/mean"] == 4.5
+        assert test_result.summary_metrics["coherence/std"] == pytest.approx(0.7, 0.1)
+        assert set(test_result.metrics_table.columns.values) == set(
+            [
+                "prompt",
+                "reference_trajectory",
+                "response",
+                "latency_in_seconds",
+                "failure",
+                "predicted_trajectory",
+                "coherence/score",
+                "coherence/explanation",
+            ]
+        )
+        assert list(test_result.metrics_table["coherence/score"].values) == [5, 4]
+        assert list(test_result.metrics_table["coherence/explanation"].values) == [
+            "explanation",
+            "explanation",
+        ]
+
+    @pytest.mark.parametrize("api_transport", ["grpc", "rest"])
+    def test_runnable_trajectory_eval_with_runnable_inference(self, api_transport):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+            api_transport=api_transport,
+        )
+        mock_runnable = mock.create_autospec(reasoning_engines.Queryable, instance=True)
+        mock_runnable.query.return_value = _MOCK_RUNNABLE_INFERENCE_RESPONSE
+
+        test_metrics = ["trajectory_exact_match"]
+        test_eval_task = EvalTaskPreview(
+            dataset=_TEST_AGENT_EVAL_DATASET_WITHOUT_RESPONSE, metrics=test_metrics
+        )
+        mock_metric_results = _MOCK_TRAJECTORY_EXACT_MATCH_RESULT
+        with mock.patch.object(
+            target=gapic_evaluation_services_preview.EvaluationServiceClient,
+            attribute="evaluate_instances",
+            side_effect=mock_metric_results,
+        ):
+            test_result = test_eval_task.evaluate(runnable=mock_runnable)
+
+        assert test_result.summary_metrics["row_count"] == 2
+        assert test_result.summary_metrics["trajectory_exact_match/mean"] == 0.5
+        assert test_result.summary_metrics[
+            "trajectory_exact_match/std"
+        ] == pytest.approx(0.7, 0.1)
+        assert set(test_result.metrics_table.columns.values) == set(
+            [
+                "prompt",
+                "response",
+                "latency_in_seconds",
+                "failure",
+                "predicted_trajectory",
+                "reference_trajectory",
+                "trajectory_exact_match/score",
+            ]
+        )
+        assert list(
+            test_result.metrics_table["trajectory_exact_match/score"].values
+        ) == [1.0, 0.0]
+
+
 @pytest.mark.usefixtures("google_auth_mock")
 class TestEvaluationErrors:
     def setup_method(self):
@@ -1376,11 +1548,10 @@ def test_evaluate_baseline_model_response_column_not_provided(
         ):
             test_eval_task.evaluate()

-    def test_evaluate_response_column_not_provided(
-        self,
-    ):
+    @pytest.mark.parametrize("eval_task_version", [EvalTask, EvalTaskPreview])
+    def test_evaluate_response_column_not_provided(self, eval_task_version):
         test_eval_dataset = _TEST_EVAL_DATASET_SINGLE
-        test_eval_task = EvalTask(
+        test_eval_task = eval_task_version(
             dataset=test_eval_dataset,
             metrics=["exact_match"],
         )
@@ -1395,11 +1566,10 @@ def test_evaluate_response_column_not_provided(
         ):
             test_eval_task.evaluate()

-    def test_evaluate_reference_column_not_provided(
-        self,
-    ):
+    @pytest.mark.parametrize("eval_task_version", [EvalTask, EvalTaskPreview])
+    def test_evaluate_reference_column_not_provided(self, eval_task_version):
         test_eval_dataset = pd.DataFrame({"response": ["test", "text"]})
-        test_eval_task = EvalTask(
+        test_eval_task = eval_task_version(
             dataset=test_eval_dataset,
             metrics=["exact_match"],
         )

vertexai/preview/evaluation/__init__.py

+4 −4
@@ -16,10 +16,10 @@
 #
 """Vertex Gen AI Evaluation Service Module."""

-from vertexai.evaluation import _base
-from vertexai.evaluation import eval_task
-from vertexai.evaluation import metrics
-from vertexai.evaluation import prompt_template
+from vertexai.preview.evaluation import _base
+from vertexai.preview.evaluation import eval_task
+from vertexai.preview.evaluation import metrics
+from vertexai.preview.evaluation import prompt_template


 EvalResult = _base.EvalResult

vertexai/preview/evaluation/_base.py

+97
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Base classes for evaluation."""
+
+
+import dataclasses
+from typing import Dict, List, Optional, TYPE_CHECKING, Union
+
+from google.cloud.aiplatform_v1beta1.services import (
+    evaluation_service as gapic_evaluation_services,
+)
+from vertexai.preview.evaluation.metrics import (
+    _base as metrics_base,
+)
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+
+@dataclasses.dataclass
+class EvaluationRunConfig:
+    """Evaluation Run Configurations.
+
+    Attributes:
+        dataset: The dataset to evaluate.
+        metrics: The list of metric names, or Metric instances to evaluate.
+        metric_column_mapping: An optional dictionary column mapping that
+            overrides the metric prompt template input variable names with the
+            mapped evaluation dataset column names, used during evaluation. For
+            example, if the input_variables of the metric prompt template are
+            ["context", "reference"], the metric_column_mapping can be
+            {"context": "news_context", "reference": "ground_truth",
+            "response": "model_1_response"} if the dataset has columns
+            "news_context", "ground_truth" and "model_1_response".
+        client: The evaluation service client.
+        evaluation_service_qps: The custom QPS limit for the evaluation service.
+        retry_timeout: How long to keep retrying the evaluation requests, in
+            seconds.
+    """

+    dataset: "pd.DataFrame"
+    metrics: List[Union[str, metrics_base._Metric]]
+    metric_column_mapping: Dict[str, str]
+    client: gapic_evaluation_services.EvaluationServiceClient
+    evaluation_service_qps: float
+    retry_timeout: float
+
+    def validate_dataset_column(self, column_name: str) -> None:
+        """Validates that the column names in the column map are in the dataset.
+
+        Args:
+            column_name: The column name to validate.
+
+        Raises:
+            KeyError: If any of the column names are not in the dataset.
+        """
+        if (
+            self.metric_column_mapping.get(column_name, column_name)
+            not in self.dataset.columns
+        ):
+            raise KeyError(
+                "Required column"
+                f" `{self.metric_column_mapping.get(column_name, column_name)}` not"
+                " found in the evaluation dataset. The columns in the evaluation"
+                f" dataset are {list(self.dataset.columns)}."
+            )
+
+
+@dataclasses.dataclass
+class EvalResult:
+    """Evaluation result.
+
+    Attributes:
+        summary_metrics: A dictionary of summary evaluation metrics for an
+            evaluation run.
+        metrics_table: A pandas.DataFrame table containing evaluation dataset
+            inputs, predictions, explanations, and metric results per row.
+        metadata: The metadata for the evaluation run.
+    """
+
+    summary_metrics: Dict[str, float]
+    metrics_table: Optional["pd.DataFrame"] = None
+    metadata: Optional[Dict[str, str]] = None
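
To illustrate the metric_column_mapping and validate_dataset_column behavior defined in the new _base.py above, a minimal sketch follows. Note that `_base` is a private module used internally by the SDK, and the client/QPS/timeout values below are placeholders chosen only so the dataclass can be constructed; `client=None` works here because validate_dataset_column never touches it and dataclasses do not enforce type annotations.

import pandas as pd

from vertexai.preview.evaluation import _base

# Dataset columns use custom names; the mapping bridges them to the metric
# prompt template variables, as described in the EvaluationRunConfig docstring.
dataset = pd.DataFrame(
    {
        "news_context": ["..."],
        "ground_truth": ["..."],
        "model_1_response": ["..."],
    }
)

run_config = _base.EvaluationRunConfig(
    dataset=dataset,
    metrics=["exact_match"],
    metric_column_mapping={
        "context": "news_context",
        "reference": "ground_truth",
        "response": "model_1_response",
    },
    client=None,  # placeholder: not exercised by validate_dataset_column
    evaluation_service_qps=1.0,  # illustrative values
    retry_timeout=600.0,
)

run_config.validate_dataset_column("context")  # passes: "context" maps to the "news_context" column
run_config.validate_dataset_column("instruction")  # raises KeyError: no such column or mapping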
