@@ -32,6 +32,12 @@
from google.cloud.aiplatform_v1.types import (
    evaluation_service as gapic_evaluation_service_types,
)
+from google.cloud.aiplatform_v1beta1.services import (
+    evaluation_service as gapic_evaluation_services_preview,
+)
+from google.cloud.aiplatform_v1beta1.types import (
+    evaluation_service as gapic_evaluation_service_types_preview,
+)
from vertexai import evaluation
from vertexai import generative_models
from vertexai.evaluation import _base as eval_base
@@ -45,13 +51,19 @@
)
from vertexai.evaluation.metrics import pairwise_metric
from vertexai.evaluation.metrics import pointwise_metric
+from vertexai.preview import evaluation as evaluation_preview
+from vertexai.preview import reasoning_engines
import numpy as np
import pandas as pd
import pytest


EvalTask = eval_task.EvalTask
+EvalTaskPreview = evaluation_preview.eval_task.EvalTask
Pointwise = metric_prompt_template_examples.MetricPromptTemplateExamples.Pointwise
+PointwisePreview = (
+    evaluation_preview.metrics.metric_prompt_template_examples.MetricPromptTemplateExamples.Pointwise
+)
Pairwise = metric_prompt_template_examples.MetricPromptTemplateExamples.Pairwise

_TEST_PROJECT = "test-project"
@@ -142,6 +154,15 @@
        "instruction": ["test", "instruction"],
    }
)
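+# Agent eval dataset without a "response" column; responses come from runnable inference in the agent tests.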
+_TEST_AGENT_EVAL_DATASET_WITHOUT_RESPONSE = pd.DataFrame(
+    {
+        "prompt": ["test_input1", "test_input2"],
+        "reference_trajectory": [
+            [{"tool_name": "test_tool1"}, {"tool_name": "test_tool2"}],
+            [{"tool_name": "test_tool3"}, {"tool_name": "test_tool4"}],
+        ],
+    },
+)
_TEST_EVAL_DATASET_ALL_INCLUDED = pd.DataFrame(
    {
        "prompt": ["test_prompt", "text_prompt"],
@@ -300,6 +321,25 @@
{response}
"""

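+# Mocked runnable inference output; "intermediate_steps" carries the tool invocations per row.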
+_MOCK_RUNNABLE_INFERENCE_RESPONSE = [
+    {
+        "input": "test_input",
+        "output": "test_output",
+        "intermediate_steps": [
+            [{"kwargs": {"tool": "test_tool1"}, "tool_output": "test_tool_output"}],
+            [{"kwargs": {"tool": "test_tool2"}, "tool_output": "test_tool_output"}],
+        ],
+    },
+    {
+        "input": "test_input",
+        "output": "test_output",
+        "intermediate_steps": [
+            [{"kwargs": {"tool": "test_tool2"}, "tool_output": "test_tool_output"}],
+            [{"kwargs": {"tool": "test_tool3"}, "tool_output": "test_tool_output"}],
+        ],
+    },
+]
+
_MOCK_EXACT_MATCH_RESULT = (
    gapic_evaluation_service_types.EvaluateInstancesResponse(
        exact_match_results=gapic_evaluation_service_types.ExactMatchResults(
@@ -316,6 +356,26 @@
        )
    ),
)
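+# Mocked EvaluateInstances responses for the trajectory_exact_match metric (scores 1.0 and 0.0).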
+_MOCK_TRAJECTORY_EXACT_MATCH_RESULT = (
+    gapic_evaluation_service_types_preview.EvaluateInstancesResponse(
+        trajectory_exact_match_results=gapic_evaluation_service_types_preview.TrajectoryExactMatchResults(
+            trajectory_exact_match_metric_values=[
+                gapic_evaluation_service_types_preview.TrajectoryExactMatchMetricValue(
+                    score=1.0
+                ),
+            ]
+        )
+    ),
+    gapic_evaluation_service_types_preview.EvaluateInstancesResponse(
+        trajectory_exact_match_results=gapic_evaluation_service_types_preview.TrajectoryExactMatchResults(
+            trajectory_exact_match_metric_values=[
+                gapic_evaluation_service_types_preview.TrajectoryExactMatchMetricValue(
+                    score=0.0
+                ),
+            ]
+        )
+    ),
+)
_MOCK_POINTWISE_RESULT = (
    gapic_evaluation_service_types.EvaluateInstancesResponse(
        pointwise_metric_result=gapic_evaluation_service_types.PointwiseMetricResult(
@@ -354,6 +414,18 @@
        )
    ),
)
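+# Mocked EvaluateInstances responses for the coherence metric (scores 5 and 4).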
+_MOCK_COHERENCE_RESULT = (
+    gapic_evaluation_service_types_preview.EvaluateInstancesResponse(
+        pointwise_metric_result=gapic_evaluation_service_types_preview.PointwiseMetricResult(
+            score=5, explanation="explanation"
+        )
+    ),
+    gapic_evaluation_service_types_preview.EvaluateInstancesResponse(
+        pointwise_metric_result=gapic_evaluation_service_types_preview.PointwiseMetricResult(
+            score=4, explanation="explanation"
+        )
+    ),
+)
_MOCK_PAIRWISE_SUMMARIZATION_QUALITY_RESULT = (
    gapic_evaluation_service_types.EvaluateInstancesResponse(
        pairwise_metric_result=gapic_evaluation_service_types.PairwiseMetricResult(
@@ -1177,6 +1249,106 @@ def test_eval_result_experiment_run_logging(self):
        )


+@pytest.mark.usefixtures("google_auth_mock")
+class TestAgentEvaluation:
+    def setup_method(self):
+        vertexai.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+
+    def teardown_method(self):
+        initializer.global_pool.shutdown(wait=True)
+
+    @pytest.mark.parametrize("api_transport", ["grpc", "rest"])
+    def test_runnable_response_eval_with_runnable_inference(self, api_transport):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+            api_transport=api_transport,
+        )
+        mock_runnable = mock.create_autospec(reasoning_engines.Queryable, instance=True)
+        mock_runnable.query.return_value = _MOCK_RUNNABLE_INFERENCE_RESPONSE
+
+        test_metrics = [PointwisePreview.COHERENCE]
+        test_eval_task = EvalTaskPreview(
+            dataset=_TEST_AGENT_EVAL_DATASET_WITHOUT_RESPONSE, metrics=test_metrics
+        )
+        mock_metric_results = _MOCK_COHERENCE_RESULT
+        with mock.patch.object(
+            target=gapic_evaluation_services_preview.EvaluationServiceClient,
+            attribute="evaluate_instances",
+            side_effect=mock_metric_results,
+        ):
+            test_result = test_eval_task.evaluate(
+                runnable=mock_runnable,
+                prompt_template="test prompt template",
+            )
+
+        assert test_result.summary_metrics["row_count"] == 2
+        assert test_result.summary_metrics["coherence/mean"] == 4.5
+        assert test_result.summary_metrics["coherence/std"] == pytest.approx(0.7, 0.1)
+        assert set(test_result.metrics_table.columns.values) == set(
+            [
+                "prompt",
+                "reference_trajectory",
+                "response",
+                "latency_in_seconds",
+                "failure",
+                "predicted_trajectory",
+                "coherence/score",
+                "coherence/explanation",
+            ]
+        )
+        assert list(test_result.metrics_table["coherence/score"].values) == [5, 4]
+        assert list(test_result.metrics_table["coherence/explanation"].values) == [
+            "explanation",
+            "explanation",
+        ]
+
+    @pytest.mark.parametrize("api_transport", ["grpc", "rest"])
+    def test_runnable_trajectory_eval_with_runnable_inference(self, api_transport):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+            api_transport=api_transport,
+        )
+        mock_runnable = mock.create_autospec(reasoning_engines.Queryable, instance=True)
+        mock_runnable.query.return_value = _MOCK_RUNNABLE_INFERENCE_RESPONSE
+
+        test_metrics = ["trajectory_exact_match"]
+        test_eval_task = EvalTaskPreview(
+            dataset=_TEST_AGENT_EVAL_DATASET_WITHOUT_RESPONSE, metrics=test_metrics
+        )
+        mock_metric_results = _MOCK_TRAJECTORY_EXACT_MATCH_RESULT
+        with mock.patch.object(
+            target=gapic_evaluation_services_preview.EvaluationServiceClient,
+            attribute="evaluate_instances",
+            side_effect=mock_metric_results,
+        ):
+            test_result = test_eval_task.evaluate(runnable=mock_runnable)
+
+        assert test_result.summary_metrics["row_count"] == 2
+        assert test_result.summary_metrics["trajectory_exact_match/mean"] == 0.5
+        assert test_result.summary_metrics[
+            "trajectory_exact_match/std"
+        ] == pytest.approx(0.7, 0.1)
+        assert set(test_result.metrics_table.columns.values) == set(
+            [
+                "prompt",
+                "response",
+                "latency_in_seconds",
+                "failure",
+                "predicted_trajectory",
+                "reference_trajectory",
+                "trajectory_exact_match/score",
+            ]
+        )
+        assert list(
+            test_result.metrics_table["trajectory_exact_match/score"].values
+        ) == [1.0, 0.0]
+
+
@pytest.mark.usefixtures("google_auth_mock")
class TestEvaluationErrors:
    def setup_method(self):
@@ -1376,11 +1548,10 @@ def test_evaluate_baseline_model_response_column_not_provided(
    ):
        test_eval_task.evaluate()

-    def test_evaluate_response_column_not_provided(
-        self,
-    ):
+    @pytest.mark.parametrize("eval_task_version", [EvalTask, EvalTaskPreview])
+    def test_evaluate_response_column_not_provided(self, eval_task_version):
        test_eval_dataset = _TEST_EVAL_DATASET_SINGLE
-        test_eval_task = EvalTask(
+        test_eval_task = eval_task_version(
            dataset=test_eval_dataset,
            metrics=["exact_match"],
        )
@@ -1395,11 +1566,10 @@ def test_evaluate_response_column_not_provided(
    ):
        test_eval_task.evaluate()

-    def test_evaluate_reference_column_not_provided(
-        self,
-    ):
+    @pytest.mark.parametrize("eval_task_version", [EvalTask, EvalTaskPreview])
+    def test_evaluate_reference_column_not_provided(self, eval_task_version):
        test_eval_dataset = pd.DataFrame({"response": ["test", "text"]})
-        test_eval_task = EvalTask(
+        test_eval_task = eval_task_version(
            dataset=test_eval_dataset,
            metrics=["exact_match"],
        )