@@ -99,6 +99,16 @@
         evaluation_steps=_EVALUATION_STEPS,
     ),
 )
+_TEST_COMET = pointwise_metric.Comet(
+    version="COMET_22_SRC_REF",
+    source_language="en",
+    target_language="zh",
+)
+_TEST_METRICX = pointwise_metric.MetricX(
+    version="METRICX_24_SRC",
+    source_language="en",
+    target_language="zh",
+)
 _TEST_METRICS = (
     "exact_match",
     "bleu",
@@ -139,6 +149,7 @@
         "reference": ["test", "ref"],
         "context": ["test", "context"],
         "instruction": ["test", "instruction"],
+        "source": ["test", "source"],
     }
 )
 _TEST_EVAL_DATASET_SINGLE = pd.DataFrame({"prompt": ["test_prompt", "text_prompt"]})
@@ -305,7 +316,7 @@
         )
     ),
 )
-_MOCK_POINTEWISE_RESULT = (
+_MOCK_POINTWISE_RESULT = (
     gapic_evaluation_service_types.EvaluateInstancesResponse(
         pointwise_metric_result=gapic_evaluation_service_types.PointwiseMetricResult(
             score=5, explanation="explanation"
@@ -423,6 +434,29 @@
         )
     ),
 )
+_EXPECTED_COLUMN_MAPPING = {
+    "context": "context",
+    "reference": "reference",
+    "response": "response",
+    "instruction": "instruction",
+    "prompt": "prompt",
+    "source": "source",
+}
+_MOCK_MODEL_BASED_TRANSLATION_RESULT = (
+    # The order of the responses is important.
+    gapic_evaluation_service_types.EvaluateInstancesResponse(
+        comet_result=gapic_evaluation_service_types.CometResult(score=0.1)
+    ),
+    gapic_evaluation_service_types.EvaluateInstancesResponse(
+        metricx_result=gapic_evaluation_service_types.MetricxResult(score=5)
+    ),
+    gapic_evaluation_service_types.EvaluateInstancesResponse(
+        comet_result=gapic_evaluation_service_types.CometResult(score=0.9)
+    ),
+    gapic_evaluation_service_types.EvaluateInstancesResponse(
+        metricx_result=gapic_evaluation_service_types.MetricxResult(score=20)
+    ),
+)
 
 
 @pytest.fixture(scope="module")
@@ -465,16 +499,10 @@ def test_create_eval_task(self):
         assert test_eval_task.dataset.equals(_TEST_EVAL_DATASET_ALL_INCLUDED)
         assert test_eval_task.metrics == _TEST_METRICS
         assert test_eval_task.experiment == _TEST_EXPERIMENT
-        assert test_eval_task._metric_column_mapping == {
-            "context": "context",
-            "reference": "reference",
-            "response": "response",
-            "instruction": "instruction",
-            "prompt": "prompt",
-        }
+        assert test_eval_task._metric_column_mapping == _EXPECTED_COLUMN_MAPPING
 
     @pytest.mark.parametrize("api_transport", ["grpc", "rest"])
-    def test_compute_automatic_metrics(self, api_transport):
+    def test_compute_exact_match_metric(self, api_transport):
         aiplatform.init(
             project=_TEST_PROJECT,
             location=_TEST_LOCATION,
@@ -521,7 +549,7 @@ def test_compute_pointwise_metrics(self, api_transport):
         test_eval_task = EvalTask(
             dataset=_TEST_EVAL_DATASET_ALL_INCLUDED, metrics=test_metrics
         )
-        mock_metric_results = _MOCK_POINTEWISE_RESULT
+        mock_metric_results = _MOCK_POINTWISE_RESULT
         with mock.patch.object(
             target=gapic_evaluation_services.EvaluationServiceClient,
             attribute="evaluate_instances",
@@ -543,6 +571,7 @@ def test_compute_pointwise_metrics(self, api_transport):
                 "reference",
                 "test_pointwise_metric/score",
                 "test_pointwise_metric/explanation",
+                "source",
             ]
         )
         assert test_result.metrics_table["response"].equals(
@@ -567,7 +596,7 @@ def test_compute_pointwise_metrics_free_string(self):
             metrics=[_TEST_POINTWISE_METRIC_FREE_STRING],
             metric_column_mapping={"abc": "prompt"},
         )
-        mock_metric_results = _MOCK_POINTEWISE_RESULT
+        mock_metric_results = _MOCK_POINTWISE_RESULT
         with mock.patch.object(
             target=gapic_evaluation_services.EvaluationServiceClient,
             attribute="evaluate_instances",
@@ -589,6 +618,7 @@ def test_compute_pointwise_metrics_free_string(self):
                 "reference",
                 "test_pointwise_metric_str/score",
                 "test_pointwise_metric_str/explanation",
+                "source",
             ]
         )
         assert test_result.metrics_table["response"].equals(
@@ -695,6 +725,7 @@ def test_compute_pointwise_metrics_without_model_inference(self, api_transport):
                 "response",
                 "summarization_quality/score",
                 "summarization_quality/explanation",
+                "source",
             ]
         )
         assert list(
@@ -707,6 +738,48 @@ def test_compute_pointwise_metrics_without_model_inference(self, api_transport):
             "explanation",
         ]
 
+    @pytest.mark.parametrize("api_transport", ["grpc", "rest"])
+    def test_compute_model_based_translation_metrics_without_model_inference(
+        self, api_transport
+    ):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+            api_transport=api_transport,
+        )
+        test_metrics = [_TEST_COMET, _TEST_METRICX]
+        test_eval_task = EvalTask(
+            dataset=_TEST_EVAL_DATASET_ALL_INCLUDED, metrics=test_metrics
+        )
+
+        mock_metric_results = _MOCK_MODEL_BASED_TRANSLATION_RESULT
+        with mock.patch.object(
+            target=gapic_evaluation_services.EvaluationServiceClient,
+            attribute="evaluate_instances",
+            side_effect=mock_metric_results,
+        ):
+            test_result = test_eval_task.evaluate()
+
+        assert test_result.summary_metrics["row_count"] == 2
+        assert test_result.summary_metrics["comet/mean"] == 0.5
+        assert test_result.summary_metrics["metricx/mean"] == 12.5
+        assert test_result.summary_metrics["comet/std"] == pytest.approx(0.5, 0.6)
+        assert test_result.summary_metrics["metricx/std"] == pytest.approx(10, 11)
+        assert set(test_result.metrics_table.columns.values) == set(
+            [
+                "context",
+                "instruction",
+                "reference",
+                "prompt",
+                "response",
+                "source",
+                "comet/score",
+                "metricx/score",
+            ]
+        )
+        assert list(test_result.metrics_table["comet/score"].values) == [0.1, 0.9]
+        assert list(test_result.metrics_table["metricx/score"].values) == [5, 20]
+
     @pytest.mark.parametrize("api_transport", ["grpc", "rest"])
     def test_compute_automatic_metrics_with_custom_metric_spec(self, api_transport):
         aiplatform.init(
@@ -940,6 +1013,7 @@ def test_compute_pairwise_metrics_without_model_inference(self, api_transport):
                 "instruction",
                 "pairwise_summarization_quality/pairwise_choice",
                 "pairwise_summarization_quality/explanation",
+                "source",
             ]
         )
         assert list(
@@ -1281,7 +1355,7 @@ def test_evaluate_response_column_and_model_not_provided(self):
         ):
             test_eval_task.evaluate()
 
-    def test_evaluate_baseline_response_column_and_baseline_model_not_provided(
+    def test_evaluate_baseline_model_response_column_not_provided(
         self,
     ):
         test_eval_dataset = _TEST_EVAL_DATASET_SINGLE.copy(deep=True)
@@ -1302,6 +1376,63 @@ def test_evaluate_baseline_response_column_and_baseline_model_not_provided(
         ):
             test_eval_task.evaluate()
 
+    def test_evaluate_response_column_not_provided(
+        self,
+    ):
+        test_eval_dataset = _TEST_EVAL_DATASET_SINGLE
+        test_eval_task = EvalTask(
+            dataset=test_eval_dataset,
+            metrics=["exact_match"],
+        )
+        with pytest.raises(
+            KeyError,
+            match=re.escape(
+                (
+                    "Required column `response` not found in the evaluation "
+                    "dataset. The columns in the evaluation dataset are ['prompt']"
+                )
+            ),
+        ):
+            test_eval_task.evaluate()
+
+    def test_evaluate_reference_column_not_provided(
+        self,
+    ):
+        test_eval_dataset = pd.DataFrame({"response": ["test", "text"]})
+        test_eval_task = EvalTask(
+            dataset=test_eval_dataset,
+            metrics=["exact_match"],
+        )
+        with pytest.raises(
+            KeyError,
+            match=re.escape(
+                (
+                    "Required column `reference` not found in the evaluation "
+                    "dataset. The columns in the evaluation dataset are ['response']"
+                )
+            ),
+        ):
+            test_eval_task.evaluate()
+
+    def test_evaluate_reference_or_source_column_not_provided(
+        self,
+    ):
+        test_eval_dataset = pd.DataFrame({"response": ["test", "text"]})
+        test_eval_task = EvalTask(
+            dataset=test_eval_dataset,
+            metrics=[_TEST_COMET, _TEST_METRICX],
+        )
+        with pytest.raises(
+            KeyError,
+            match=re.escape(
+                (
+                    "Required column `source` not found in the evaluation "
+                    "dataset. The columns in the evaluation dataset are ['response']"
+                )
+            ),
+        ):
+            test_eval_task.evaluate()
+
     def test_evaluate_invalid_prompt_template_variables(self):
         test_eval_task = EvalTask(
             dataset=_TEST_EVAL_DATASET_SINGLE,
@@ -1530,13 +1661,7 @@ def test_initialize_metric_column_mapping(self):
             metric_column_mapping=metric_column_mapping,
             dataset=_TEST_EVAL_DATASET_ALL_INCLUDED,
         )
-        assert converted_metric_column_mapping == {
-            "prompt": "prompt",
-            "response": "response",
-            "reference": "reference",
-            "context": "context",
-            "instruction": "instruction",
-        }
+        assert converted_metric_column_mapping == _EXPECTED_COLUMN_MAPPING
 
 
 class TestPromptTemplate:
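
For orientation, here is a minimal usage sketch of the Comet and MetricX translation metrics that the new fixtures and tests above exercise. It is assembled from the constants in this diff (the COMET_22_SRC_REF and METRICX_24_SRC versions, the source/response/reference columns), not from reference documentation; the import paths and the project/location values are assumptions for illustration only.

import pandas as pd

import vertexai
from vertexai.evaluation import EvalTask
from vertexai.evaluation.metrics import pointwise_metric

# Placeholder project/location; substitute real values.
vertexai.init(project="my-project", location="us-central1")

# Translation eval data: `source` is the text to translate, `response` the
# candidate translation, `reference` a human translation (the SRC_REF COMET
# variant uses it alongside `source`, judging by the fixture names).
eval_dataset = pd.DataFrame(
    {
        "source": ["Hello", "How are you?"],
        "response": ["你好", "你好吗?"],
        "reference": ["你好", "你怎么样?"],
    }
)

comet = pointwise_metric.Comet(
    version="COMET_22_SRC_REF", source_language="en", target_language="zh"
)
metricx = pointwise_metric.MetricX(
    version="METRICX_24_SRC", source_language="en", target_language="zh"
)

result = EvalTask(dataset=eval_dataset, metrics=[comet, metricx]).evaluate()
print(result.summary_metrics)   # includes comet/mean, metricx/mean, row_count
print(result.metrics_table)     # per-row comet/score and metricx/score columns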