
Commit d6ef500

jsondai authored and copybara-github committed
fix: add validation for evaluation dataset fields, update logging info for eval api request count
PiperOrigin-RevId: 633729236
1 parent 7ff8071 commit d6ef500

3 files changed: +33 −4 lines changed


tests/unit/vertexai/test_evaluation.py (+13)
@@ -571,6 +571,19 @@ def test_evaluate_pairwise_metrics_with_multiple_baseline_models(self):
         ):
             test_eval_task.evaluate(model=mock_candidate_model)
 
+    def test_evaluate_invalid_model_and_dataset_input(self):
+        test_eval_task = evaluation.EvalTask(
+            dataset=_TEST_EVAL_DATASET,
+            metrics=_TEST_METRICS,
+        )
+        with pytest.raises(
+            ValueError,
+            match=("The `model` parameter is specified, but the evaluation `dataset`"),
+        ):
+            test_eval_task.evaluate(
+                model=generative_models.GenerativeModel(model_name="invalid_model_name")
+            )
+
 
 @pytest.mark.usefixtures("google_auth_mock")
 class TestEvaluationUtils:
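
For context, a minimal sketch of how the new guard surfaces to SDK users. This assumes `vertexai.init()` has already been called; the dataset values, the `exact_match` metric, and the `gemini-1.0-pro` model name are placeholders, not values taken from this diff.

import pandas as pd
from vertexai.preview import evaluation
from vertexai.generative_models import GenerativeModel

# BYOP dataset: model responses are already present in the "response" column.
byop_dataset = pd.DataFrame({
    "prompt": ["What is 2 + 2?"],
    "reference": ["4"],
    "response": ["4"],
})
eval_task = evaluation.EvalTask(dataset=byop_dataset, metrics=["exact_match"])

# Supported: evaluate the pre-computed responses, no model argument.
eval_task.evaluate()

# After this commit: also passing a model raises ValueError instead of
# silently choosing between the model and the existing response column.
try:
    eval_task.evaluate(model=GenerativeModel("gemini-1.0-pro"))
except ValueError as err:
    print(err)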

vertexai/preview/evaluation/_eval_tasks.py (+1 −1)
@@ -79,7 +79,7 @@ class EvalTask:
     documentation page [Evaluation methods and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval).
 
     Usage:
-    1. To perform bring your own prediction evaluation, provide the model
+    1. To perform bring-your-own-prediction(BYOP) evaluation, provide the model
     responses in the response column in the dataset. The response column name
     is "response" by default, or specify `response_column_name` parameter to
     customize.
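
A sketch of the BYOP path this docstring describes, with a renamed response column. The dataset values and the `coherence` metric are placeholders, and it assumes `response_column_name` is passed to `evaluate()` as the docstring suggests.

import pandas as pd
from vertexai.preview import evaluation

dataset = pd.DataFrame({
    "prompt": ["Summarize the article ..."],
    "model_output": ["A short summary ..."],  # responses generated outside the SDK
})

eval_task = evaluation.EvalTask(dataset=dataset, metrics=["coherence"])
# Evaluate the existing responses; no `model` argument, so the new
# validation in _evaluation.py is not triggered.
result = eval_task.evaluate(response_column_name="model_output")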

vertexai/preview/evaluation/_evaluation.py (+19 −3)
@@ -534,8 +534,7 @@ async def _compute_metrics(
             metric_name = metric
         tasks_by_metric[metric_name].append(task)
 
-    api_request_count = (len(api_metrics) + len(custom_metrics)) * len(
-        evaluation_run_config.dataset)
+    api_request_count = len(api_metrics) * len(evaluation_run_config.dataset)
     _LOGGER.info(
         f"Computing metrics with a total of {api_request_count} Vertex online"
         " evaluation service requests."
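
The arithmetic behind this logging change, as a quick illustration (metric names and counts are made up): custom metrics are computed client-side, so they should no longer inflate the reported number of Vertex online evaluation service requests.

api_metrics = ["coherence", "fluency"]   # served by the online evaluation service
custom_metrics = ["my_local_metric"]     # computed locally, no API requests
dataset_rows = 10

# Before: (len(api_metrics) + len(custom_metrics)) * dataset_rows = 30 (overcounted)
# After:  len(api_metrics) * dataset_rows = 20
api_request_count = len(api_metrics) * dataset_rows
print(api_request_count)  # 20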
@@ -629,7 +628,8 @@ def evaluate(
     Raises:
       ValueError: If the metrics list is empty, or the prompt template is not
         provided for PairwiseMetric, or multiple baseline models are specified for
-        PairwiseMetric instances.
+        PairwiseMetric instances, or both model and dataset model response column
+        are present.
     """
 
     if not metrics:
@@ -655,6 +655,22 @@ def evaluate(
             constants.Dataset.REFERENCE_COLUMN
         )
 
+    if (
+        model
+        and evaluation_run_config.column_map.get(
+            constants.Dataset.MODEL_RESPONSE_COLUMN
+        )
+        in dataset.columns
+    ):
+        raise ValueError(
+            "The `model` parameter is specified, but the evaluation `dataset`"
+            f" contains model response column `{response_column_name}` to perform"
+            " bring-your-own-prediction(BYOP) evaluation. If you would like to"
+            " perform rapid evaluation using the dataset with the existing model"
+            f" response column `{response_column_name}`, please remove the"
+            " `model` input parameter."
+        )
+
     baseline_model = None
     pairwise_metric_exists = any(
         isinstance(metric, metrics_base.PairwiseMetric)
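
Restated outside the SDK internals, the new check amounts to the following. This is a hypothetical standalone helper for illustration only, not code from this change.

import pandas as pd

def check_model_vs_byop(model, dataset: pd.DataFrame, response_column_name: str = "response") -> None:
    # Reject the ambiguous combination: a model that would generate responses
    # AND a dataset column that already holds responses.
    if model is not None and response_column_name in dataset.columns:
        raise ValueError(
            "The `model` parameter is specified, but the evaluation `dataset` "
            f"contains model response column `{response_column_name}`. Remove the "
            "`model` input parameter to evaluate the existing responses."
        )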
