
Commit 3e7bf81

jsondai authored and copybara-github committed
feat: support customizing bring-your-own-response eval use case to use any columns
PiperOrigin-RevId: 686559721
1 parent 7246497 · commit 3e7bf81

3 files changed: +69 -58 lines changed

tests/unit/vertexai/test_evaluation.py (+8 -7)
@@ -1222,11 +1222,12 @@ def test_evaluate_response_column_and_model_not_provided(self):
             metrics=[_TEST_POINTWISE_METRIC],
         )
         with pytest.raises(
-            KeyError,
+            ValueError,
             match=re.escape(
                 (
-                    "Required column `response` not found in the evaluation dataset."
-                    " The columns in the evaluation dataset are ['prompt']."
+                    "Cannot find the `response` column in the evaluation dataset"
+                    " to fill the metric prompt template for"
+                    " `test_pointwise_metric` metric."
                 )
             ),
         ):
@@ -1242,12 +1243,12 @@ def test_evaluate_baseline_response_column_and_baseline_model_not_provided(
             metrics=[_TEST_PAIRWISE_METRIC],
         )
         with pytest.raises(
-            KeyError,
+            ValueError,
             match=re.escape(
                 (
-                    "Required column `baseline_model_response` not found in the"
-                    " evaluation dataset. The columns in the evaluation dataset are"
-                    " ['prompt', 'response']."
+                    "Cannot find the `baseline_model_response` column in the"
+                    " evaluation dataset to fill the metric prompt template for"
+                    " `test_pairwise_metric` metric."
                 )
             ),
         ):
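For orientation, here is a minimal sketch of the behavior these updated assertions check (assuming the public `EvalTask`/`PointwiseMetric` API; the metric name stands in for the `_TEST_POINTWISE_METRIC` fixture, and `vertexai.init()` project setup is omitted): evaluating in bring-your-own-response mode without a `response` column now fails with a `ValueError` carrying the new message, rather than a `KeyError`.

    import pandas as pd
    from vertexai.evaluation import EvalTask, PointwiseMetric

    eval_task = EvalTask(
        dataset=pd.DataFrame({"prompt": ["Say hi."]}),  # note: no `response` column
        metrics=[
            PointwiseMetric(
                metric="test_pointwise_metric",  # hypothetical stand-in name
                metric_prompt_template="Rate this response: {response}",
            )
        ],
    )

    try:
        eval_task.evaluate()  # no `model` passed, so BYOR mode is assumed
    except ValueError as err:
        # Expected message (per the test above): "Cannot find the `response`
        # column in the evaluation dataset to fill the metric prompt template
        # for `test_pointwise_metric` metric."
        print(err)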

vertexai/evaluation/_evaluation.py (+35 -32)
@@ -103,8 +103,8 @@ def _validate_metric_column_map(
     """Validates the column map for metric prompt template usage."""
     for metric in evaluation_run_config.metrics:
         if isinstance(
-            metric, metrics_base._ModelBasedMetric
-        ):  # pylint: disable=protected-access
+            metric, metrics_base._ModelBasedMetric  # pylint: disable=protected-access
+        ):
             for variable in prompt_template_base.PromptTemplate(
                 metric.metric_prompt_template
             ).variables:
@@ -124,6 +124,35 @@ def _validate_metric_column_map(
                 )


+def _validate_dataset_for_automatic_metrics(
+    evaluation_run_config: evaluation_base.EvaluationRunConfig,
+):
+    """Validates the required columns exist in the dataset for automatic metrics."""
+    if set(evaluation_run_config.metrics).intersection(
+        set(constants.Metric.AUTOMATIC_METRIC_LIST)
+    ):
+        if (
+            constants.Dataset.REFERENCE_COLUMN
+            not in evaluation_run_config.metric_column_mapping
+        ):
+            evaluation_run_config.metric_column_mapping[
+                constants.Dataset.REFERENCE_COLUMN
+            ] = constants.Dataset.REFERENCE_COLUMN
+        evaluation_run_config.validate_dataset_column(
+            constants.Dataset.REFERENCE_COLUMN
+        )
+        if (
+            constants.Dataset.MODEL_RESPONSE_COLUMN
+            not in evaluation_run_config.metric_column_mapping
+        ):
+            evaluation_run_config.metric_column_mapping[
+                constants.Dataset.MODEL_RESPONSE_COLUMN
+            ] = constants.Dataset.MODEL_RESPONSE_COLUMN
+        evaluation_run_config.validate_dataset_column(
+            constants.Dataset.MODEL_RESPONSE_COLUMN
+        )
+
+
 def _compute_custom_metrics(
     row_dict: Dict[str, Any],
     custom_metrics: List[metrics_base.CustomMetric],
@@ -392,8 +421,8 @@ def _run_model_inference(
     is_baseline_model = (
         response_column_name == constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN
     )
-    if response_column_name not in evaluation_run_config.metric_column_mapping:
-        if model:
+    if model:
+        if response_column_name not in evaluation_run_config.metric_column_mapping:
             if constants.Dataset.PROMPT_COLUMN in evaluation_run_config.dataset.columns:
                 t1 = time.perf_counter()
                 if isinstance(model, generative_models.GenerativeModel):
@@ -423,8 +452,7 @@ def _run_model_inference(
                     " the model. Mappings in `metric_column_mapping` do not"
                     " apply for model inference and are used for evaluation only."
                 )
-    else:
-        if model:
+        else:
             raise ValueError(
                 "The `model` parameter or `baseline_model` in pairwise metric is"
                 " specified, but the evaluation `dataset` contains model response"
@@ -840,20 +868,6 @@ def evaluate(
         retry_timeout=retry_timeout,
     )

-    if set(evaluation_run_config.metrics).intersection(
-        set(constants.Metric.AUTOMATIC_METRIC_LIST)
-    ):
-        if (
-            constants.Dataset.REFERENCE_COLUMN
-            not in evaluation_run_config.metric_column_mapping
-        ):
-            evaluation_run_config.metric_column_mapping[
-                constants.Dataset.REFERENCE_COLUMN
-            ] = constants.Dataset.REFERENCE_COLUMN
-        evaluation_run_config.validate_dataset_column(
-            constants.Dataset.REFERENCE_COLUMN
-        )
-
     if prompt_template:
         _assemble_prompt_for_dataset(evaluation_run_config, prompt_template)

@@ -862,12 +876,7 @@ def evaluate(
         evaluation_run_config=evaluation_run_config,
         response_column_name=constants.Dataset.MODEL_RESPONSE_COLUMN,
     )
-    evaluation_run_config.validate_dataset_column(
-        metric_column_mapping.get(
-            constants.Dataset.MODEL_RESPONSE_COLUMN,
-            constants.Dataset.MODEL_RESPONSE_COLUMN,
-        )
-    )
+    _validate_dataset_for_automatic_metrics(evaluation_run_config)

     pairwise_metric_exists = any(
         isinstance(metric, pairwise_metric.PairwiseMetric)
@@ -880,12 +889,6 @@ def evaluate(
             evaluation_run_config=evaluation_run_config,
             response_column_name=constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
         )
-        evaluation_run_config.validate_dataset_column(
-            metric_column_mapping.get(
-                constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
-                constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
-            )
-        )

     _validate_metric_column_map(evaluation_run_config)
     t1 = time.perf_counter()
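The refactor above extracts the automatic-metric column handling into `_validate_dataset_for_automatic_metrics`. As a rough illustration of the pattern it centralizes, here is a simplified, hypothetical sketch using plain dicts and sets in place of `EvaluationRunConfig`, `constants.Dataset`, and `constants.Metric` (the helper name and error text below are made up for illustration):

    def fill_and_check_column_mapping(metric_column_mapping, dataset_columns, required_columns):
        """Map each required column to itself unless a custom mapping exists, then verify it."""
        for column in required_columns:
            mapped_column = metric_column_mapping.setdefault(column, column)
            if mapped_column not in dataset_columns:
                raise ValueError(
                    f"Column `{mapped_column}` not found in the evaluation dataset."
                )
        return metric_column_mapping

    # For automatic (computation-based) metrics, `reference` and `response` are the
    # required columns; `response` may already be mapped to a custom column by the
    # new BYOR customization, in which case that mapping is respected.
    mapping = fill_and_check_column_mapping(
        metric_column_mapping={"response": "candidate_answer"},
        dataset_columns={"prompt", "candidate_answer", "reference"},
        required_columns=["reference", "response"],
    )
    # mapping == {"response": "candidate_answer", "reference": "reference"}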

vertexai/evaluation/eval_task.py (+26 -19)
@@ -71,18 +71,24 @@ class EvalTask:
           * baseline_model_response_column_name: "baseline_model_response"

         Requirement for different use cases:
-          * Bring-your-own-response: A `response` column is required. Response
-            column name can be customized by providing `response_column_name`
-            parameter. If a pairwise metric is used and a baseline model is
-            not provided, a `baseline_model_response` column is required.
-            Baseline model response column name can be customized by providing
-            `baseline_model_response_column_name` parameter. If the `response`
-            column or `baseline_model_response` column is present while the
+          * Bring-your-own-response (BYOR): You already have the data that you
+            want to evaluate stored in the dataset. Response column name can be
+            customized by providing `response_column_name` parameter, or in the
+            `metric_column_mapping`. For BYOR pairwise evaluation, the baseline
+            model response column name can be customized by providing
+            `baseline_model_response_column_name` parameter, or
+            in the `metric_column_mapping`. If the `response` column or
+            `baseline_model_response` column is present while the
             corresponding model is specified, an error will be raised.
-          * Perform model inference without a prompt template: A `prompt` column
-            in the evaluation dataset representing the input prompt to the
-            model is required and is used directly as input to the model.
-          * Perform model inference with a prompt template: Evaluation dataset
+
+          * Perform model inference without a prompt template: You have a dataset
+            containing the input prompts to the model and want to perform model
+            inference before evaluation. A column named `prompt` is required
+            in the evaluation dataset and is used directly as input to the model.
+
+          * Perform model inference with a prompt template: You have a dataset
+            containing the input variables to the prompt template and want to
+            assemble the prompts for model inference. Evaluation dataset
             must contain column names corresponding to the variable names in
             the prompt template. For example, if prompt template is
             "Instruction: {instruction}, context: {context}", the dataset must
@@ -371,18 +377,19 @@ def evaluate(

         Args:
             model: A GenerativeModel instance or a custom model function to generate
-               responses to evaluate. If not provided, the evaluation is computed with
-               the `response` column in the `dataset`.
+               responses to evaluate. If not provided, the evaluation can be performed
+               in the bring-your-own-response (BYOR) mode.
             prompt_template: The prompt template to use for the evaluation. If not
                 set, the prompt template that was used to create the EvalTask will be
                 used.
             experiment_run_name: The name of the experiment run to log the evaluation
                 to if an experiment is set for this EvalTask. If not provided, a random
                 unique experiment run name is used.
             response_column_name: The column name of model response in the dataset. If
-               provided, this will override the `response_column_name` of the `EvalTask`.
+               provided, this will override the `metric_column_mapping` of the `EvalTask`.
             baseline_model_response_column_name: The column name of baseline model
-               response in the dataset for pairwise metrics.
+               response in the dataset for pairwise metrics. If provided, this will
+               override the `metric_column_mapping` of the `EvalTask`.
             evaluation_service_qps: The custom QPS limit for the evaluation service.
             retry_timeout: How long to keep retrying the evaluation requests for
                 the whole evaluation dataset, in seconds.
@@ -400,11 +407,11 @@ def evaluate(
                 "`vertexai.init(experiment='experiment_name')`for logging this"
                 " evaluation run."
             )
-        self._verify_response_column_name(
+        self._verify_and_set_response_column_name(
             response_column_name=response_column_name,
             metric_column_mapping_key=constants.Dataset.MODEL_RESPONSE_COLUMN,
         )
-        self._verify_response_column_name(
+        self._verify_and_set_response_column_name(
             response_column_name=baseline_model_response_column_name,
             metric_column_mapping_key=constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
         )
@@ -503,10 +510,10 @@ def _log_eval_experiment_param(
         except (ValueError, TypeError) as e:
             _LOGGER.warning(f"Experiment metadata logging failed: {str(e)}")

-    def _verify_response_column_name(
+    def _verify_and_set_response_column_name(
         self, response_column_name: str, metric_column_mapping_key: str
     ) -> None:
-        """Verifies if model response column name or baseline model response column name is valid."""
+        """Verifies and sets the model response column names."""
         if response_column_name:
             if response_column_name in self._dataset.columns:
                 self._metric_column_mapping[
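Putting the eval_task.py changes together, here is a hedged end-to-end sketch of the customized BYOR pairwise flow (column names, metric name, and dataset contents are made up; `vertexai.init()` project setup is omitted; this is not an official sample):

    import pandas as pd
    from vertexai.evaluation import EvalTask, PairwiseMetric

    dataset = pd.DataFrame({
        "prompt": ["Summarize the article."],
        "candidate_answer": ["Summary from the candidate model."],
        "reference_answer": ["Summary from the baseline model."],
    })

    eval_task = EvalTask(
        dataset=dataset,
        metrics=[
            PairwiseMetric(
                metric="test_pairwise_metric",  # hypothetical metric name
                metric_prompt_template=(
                    "Prompt: {prompt}\nResponse A: {baseline_model_response}\n"
                    "Response B: {response}\nWhich response is better?"
                ),
            )
        ],
    )

    # No `model` or `baseline_model` is given, so both responses are read from the
    # dataset; the new parameters map them to arbitrary column names.
    result = eval_task.evaluate(
        response_column_name="candidate_answer",
        baseline_model_response_column_name="reference_answer",
    )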
