@@ -71,18 +71,24 @@ class EvalTask:
* baseline_model_response_column_name: "baseline_model_response"

Requirement for different use cases:
- * Bring-your-own-response: A `response` column is required. Response
- column name can be customized by providing `response_column_name`
- parameter. If a pairwise metric is used and a baseline model is
- not provided, a `baseline_model_response` column is required.
- Baseline model response column name can be customized by providing
- `baseline_model_response_column_name` parameter. If the `response`
- column or `baseline_model_response` column is present while the
+ * Bring-your-own-response (BYOR): You already have the data that you
+ want to evaluate stored in the dataset. Response column name can be
+ customized by providing `response_column_name` parameter, or in the
+ `metric_column_mapping`. For BYOR pairwise evaluation, the baseline
+ model response column name can be customized by providing
+ `baseline_model_response_column_name` parameter, or
+ in the `metric_column_mapping`. If the `response` column or
+ `baseline_model_response` column is present while the
corresponding model is specified, an error will be raised.
- * Perform model inference without a prompt template: A `prompt` column
- in the evaluation dataset representing the input prompt to the
- model is required and is used directly as input to the model.
- * Perform model inference with a prompt template: Evaluation dataset
+
+ * Perform model inference without a prompt template: You have a dataset
+ containing the input prompts to the model and want to perform model
+ inference before evaluation. A column named `prompt` is required
+ in the evaluation dataset and is used directly as input to the model.
+
+ * Perform model inference with a prompt template: You have a dataset
+ containing the input variables to the prompt template and want to
+ assemble the prompts for model inference. Evaluation dataset
must contain column names corresponding to the variable names in
the prompt template. For example, if prompt template is
"Instruction: {instruction}, context: {context}", the dataset must
@@ -371,18 +377,19 @@ def evaluate(

Args:
model: A GenerativeModel instance or a custom model function to generate
- responses to evaluate. If not provided, the evaluation is computed with
- the `response` column in the `dataset`.
+ responses to evaluate. If not provided, the evaluation can be performed
+ in the bring-your-own-response (BYOR) mode.
prompt_template: The prompt template to use for the evaluation. If not
set, the prompt template that was used to create the EvalTask will be
used.
experiment_run_name: The name of the experiment run to log the evaluation
to if an experiment is set for this EvalTask. If not provided, a random
unique experiment run name is used.
response_column_name: The column name of model response in the dataset. If
- provided, this will override the `response_column_name` of the `EvalTask`.
+ provided, this will override the `metric_column_mapping` of the `EvalTask`.
baseline_model_response_column_name: The column name of baseline model
- response in the dataset for pairwise metrics.
+ response in the dataset for pairwise metrics. If provided, this will
+ override the `metric_column_mapping` of the `EvalTask`.
evaluation_service_qps: The custom QPS limit for the evaluation service.
retry_timeout: How long to keep retrying the evaluation requests for
the whole evaluation dataset, in seconds.
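A hedged usage sketch of the overrides documented above, in BYOR mode. The project, location, column names, and metric name are placeholders, not taken from this change.

```python
import pandas as pd
import vertexai
from vertexai.preview.evaluation import EvalTask  # import path may vary by SDK version

vertexai.init(project="my-project", location="us-central1")  # placeholder values

# Responses live in custom columns, so the defaults are overridden at evaluate() time.
dataset = pd.DataFrame(
    {
        "prompt": ["Explain gravity to a child."],
        "candidate_answer": ["Gravity pulls things toward the ground ..."],
        "reference_answer": ["Objects fall because Earth attracts them ..."],
    }
)

eval_task = EvalTask(
    dataset=dataset,
    metrics=["pairwise_summarization_quality"],  # illustrative pairwise metric name
)

# No `model` argument: bring-your-own-response (BYOR) mode.
result = eval_task.evaluate(
    response_column_name="candidate_answer",
    baseline_model_response_column_name="reference_answer",
)
print(result.summary_metrics)
```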
@@ -400,11 +407,11 @@ def evaluate(
"`vertexai.init(experiment='experiment_name')`for logging this"
" evaluation run."
)
- self._verify_response_column_name(
+ self._verify_and_set_response_column_name(
response_column_name=response_column_name,
metric_column_mapping_key=constants.Dataset.MODEL_RESPONSE_COLUMN,
)
- self._verify_response_column_name(
+ self._verify_and_set_response_column_name(
response_column_name=baseline_model_response_column_name,
metric_column_mapping_key=constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
)
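For context, a simplified standalone illustration of the verify-and-set behavior these calls rely on. The names below are hypothetical; the actual renamed method appears in the next hunk.

```python
from typing import Dict
import pandas as pd

def verify_and_set_response_column(
    dataset: pd.DataFrame,
    metric_column_mapping: Dict[str, str],
    response_column_name: str,
    metric_column_mapping_key: str,
) -> None:
    """Checks that the column exists, then records it in the metric column mapping."""
    if not response_column_name:
        return
    if response_column_name not in dataset.columns:
        raise ValueError(
            f"Column '{response_column_name}' is not present in the dataset."
        )
    # The previous helper only verified; the renamed helper also updates the
    # mapping so metrics read responses from the user-specified column.
    metric_column_mapping[metric_column_mapping_key] = response_column_name
```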
@@ -503,10 +510,10 @@ def _log_eval_experiment_param(
except (ValueError, TypeError) as e:
_LOGGER.warning(f"Experiment metadata logging failed: {str(e)}")

- def _verify_response_column_name(
+ def _verify_and_set_response_column_name(
self, response_column_name: str, metric_column_mapping_key: str
) -> None:
- """Verifies if model response column name or baseline model response column name is valid."""
+ """Verifies and sets the model response column names."""
if response_column_name:
if response_column_name in self._dataset.columns:
self._metric_column_mapping[