class PairwiseMetric:
-    """The Side-by-side(SxS) Pairwise Metric."""
+    """The Side-by-side(SxS) Pairwise Metric.
+
+    A model-based evaluation metric that compares two generative models
+    side-by-side, and allows users to A/B test their generative models to
+    determine which model performs better on the given evaluation task.
+
+    For more details on when to use pairwise metrics, see
+    [Evaluation methods and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#pointwise_versus_pairwise).
+
+    Result Details:
+
+      * In `EvalResult.summary_metrics`, win rates are computed for both the
+        baseline and the candidate model, showing the rate at which each model
+        performs better on the given task. The win rate is computed as the
+        number of times the candidate model performs better than the baseline
+        model divided by the total number of examples. The win rate is a
+        number between 0 and 1.
+
+      * In `EvalResult.metrics_table`, a pairwise metric produces three
+        evaluation results for each row in the dataset:
+        * `pairwise_choice`: an enumeration that indicates whether the
+          candidate or the baseline model performs better.
+        * `explanation`: the AutoRater's rationale behind each verdict, using
+          chain-of-thought reasoning. These explanations help users scrutinize
+          the AutoRater's judgment and build appropriate trust in its
+          decisions.
+        * `confidence`: a score between 0 and 1 that signifies how confident
+          the AutoRater was in its verdict. A score closer to 1 means higher
+          confidence.
+
+    See the [documentation page](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#understand-results)
+    for more details on understanding the metric results.
+
+    Usage:
+
+    ```
+    import pandas as pd
+
+    from vertexai.generative_models import GenerativeModel
+    from vertexai.preview.evaluation import EvalTask, PairwiseMetric
+
+    baseline_model = GenerativeModel("gemini-1.0-pro")
+    candidate_model = GenerativeModel("gemini-1.5-pro")
+
+    pairwise_summarization_quality = PairwiseMetric(
+        metric="summarization_quality",
+        baseline_model=baseline_model,
+    )
+
+    eval_task = EvalTask(
+        dataset=pd.DataFrame({
+            "instruction": [...],
+            "context": [...],
+        }),
+        metrics=[pairwise_summarization_quality],
+    )
+
+    pairwise_results = eval_task.evaluate(
+        prompt_template="instruction: {instruction}. context: {context}",
+        model=candidate_model,
+    )
+    ```
+    """

    def __init__(
        self,
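The win-rate description in the new docstring is, in effect, a ratio over the `pairwise_choice` column of the results table. The sketch below (not part of this change) illustrates that computation; the column name and the choice labels `CANDIDATE`/`BASELINE`/`TIE` are assumptions for illustration only and may not match the SDK's actual output.

```python
import pandas as pd

# Hypothetical per-row pairwise results; real column names and labels may differ.
metrics_table = pd.DataFrame({
    "pairwise_summarization_quality/pairwise_choice": [
        "CANDIDATE", "CANDIDATE", "BASELINE", "TIE", "CANDIDATE",
    ]
})

choices = metrics_table["pairwise_summarization_quality/pairwise_choice"]
total = len(choices)

# Win rate = number of rows where a model was preferred / total examples (0..1).
candidate_win_rate = (choices == "CANDIDATE").sum() / total
baseline_win_rate = (choices == "BASELINE").sum() / total

print(f"candidate win rate: {candidate_win_rate:.2f}")  # 0.60
print(f"baseline win rate: {baseline_win_rate:.2f}")    # 0.20
```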
@@ -37,8 +97,8 @@ def __init__(
        Args:
          metric: The Side-by-side(SxS) pairwise evaluation metric name.
          baseline_model: The baseline model for the Side-by-side(SxS) comparison.
-          use_reference: Whether to use reference to compute the metric. If specified,
-            the reference column is required in the dataset.
+          use_reference: Whether to use a reference to compute the metric. If
+            specified, the `reference` column is required in the dataset.
          version: The metric version to use for evaluation.
        """
        self._metric = metric
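As a small illustration of the `use_reference` argument documented above, a hypothetical construction could look like the following; whether a given metric supports references in a particular SDK version is an assumption here, and `baseline_model` refers to a `GenerativeModel` as in the docstring's usage example.

```python
# Hypothetical sketch: grade the pairwise comparison against a reference.
# With use_reference=True, the evaluation dataset must also contain a
# "reference" column (per the Args section above).
pairwise_with_reference = PairwiseMetric(
    metric="summarization_quality",
    baseline_model=baseline_model,
    use_reference=True,
)
```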
@@ -74,8 +134,8 @@ class CustomMetric:
    Attributes:
      name: The name of the metric.
      metric_function: The evaluation function. Must use the dataset row/instance
-        as the metric_function input. Returns per-instance metric result as a
-        dictionary. The metric score must mapped to the CustomMetric.name as key.
+        as the metric_function input. Returns the per-instance metric result as a
+        dictionary. The metric score must be mapped to the CustomMetric.name as the key.
    """

    def __init__(
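To illustrate the `metric_function` contract described in the `CustomMetric` attributes above, here is a minimal sketch (not part of this diff); the `response` column name is an assumption for the example.

```python
# Sketch: the function takes one dataset row (a dict) and returns a dict whose
# key matches CustomMetric.name, holding that row's metric score.
def word_count(instance: dict) -> dict:
    response = instance.get("response", "")
    return {"word_count": len(response.split())}

word_count_metric = CustomMetric(name="word_count", metric_function=word_count)
```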