class PairwiseMetric:
-    """The Side-by-side(SxS) Pairwise Metric."""
+    """The Side-by-side(SxS) Pairwise Metric.
+
+    A model-based evaluation metric that compares two generative models
+    side-by-side, and allows users to A/B test their generative models to
+    determine which model performs better on the given evaluation task.
+
+    For more details on when to use pairwise metrics, see
+    [Evaluation methods and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#pointwise_versus_pairwise).
+
+    Result Details:
+
+      * In `EvalResult.summary_metrics`, win rates are computed for both the
+        baseline and the candidate model, showing the rate at which each model
+        performs better on the given task. The win rate is computed as the
+        number of times the candidate model performs better than the baseline
+        model divided by the total number of examples. The win rate is a
+        number between 0 and 1.
+
+      * In `EvalResult.metrics_table`, a pairwise metric produces three
+        evaluation results for each row in the dataset:
+        * `pairwise_choice`: an enumeration that indicates whether the
+          candidate or the baseline model performs better.
+        * `explanation`: the AutoRater's rationale behind each verdict, using
+          chain-of-thought reasoning. These explanations help users scrutinize
+          the AutoRater's judgment and build appropriate trust in its
+          decisions.
+        * `confidence`: a score between 0 and 1 that signifies how confident
+          the AutoRater was in its verdict. A score closer to 1 means higher
+          confidence.
+
+    See the [documentation page](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#understand-results)
+    for more details on understanding the metric results.
+
+    Usage:
+
+    ```
+    import pandas as pd
+
+    from vertexai.generative_models import GenerativeModel
+    from vertexai.preview.evaluation import EvalTask, PairwiseMetric
+
+    baseline_model = GenerativeModel("gemini-1.0-pro")
+    candidate_model = GenerativeModel("gemini-1.5-pro")
+
+    pairwise_summarization_quality = PairwiseMetric(
+        metric="summarization_quality",
+        baseline_model=baseline_model,
+    )
+
+    eval_task = EvalTask(
+        dataset=pd.DataFrame({
+            "instruction": [...],
+            "context": [...],
+        }),
+        metrics=[pairwise_summarization_quality],
+    )
+
+    pairwise_results = eval_task.evaluate(
+        prompt_template="instruction: {instruction}. context: {context}",
+        model=candidate_model,
+    )
+    ```
+    """

    def __init__(
        self,
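The win-rate description in the new docstring is, in effect, a ratio over the `pairwise_choice` column of the results table. The sketch below (not part of this change) illustrates that computation; the column name and the choice labels `CANDIDATE`/`BASELINE`/`TIE` are assumptions for illustration only and may not match the SDK's actual output.

```python
import pandas as pd

# Hypothetical per-row pairwise results; real column names and labels may differ.
metrics_table = pd.DataFrame({
    "pairwise_summarization_quality/pairwise_choice": [
        "CANDIDATE", "CANDIDATE", "BASELINE", "TIE", "CANDIDATE",
    ]
})

choices = metrics_table["pairwise_summarization_quality/pairwise_choice"]
total = len(choices)

# Win rate = number of rows where a model was preferred / total examples (0..1).
candidate_win_rate = (choices == "CANDIDATE").sum() / total
baseline_win_rate = (choices == "BASELINE").sum() / total

print(f"candidate win rate: {candidate_win_rate:.2f}")  # 0.60
print(f"baseline win rate: {baseline_win_rate:.2f}")    # 0.20
```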
@@ -37,8 +97,8 @@ def __init__(
        Args:
          metric: The Side-by-side(SxS) pairwise evaluation metric name.
          baseline_model: The baseline model for the Side-by-side(SxS) comparison.
-          use_reference: Whether to use reference to compute the metric. If specified,
-            the reference column is required in the dataset.
+          use_reference: Whether to use a reference to compute the metric. If
+            specified, the `reference` column is required in the dataset.
          version: The metric version to use for evaluation.
        """
        self._metric = metric
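As a small illustration of the `use_reference` argument documented above, a hypothetical construction could look like the following; whether a given metric supports references in a particular SDK version is an assumption here, and `baseline_model` refers to a `GenerativeModel` as in the docstring's usage example.

```python
# Hypothetical sketch: grade the pairwise comparison against a reference.
# With use_reference=True, the evaluation dataset must also contain a
# "reference" column (per the Args section above).
pairwise_with_reference = PairwiseMetric(
    metric="summarization_quality",
    baseline_model=baseline_model,
    use_reference=True,
)
```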
@@ -74,8 +134,8 @@ class CustomMetric:
    Attributes:
      name: The name of the metric.
      metric_function: The evaluation function. Must use the dataset row/instance
-        as the metric_function input. Returns per-instance metric result as a
-        dictionary. The metric score must mapped to the CustomMetric.name as key.
+        as the metric_function input. Returns the per-instance metric result as a
+        dictionary. The metric score must be mapped to the CustomMetric.name as the key.
    """

    def __init__(
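To illustrate the `metric_function` contract described in the `CustomMetric` attributes above, here is a minimal sketch (not part of this diff); the `response` column name is an assumption for the example.

```python
# Sketch: the function takes one dataset row (a dict) and returns a dict whose
# key matches CustomMetric.name, holding that row's metric score.
def word_count(instance: dict) -> dict:
    response = instance.get("response", "")
    return {"word_count": len(response.split())}

word_count_metric = CustomMetric(name="word_count", metric_function=word_count)
```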