
Commit 79ca86a

vertex-sdk-bot authored and copybara-github committed
feat: GenAI Evaluation: Release GenAI Evaluation SDK parsing rubric generation response with additional fields to vertexai.preview module.
PiperOrigin-RevId: 742903092
1 parent e0bf9ba commit 79ca86a
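
For context, a minimal sketch (not part of this commit) of the user-facing effect: a `parsing_fn` that returns a dict containing `"questions"` plus additional fields now surfaces those fields as extra columns in the dataset returned by `generate_rubrics`. Class and argument names follow the tests below; the import paths and the Gemini model ID are assumptions.

```python
# Hypothetical usage sketch; import paths and the model ID are assumptions
# (the unit tests below use module aliases such as evaluation_preview and
# RubricGenerationConfig rather than these imports).
import pandas as pd

from vertexai import generative_models
from vertexai.preview import evaluation as evaluation_preview
from vertexai.preview.evaluation.metrics import RubricGenerationConfig  # assumed path


def parse_with_extra_fields(response: str) -> dict:
    # In practice this would json.loads the model output; values are
    # hard-coded here to keep the sketch self-contained.
    return {"questions": ["test_rubric1", "test_rubric2"], "desc": "test_desc"}


rubric_metric = evaluation_preview.metrics.RubricBasedMetric(
    generation_config=RubricGenerationConfig(
        prompt_template="Generate rubrics for the given prompt: {prompt}",
        model=generative_models.GenerativeModel("gemini-1.5-pro"),  # assumed model ID
        parsing_fn=parse_with_extra_fields,
    ),
    critique_metric=evaluation_preview.metrics.MetricPromptTemplateExamples.Pointwise.COHERENCE,
)

dataset = pd.DataFrame({"prompt": ["test_prompt"], "response": ["test"]})
dataset_with_rubrics = rubric_metric.generate_rubrics(dataset)
# Per the new tests, the returned columns are: prompt, response, rubrics, desc
```
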

File tree
6 files changed (+159 -66 lines)

tests/unit/vertexai/test_evaluation.py

+79 -15
@@ -429,9 +429,9 @@
         "prompt": ["test_prompt", "text_prompt", "test_prompt_3"],
         "response": ["test", "text", "test_response_3"],
         "rubrics": [
-            ["test_rubric"],
-            ["test_rubric"],
-            ["test_rubric"],
+            ["test_rubric1", "test_rubric2"],
+            ["test_rubric1", "test_rubric2"],
+            ["test_rubric1", "test_rubric2"],
         ],
     }
 )
@@ -681,7 +681,26 @@
         "candidates": [
             {
                 "content": {
-                    "parts": [{"text": """```json{"questions": ["test_rubric"]}```"""}]
+                    "parts": [
+                        {
+                            "text": """```json{"questions": ["test_rubric1", "test_rubric2"]}```"""
+                        }
+                    ]
+                },
+            }
+        ]
+    }
+)
+_MOCK_MODEL_RUBRIC_GENERATION_RESPONSE_WITH_ADDITIONAL = generative_models.GenerationResponse.from_dict(
+    {
+        "candidates": [
+            {
+                "content": {
+                    "parts": [
+                        {
+                            "text": """```json{"questions": ["test_rubric1", "test_rubric2"], "desc": "test_desc"}```"""
+                        }
+                    ]
                 },
             }
         ]
@@ -2611,11 +2630,11 @@ def test_validate_metrics_multiple_rubric_based_metrics(self):
 
     def test_default_rubrics_parser_succeeds(self):
         parsed_rubrics = utils_preview.parse_rubrics(_UNPARSED_RUBRIC)
-        assert parsed_rubrics == ["test_rubric"]
+        assert parsed_rubrics == {"questions": ["test_rubric"]}
 
     def test_default_rubrics_parser_with_invalid_json(self):
         parsed_rubrics = utils_preview.parse_rubrics(_INVALID_UNPARSED_RUBRIC)
-        assert parsed_rubrics == ""
+        assert parsed_rubrics == {"questions": ""}
 
     def test_generate_responses_from_gemini_model(self):
         mock_model = mock.create_autospec(
@@ -2793,16 +2812,14 @@ def test_rubric_generation_succeeds(self):
             _MOCK_MODEL_RUBRIC_GENERATION_RESPONSE
         )
         mock_model._model_name = "publishers/google/model/gemini-1.0-pro"
-        rubric_based_metric = evaluation_preview.metrics.RubricBasedMetric(
+        rbe = evaluation_preview.metrics.RubricBasedMetric(
             generation_config=RubricGenerationConfig(
                 prompt_template="Generate rubrics for the given prompt: {prompt}",
                 model=mock_model,
             ),
             critique_metric=metric_prompt_template_examples_preview.MetricPromptTemplateExamples.Pointwise.COHERENCE,
         )
-        dataset_with_rubrics = rubric_based_metric.generate_rubrics(
-            _TEST_EVAL_DATASET_PROMPT_RESPONSE
-        )
+        dataset_with_rubrics = rbe.generate_rubrics(_TEST_EVAL_DATASET_PROMPT_RESPONSE)
         assert dataset_with_rubrics.equals(
             _EXPECTED_EVAL_DATASET_PROMPT_RESPONSE_WITH_RUBRICS
         )
@@ -2827,22 +2844,69 @@ def test_rubric_generation_skipped(self):
         assert mock_model.generate_content.call_count == 0
 
     def test_rubric_generation_default_parsing_fn(self):
-        """Test rubric generation using RubricBasedMetric."""
+        """Test rubric generation using default parsing function."""
         mock_model = mock.create_autospec(
             generative_models.GenerativeModel, instance=True
         )
         mock_model.generate_content.return_value = (
             _MOCK_MODEL_RUBRIC_GENERATION_RESPONSE
         )
         mock_model._model_name = "publishers/google/model/gemini-1.0-pro"
-        mock_parsing_fn = mock.MagicMock()
         rbm = evaluation_preview.metrics.RubricBasedMetric(
             generation_config=RubricGenerationConfig(
                 prompt_template="Generate rubrics for the given prompt: {prompt}",
                 model=mock_model,
-                parsing_fn=mock_parsing_fn,
             ),
             critique_metric=metric_prompt_template_examples_preview.MetricPromptTemplateExamples.Pointwise.COHERENCE,
         )
-        _ = rbm.generate_rubrics(_TEST_EVAL_DATASET_PROMPT_RESPONSE)
-        assert mock_parsing_fn.call_count == 3
+        dataset_with_rubrics = rbm.generate_rubrics(_TEST_EVAL_DATASET_PROMPT_RESPONSE)
+        assert dataset_with_rubrics.equals(
+            _EXPECTED_EVAL_DATASET_PROMPT_RESPONSE_WITH_RUBRICS
+        )
+
+    def test_rubric_generation_parsing_str(self):
+        """Test rubric generation using parsing function that returns str."""
+        mock_model = mock.create_autospec(
+            generative_models.GenerativeModel, instance=True
+        )
+        mock_model.generate_content.return_value = (
+            _MOCK_MODEL_RUBRIC_GENERATION_RESPONSE
+        )
+        mock_model._model_name = "publishers/google/model/gemini-1.0-pro"
+
+        def parsing_fn(response: str):
+            return ["test_rubric1", "test_rubric2"]
+
+        rbm = evaluation_preview.metrics.RubricBasedMetric(
+            generation_config=RubricGenerationConfig(
+                prompt_template="Generate rubrics for the given prompt: {prompt}",
+                model=mock_model,
+                parsing_fn=parsing_fn,
+            ),
+            critique_metric=metric_prompt_template_examples.MetricPromptTemplateExamples.Pointwise.COHERENCE,
+        )
+        dataset_with_rubrics = rbm.generate_rubrics(_TEST_EVAL_DATASET_PROMPT_RESPONSE)
+        assert dataset_with_rubrics.equals(
+            _EXPECTED_EVAL_DATASET_PROMPT_RESPONSE_WITH_RUBRICS
+        )
+
+    def test_rubric_generation_parsing_additional_fields(self):
+        """Test rubric generation using default parsing function with additional fields."""
+        mock_model = mock.create_autospec(
+            generative_models.GenerativeModel, instance=True
+        )
+        mock_model.generate_content.return_value = (
+            _MOCK_MODEL_RUBRIC_GENERATION_RESPONSE_WITH_ADDITIONAL
+        )
+        mock_model._model_name = "publishers/google/model/gemini-1.0-pro"
+        rbm = evaluation_preview.metrics.RubricBasedMetric(
+            generation_config=RubricGenerationConfig(
+                prompt_template="Generate rubrics for the given prompt: {prompt}",
+                model=mock_model,
+            ),
+            critique_metric=metric_prompt_template_examples.MetricPromptTemplateExamples.Pointwise.COHERENCE,
+        )
+        dataset_with_rubrics = rbm.generate_rubrics(_TEST_EVAL_DATASET_PROMPT_RESPONSE)
+        expected = _EXPECTED_EVAL_DATASET_PROMPT_RESPONSE_WITH_RUBRICS
+        expected["desc"] = ["test_desc", "test_desc", "test_desc"]
+        assert dataset_with_rubrics.equals(expected)

tests/unit/vertexai/test_rubric_based_eval.py

+2 -0
@@ -80,6 +80,7 @@
                 ' "file_uri": "gs://test-bucket/image4.png"}}]}]}'
             ),
         ],
+        "description": ["description", "description"],
         "response": ["test", "text"],
         "baseline_model_response": ["test", "text"],
     }
@@ -335,6 +336,7 @@ def test_pairwise_multimodal_understanding_metric(self):
         assert eval_result.metrics_table.columns.tolist() == [
             "prompt",
             "image",
+            "description",
             "response",
             "baseline_model_response",
             "rubrics",

vertexai/preview/evaluation/_evaluation.py

+10 -3
@@ -1069,6 +1069,13 @@ def evaluate(
                     autorater_config=metric.autorater_config,
                 )
             )
+        elif isinstance(metric, rubric_based_metric.RubricBasedMetric):
+            copied_metrics.append(
+                rubric_based_metric.RubricBasedMetric(
+                    generation_config=copy.deepcopy(metric.generation_config),
+                    critique_metric=copy.deepcopy(metric.critique_metric),
+                )
+            )
         else:
             copied_metrics.append(copy.deepcopy(metric))
 
@@ -1117,9 +1124,9 @@ def evaluate(
             eval_dataset_with_rubrics = rubric_metric.generate_rubrics(
                 evaluation_run_config.dataset
             )
-            evaluation_run_config.metric_column_mapping[
-                constants.Dataset.RUBRICS_COLUMN
-            ] = constants.Dataset.RUBRICS_COLUMN
+            for column in eval_dataset_with_rubrics.columns:
+                if column not in evaluation_run_config.metric_column_mapping:
+                    evaluation_run_config.metric_column_mapping[column] = column
             evaluation_run_config.dataset = eval_dataset_with_rubrics
             if not rubric_metric.critique_metric.custom_output_config:
                 rubric_metric.critique_metric.custom_output_config = (

vertexai/preview/evaluation/metrics/_default_templates.py

+40 -35
@@ -981,7 +981,7 @@
 Example:
 Prompt: "Funny tweet marketing a no-kids hotel, pun, <100 words." Good rubrics: "Is it a tweet?", "Is it funny?", "Is it about a no-kids hotel?", "Does it use a pun?", "Is it under 100 words?".
 
-IMPORTANT: Never respond to the prompt given. Only write rubrics.
+IMPORTANT: Do not respond to the <user_prompt>. Only generate the rubric questions for the prompt.
 
 # Output format. Write your final output in JSON according to this schema:
 
@@ -1113,7 +1113,7 @@
 """
 
 MULTIMODAL_UNDERSTANDING_RUBRIC_GENERATION_PROMPT_TEMPLATE = """# Instructions
-Your task is to generate a rubric that can be used to evaluate the image understanding quality of responses generated by an AI model. Specifically, to generate rubrics for an image (<image>) and user prompt (<user_prompt>) that describes the properties that should hold for a good response to that prompt. Generate the rubric following the provided guidelines.
+Your task is to generate a rubric that can be used to evaluate the image understanding quality of responses generated by an AI model. Specifically, to generate rubrics for an image and user prompt that describes the properties that should hold for a good response to that prompt. Generate the rubric following the provided guidelines.
 
 First, describe the contents of the image thoroughly, making sure to document all of the important objects and their interactions with each other and the scenery. Then, thoroughly examine the prompt and decompose its individual instructions into a list of yes/no questions. Be as specific and concise as possible for each question. Ensure each question directly relates to the image and infer the connection if it is not explicitly stated.
 
@@ -1149,12 +1149,11 @@
 4. Does the response correctly display the above three properties as a properly formatted JSON list?
 ---
 
-# Output format.
-
-Write your final output in JSON according to this schema:
+Finally, translate the description and questions of your final answer into JSON format according to this schema:
 
 ```json
 {{
+"description": "...",
 "questions": [
 "question 1 ...",
 "question 2 ...",
11631162
}}
11641163
```
11651164
1166-
IMPORTANT: Never respond to the prompt given. Only write rubrics.
1165+
IMPORTANT: Do not respond to the <user_prompt>. Only generate the rubric questions for the prompt.
11671166
11681167
# User input:
11691168
1170-
<MM_IMAGE>
1171-
{image}
1172-
</MM_IMAGE>
1169+
Image:
1170+
<MM_IMAGE>{image}</MM_IMAGE>
11731171
11741172
<user_prompt>
11751173
{prompt}
@@ -1224,10 +1222,13 @@
12241222
Evaluation:
12251223
<question>
12261224
"""
1227-
PAIRWISE_MULTIMODAL_UNDERSTANDING_RUBRIC_CRITIQUE_TEMPLATE = """# Instructions
1228-
Your task is to evaluate the image understanding quality of responses generated by an AI model. You will be presented with an image, a user prompt, each model's response to that user prompt, and a series of questions against which the text quality of Response A and Response B will be judged.
1225+
PAIRWISE_MULTIMODAL_UNDERSTANDING_RUBRIC_CRITIQUE_TEMPLATE = """# Instructions:
12291226
1230-
For each response, provide an answer [[YES]] or [[NO]] to each question. Then, display the rubric score as the sum of the number of [[YES]] answers over the total number of questions.
1227+
Your task is to evaluate the image understanding quality of responses generated by two AI models. At the bottom of this system instruction you will be presented with an image, a text description of that image, a user prompt, and the responses of Model A and Model B to that user prompt. You will also be provided a rubric as a numbered list against which Response A and Response B will be judged. Each rubricv question is a list of instructions that each response must follow in order to satisfy the user prompt.
1228+
1229+
# Rubric Scoring:
1230+
1231+
For each response, rephrase every rubric point as a question and answer [[YES]] or [[NO]] to each point. Then, display the rubric grade as the sum of the correct rubric points over the total number of points. Finally, score the response on a scale of 1 to 5 stars based on how enjoyable you think it is for a human to read and understand and state your reasoning.
12311232
12321233
For example, if the rubric questions are:
12331234
[[Rubric]]
@@ -1257,43 +1258,47 @@
 </question>
 
 [[Rubric Score: 2/4]]
+[[Human Enjoyment Rating: 4 stars]]
+[[Human Rating Reason: This response is accurate and has no grammatical errors but feels too verbose and formal.]]
 
 Repeat the above for Response B.
 
-Explain whether you think Response A is better or Response B is better in a paragraph starting with "SxS Rationale 0:". Ground your explanation on the competing rubric scores. When you are finished, review your rationale in the context of the prompt, the responses, and the rubric scores and correct any mistakes you may have made, including your judgment on whether Response A was better or Response B was better. Every time you do this, increase the counter after "SxS Rationale" and output a new paragraph. Do not exceed five (5) iterations.
+# Recursive Self-Refinement:
 
-Finally, state your side-by-side (SxS) Rating on whether Response A was better or Response B was better based on your scores and rationale. Your rating should be one of {{A>B, B>A, A=B}}. Do not output anything else.
+Explain whether you think Response A is better or Response B is better in a paragraph starting with "SxS Rationale 0:". Ground your explanation on the competing rubric grades as well as your justification for the human enjoyment ratings. When you are finished, review your rationale in the context of the prompt, the responses, and the rubric grades and correct any mistakes you may have made, including your judgment on whether Response A was better or Response B was better. Every time you do this, increase the counter after "SxS Rationale" and output a new paragraph. Do not exceed five (5) iterations.
+
+# Final SxS Verdict:
+
+Finally, state your side-by-side (SxS) Rating on whether Response A was better or Response B was better based on your grades and rationale. Your rating should be one of {{A>B, B>A, A=B}}. Do not output anything else.
 
 Example:
-[[SxS Rationale 0: Response B scored higher on the rubric. It correctly identified the type of cuisine and was more acceptable to a human customer.]]
+[[SxS Rationale 0: Response B scored higher on the rubric. It correctly identified the type of cuisine and was more acceptable to a human customer.]]
 
-[[SxS Rationale 1: Response B scored higher on the rubric. It correctly identified the type of cuisine as Italian. The writing style was correct and professional enough and the correctness was more preferable.]]
+[[SxS Rationale 1: Response B scored higher on the rubric and human enjoyment ratings. It correctly identified the type of cuisine as Italian. The writing style was correct and professional enough and the correctness was more preferable.]]
 
-[[SxS Rationale 2: Response B scored higher on the rubric. It correctly identified the type of cuisine as Italian, where Response A mistook the cuisine to be Chinese. The writing style was correct and professional enough and the correctness was more preferable.]]
+[[SxS Rationale 2: Response B scored higher on the rubric and human enjoyment ratings. It correctly identified the type of cuisine as Italian, where Response A mistook the cuisine to be Chinese. The writing style was correct and professional enough and the correctness was more preferable.]]
 
-[[SxS Rating: B > A]]
+[[SxS Rating: B > A]]
 
-# User Inputs, AI-generated Responses, and Rubrics
-## User Inputs
-### Image
-<MM_IMAGE>
-{image}
-</MM_IMAGE>
+# User Inputs, Model Responses, and Rubrics:
 
-### Prompt
-{prompt}
+## Image
+<MM_IMAGE>{image}</MM_IMAGE>
 
-## AI-generated Response
-### Response A
-{baseline_model_response}
+## Description
+**{description}**
 
-### Response B
-{response}
+## User Prompt
+**{prompt}**
 
-## Rubrics
-{rubrics}
+## Response A
+**{baseline_model_response}**
 
-REMEMBER: Your answer will help improve the AI model. It is important to answer the question correctly. Even answering "no" will improve the model!
+## Response B
+**{response}**
+
+## Rubric Questions
+**{rubrics}**
 """
 
 TEXT_QUALITY_RUBRIC_GENERATION_PROMPT_TEMPLATE = """# Instructions
@@ -1355,7 +1360,7 @@
 }}
 ```
 
-IMPORTANT: Never respond to the prompt given. Only write rubrics.
+IMPORTANT: Do not respond to the <user_prompt>. Only generate the rubric questions for the prompt.
 
 # User prompt
 
vertexai/preview/evaluation/metrics/rubric_based_metric.py

+20 -3
@@ -13,6 +13,7 @@
 # limitations under the License.
 #
 
+import collections
 from typing import Union, TYPE_CHECKING
 
 from google.cloud.aiplatform import base
@@ -72,7 +73,7 @@ def generate_rubrics(
             )
             return eval_dataset
 
-        rubrics = _pre_eval_utils._generate_responses_from_gemini_model(
+        responses = _pre_eval_utils._generate_responses_from_gemini_model(
             model,
             eval_dataset,
             self.generation_config.prompt_template,
@@ -81,7 +82,23 @@ def generate_rubrics(
             parsing_fn = self.generation_config.parsing_fn
         else:
             parsing_fn = utils.parse_rubrics
-        parsed_rubrics = [parsing_fn(rubric) for rubric in rubrics]
         dataset_with_rubrics = eval_dataset.copy()
-        dataset_with_rubrics[constants.Dataset.RUBRICS_COLUMN] = parsed_rubrics
+        aggregated = collections.defaultdict(list)
+        for idx, response in enumerate(responses):
+            result = parsing_fn(response)
+            if isinstance(result, dict):
+                questions = result.pop("questions", None)
+                if questions is not None:
+                    aggregated[constants.Dataset.RUBRICS_COLUMN].append(
+                        (idx, questions)
+                    )
+                for key, value in result.items():
+                    aggregated[key].append((idx, value))
+            else:
+                aggregated[constants.Dataset.RUBRICS_COLUMN].append((idx, result))
+        for key, values in aggregated.items():
+            dataset_with_rubrics[key] = None
+            dataset_with_rubrics[key] = dataset_with_rubrics[key].astype(object)
+            for idx, value in values:
+                dataset_with_rubrics.at[idx, key] = value
         return dataset_with_rubrics
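
To make the new aggregation concrete, here is a small self-contained sketch (not from this commit) of how parsed results become dataset columns, mirroring the loop added above. The `"rubrics"` name stands in for `constants.Dataset.RUBRICS_COLUMN`, and the `"desc"` field follows the unit tests in this commit.

```python
# Standalone illustration of the aggregation added to generate_rubrics above.
# The parsed values mimic what the default parser or a custom parsing_fn
# might return: a dict with "questions" plus extra fields, or a plain list.
import collections

import pandas as pd

RUBRICS_COLUMN = "rubrics"  # stands in for constants.Dataset.RUBRICS_COLUMN

eval_dataset = pd.DataFrame({"prompt": ["p1", "p2"]})
parsed_results = [
    {"questions": ["test_rubric1", "test_rubric2"], "desc": "test_desc"},
    ["test_rubric1", "test_rubric2"],  # a plain list still lands in "rubrics"
]

dataset_with_rubrics = eval_dataset.copy()
aggregated = collections.defaultdict(list)
for idx, result in enumerate(parsed_results):
    if isinstance(result, dict):
        questions = result.pop("questions", None)
        if questions is not None:
            aggregated[RUBRICS_COLUMN].append((idx, questions))
        for key, value in result.items():
            aggregated[key].append((idx, value))  # extra fields become columns
    else:
        aggregated[RUBRICS_COLUMN].append((idx, result))

for key, values in aggregated.items():
    dataset_with_rubrics[key] = None
    dataset_with_rubrics[key] = dataset_with_rubrics[key].astype(object)
    for idx, value in values:
        dataset_with_rubrics.at[idx, key] = value

print(dataset_with_rubrics.columns.tolist())  # ['prompt', 'rubrics', 'desc']
```

Rows whose parsed result lacks a given extra field simply keep `None` in that column, which is why the columns are created with object dtype before the per-row assignment.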
