
Commit 79ca86a

vertex-sdk-bot authored and copybara-github committed
feat: GenAI Evaluation: Release GenAI Evaluation SDK parsing rubric generation response with additional fields to vertexai.preview module.
PiperOrigin-RevId: 742903092
1 parent e0bf9ba commit 79ca86a
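
For context, a minimal sketch (not part of this commit) of the user-facing effect: a `parsing_fn` that returns a dict containing `"questions"` plus additional fields now surfaces those fields as extra columns in the dataset returned by `generate_rubrics`. Class and argument names follow the tests below; the import paths and the Gemini model ID are assumptions.

```python
# Hypothetical usage sketch; import paths and the model ID are assumptions
# (the unit tests below use module aliases such as evaluation_preview and
# RubricGenerationConfig rather than these imports).
import pandas as pd

from vertexai import generative_models
from vertexai.preview import evaluation as evaluation_preview
from vertexai.preview.evaluation.metrics import RubricGenerationConfig  # assumed path


def parse_with_extra_fields(response: str) -> dict:
    # In practice this would json.loads the model output; values are
    # hard-coded here to keep the sketch self-contained.
    return {"questions": ["test_rubric1", "test_rubric2"], "desc": "test_desc"}


rubric_metric = evaluation_preview.metrics.RubricBasedMetric(
    generation_config=RubricGenerationConfig(
        prompt_template="Generate rubrics for the given prompt: {prompt}",
        model=generative_models.GenerativeModel("gemini-1.5-pro"),  # assumed model ID
        parsing_fn=parse_with_extra_fields,
    ),
    critique_metric=evaluation_preview.metrics.MetricPromptTemplateExamples.Pointwise.COHERENCE,
)

dataset = pd.DataFrame({"prompt": ["test_prompt"], "response": ["test"]})
dataset_with_rubrics = rubric_metric.generate_rubrics(dataset)
# Per the new tests, the returned columns are: prompt, response, rubrics, desc
```
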

File tree
6 files changed (+159 -66 lines)

tests/unit/vertexai/test_evaluation.py

+79 -15
@@ -429,9 +429,9 @@
         "prompt": ["test_prompt", "text_prompt", "test_prompt_3"],
         "response": ["test", "text", "test_response_3"],
         "rubrics": [
-            ["test_rubric"],
-            ["test_rubric"],
-            ["test_rubric"],
+            ["test_rubric1", "test_rubric2"],
+            ["test_rubric1", "test_rubric2"],
+            ["test_rubric1", "test_rubric2"],
         ],
     }
 )
@@ -681,7 +681,26 @@
         "candidates": [
             {
                 "content": {
-                    "parts": [{"text": """```json{"questions": ["test_rubric"]}```"""}]
+                    "parts": [
+                        {
+                            "text": """```json{"questions": ["test_rubric1", "test_rubric2"]}```"""
+                        }
+                    ]
+                },
+            }
+        ]
+    }
+)
+_MOCK_MODEL_RUBRIC_GENERATION_RESPONSE_WITH_ADDITIONAL = generative_models.GenerationResponse.from_dict(
+    {
+        "candidates": [
+            {
+                "content": {
+                    "parts": [
+                        {
+                            "text": """```json{"questions": ["test_rubric1", "test_rubric2"], "desc": "test_desc"}```"""
+                        }
+                    ]
                 },
             }
         ]
@@ -2611,11 +2630,11 @@ def test_validate_metrics_multiple_rubric_based_metrics(self):
 
     def test_default_rubrics_parser_succeeds(self):
         parsed_rubrics = utils_preview.parse_rubrics(_UNPARSED_RUBRIC)
-        assert parsed_rubrics == ["test_rubric"]
+        assert parsed_rubrics == {"questions": ["test_rubric"]}
 
     def test_default_rubrics_parser_with_invalid_json(self):
         parsed_rubrics = utils_preview.parse_rubrics(_INVALID_UNPARSED_RUBRIC)
-        assert parsed_rubrics == ""
+        assert parsed_rubrics == {"questions": ""}
 
     def test_generate_responses_from_gemini_model(self):
         mock_model = mock.create_autospec(
@@ -2793,16 +2812,14 @@ def test_rubric_generation_succeeds(self):
             _MOCK_MODEL_RUBRIC_GENERATION_RESPONSE
         )
         mock_model._model_name = "publishers/google/model/gemini-1.0-pro"
-        rubric_based_metric = evaluation_preview.metrics.RubricBasedMetric(
+        rbe = evaluation_preview.metrics.RubricBasedMetric(
             generation_config=RubricGenerationConfig(
                 prompt_template="Generate rubrics for the given prompt: {prompt}",
                 model=mock_model,
             ),
             critique_metric=metric_prompt_template_examples_preview.MetricPromptTemplateExamples.Pointwise.COHERENCE,
         )
-        dataset_with_rubrics = rubric_based_metric.generate_rubrics(
-            _TEST_EVAL_DATASET_PROMPT_RESPONSE
-        )
+        dataset_with_rubrics = rbe.generate_rubrics(_TEST_EVAL_DATASET_PROMPT_RESPONSE)
         assert dataset_with_rubrics.equals(
             _EXPECTED_EVAL_DATASET_PROMPT_RESPONSE_WITH_RUBRICS
         )
@@ -2827,22 +2844,69 @@ def test_rubric_generation_skipped(self):
         assert mock_model.generate_content.call_count == 0
 
     def test_rubric_generation_default_parsing_fn(self):
-        """Test rubric generation using RubricBasedMetric."""
+        """Test rubric generation using default parsing function."""
         mock_model = mock.create_autospec(
             generative_models.GenerativeModel, instance=True
         )
         mock_model.generate_content.return_value = (
             _MOCK_MODEL_RUBRIC_GENERATION_RESPONSE
         )
         mock_model._model_name = "publishers/google/model/gemini-1.0-pro"
-        mock_parsing_fn = mock.MagicMock()
         rbm = evaluation_preview.metrics.RubricBasedMetric(
             generation_config=RubricGenerationConfig(
                 prompt_template="Generate rubrics for the given prompt: {prompt}",
                 model=mock_model,
-                parsing_fn=mock_parsing_fn,
             ),
             critique_metric=metric_prompt_template_examples_preview.MetricPromptTemplateExamples.Pointwise.COHERENCE,
         )
-        _ = rbm.generate_rubrics(_TEST_EVAL_DATASET_PROMPT_RESPONSE)
-        assert mock_parsing_fn.call_count == 3
+        dataset_with_rubrics = rbm.generate_rubrics(_TEST_EVAL_DATASET_PROMPT_RESPONSE)
+        assert dataset_with_rubrics.equals(
+            _EXPECTED_EVAL_DATASET_PROMPT_RESPONSE_WITH_RUBRICS
+        )
+
+    def test_rubric_generation_parsing_str(self):
+        """Test rubric generation using parsing function that returns str."""
+        mock_model = mock.create_autospec(
+            generative_models.GenerativeModel, instance=True
+        )
+        mock_model.generate_content.return_value = (
+            _MOCK_MODEL_RUBRIC_GENERATION_RESPONSE
+        )
+        mock_model._model_name = "publishers/google/model/gemini-1.0-pro"
+
+        def parsing_fn(response: str):
+            return ["test_rubric1", "test_rubric2"]
+
+        rbm = evaluation_preview.metrics.RubricBasedMetric(
+            generation_config=RubricGenerationConfig(
+                prompt_template="Generate rubrics for the given prompt: {prompt}",
+                model=mock_model,
+                parsing_fn=parsing_fn,
+            ),
+            critique_metric=metric_prompt_template_examples.MetricPromptTemplateExamples.Pointwise.COHERENCE,
+        )
+        dataset_with_rubrics = rbm.generate_rubrics(_TEST_EVAL_DATASET_PROMPT_RESPONSE)
+        assert dataset_with_rubrics.equals(
+            _EXPECTED_EVAL_DATASET_PROMPT_RESPONSE_WITH_RUBRICS
+        )
+
+    def test_rubric_generation_parsing_additional_fields(self):
+        """Test rubric generation using default parsing function with additional fields."""
+        mock_model = mock.create_autospec(
+            generative_models.GenerativeModel, instance=True
+        )
+        mock_model.generate_content.return_value = (
+            _MOCK_MODEL_RUBRIC_GENERATION_RESPONSE_WITH_ADDITIONAL
+        )
+        mock_model._model_name = "publishers/google/model/gemini-1.0-pro"
+        rbm = evaluation_preview.metrics.RubricBasedMetric(
+            generation_config=RubricGenerationConfig(
+                prompt_template="Generate rubrics for the given prompt: {prompt}",
+                model=mock_model,
+            ),
+            critique_metric=metric_prompt_template_examples.MetricPromptTemplateExamples.Pointwise.COHERENCE,
+        )
+        dataset_with_rubrics = rbm.generate_rubrics(_TEST_EVAL_DATASET_PROMPT_RESPONSE)
+        expected = _EXPECTED_EVAL_DATASET_PROMPT_RESPONSE_WITH_RUBRICS
+        expected["desc"] = ["test_desc", "test_desc", "test_desc"]
+        assert dataset_with_rubrics.equals(expected)

tests/unit/vertexai/test_rubric_based_eval.py

+2 -0
@@ -80,6 +80,7 @@
                 ' "file_uri": "gs://test-bucket/image4.png"}}]}]}'
             ),
         ],
+        "description": ["description", "description"],
         "response": ["test", "text"],
         "baseline_model_response": ["test", "text"],
     }
@@ -335,6 +336,7 @@ def test_pairwise_multimodal_understanding_metric(self):
         assert eval_result.metrics_table.columns.tolist() == [
             "prompt",
             "image",
+            "description",
             "response",
             "baseline_model_response",
             "rubrics",

vertexai/preview/evaluation/_evaluation.py

+10 -3
@@ -1069,6 +1069,13 @@ def evaluate(
                     autorater_config=metric.autorater_config,
                 )
             )
+        elif isinstance(metric, rubric_based_metric.RubricBasedMetric):
+            copied_metrics.append(
+                rubric_based_metric.RubricBasedMetric(
+                    generation_config=copy.deepcopy(metric.generation_config),
+                    critique_metric=copy.deepcopy(metric.critique_metric),
+                )
+            )
         else:
             copied_metrics.append(copy.deepcopy(metric))
 
@@ -1117,9 +1124,9 @@ def evaluate(
             eval_dataset_with_rubrics = rubric_metric.generate_rubrics(
                 evaluation_run_config.dataset
             )
-            evaluation_run_config.metric_column_mapping[
-                constants.Dataset.RUBRICS_COLUMN
-            ] = constants.Dataset.RUBRICS_COLUMN
+            for column in eval_dataset_with_rubrics.columns:
+                if column not in evaluation_run_config.metric_column_mapping:
+                    evaluation_run_config.metric_column_mapping[column] = column
             evaluation_run_config.dataset = eval_dataset_with_rubrics
             if not rubric_metric.critique_metric.custom_output_config:
                 rubric_metric.critique_metric.custom_output_config = (

vertexai/preview/evaluation/metrics/_default_templates.py

+40 -35
@@ -981,7 +981,7 @@
 Example:
 Prompt: "Funny tweet marketing a no-kids hotel, pun, <100 words." Good rubrics: "Is it a tweet?", "Is it funny?", "Is it about a no-kids hotel?", "Does it use a pun?", "Is it under 100 words?".
 
-IMPORTANT: Never respond to the prompt given. Only write rubrics.
+IMPORTANT: Do not respond to the <user_prompt>. Only generate the rubric questions for the prompt.
 
 # Output format. Write your final output in JSON according to this schema:
 
@@ -1113,7 +1113,7 @@
 """
 
 MULTIMODAL_UNDERSTANDING_RUBRIC_GENERATION_PROMPT_TEMPLATE = """# Instructions
-Your task is to generate a rubric that can be used to evaluate the image understanding quality of responses generated by an AI model. Specifically, to generate rubrics for an image (<image>) and user prompt (<user_prompt>) that describes the properties that should hold for a good response to that prompt. Generate the rubric following the provided guidelines.
+Your task is to generate a rubric that can be used to evaluate the image understanding quality of responses generated by an AI model. Specifically, to generate rubrics for an image and user prompt that describes the properties that should hold for a good response to that prompt. Generate the rubric following the provided guidelines.
 
 First, describe the contents of the image thoroughly, making sure to document all of the important objects and their interactions with each other and the scenery. Then, thoroughly examine the prompt and decompose its individual instructions into a list of yes/no questions. Be as specific and concise as possible for each question. Ensure each question directly relates to the image and infer the connection if it is not explicitly stated.
 
@@ -1149,12 +1149,11 @@
 4. Does the response correctly display the above three properties as a properly formatted JSON list?
 ---
 
-# Output format.
-
-Write your final output in JSON according to this schema:
+Finally, translate the description and questions of your final answer into JSON format according to this schema:
 
 ```json
 {{
+"description": "...",
 "questions": [
 "question 1 ...",
 "question 2 ...",
11631162
}}
11641163
```
11651164
1166-
IMPORTANT: Never respond to the prompt given. Only write rubrics.
1165+
IMPORTANT: Do not respond to the <user_prompt>. Only generate the rubric questions for the prompt.
11671166
11681167
# User input:
11691168
1170-
<MM_IMAGE>
1171-
{image}
1172-
</MM_IMAGE>
1169+
Image:
1170+
<MM_IMAGE>{image}</MM_IMAGE>
11731171
11741172
<user_prompt>
11751173
{prompt}
@@ -1224,10 +1222,13 @@
12241222
Evaluation:
12251223
<question>
12261224
"""
1227-
PAIRWISE_MULTIMODAL_UNDERSTANDING_RUBRIC_CRITIQUE_TEMPLATE = """# Instructions
1228-
Your task is to evaluate the image understanding quality of responses generated by an AI model. You will be presented with an image, a user prompt, each model's response to that user prompt, and a series of questions against which the text quality of Response A and Response B will be judged.
1225+
PAIRWISE_MULTIMODAL_UNDERSTANDING_RUBRIC_CRITIQUE_TEMPLATE = """# Instructions:
12291226
1230-
For each response, provide an answer [[YES]] or [[NO]] to each question. Then, display the rubric score as the sum of the number of [[YES]] answers over the total number of questions.
1227+
Your task is to evaluate the image understanding quality of responses generated by two AI models. At the bottom of this system instruction you will be presented with an image, a text description of that image, a user prompt, and the responses of Model A and Model B to that user prompt. You will also be provided a rubric as a numbered list against which Response A and Response B will be judged. Each rubricv question is a list of instructions that each response must follow in order to satisfy the user prompt.
1228+
1229+
# Rubric Scoring:
1230+
1231+
For each response, rephrase every rubric point as a question and answer [[YES]] or [[NO]] to each point. Then, display the rubric grade as the sum of the correct rubric points over the total number of points. Finally, score the response on a scale of 1 to 5 stars based on how enjoyable you think it is for a human to read and understand and state your reasoning.
12311232
12321233
For example, if the rubric questions are:
12331234
[[Rubric]]
@@ -1257,43 +1258,47 @@
 </question>
 
 [[Rubric Score: 2/4]]
+[[Human Enjoyment Rating: 4 stars]]
+[[Human Rating Reason: This response is accurate and has no grammatical errors but feels too verbose and formal.]]
 
 Repeat the above for Response B.
 
-Explain whether you think Response A is better or Response B is better in a paragraph starting with "SxS Rationale 0:". Ground your explanation on the competing rubric scores. When you are finished, review your rationale in the context of the prompt, the responses, and the rubric scores and correct any mistakes you may have made, including your judgment on whether Response A was better or Response B was better. Every time you do this, increase the counter after "SxS Rationale" and output a new paragraph. Do not exceed five (5) iterations.
+# Recursive Self-Refinement:
 
-Finally, state your side-by-side (SxS) Rating on whether Response A was better or Response B was better based on your scores and rationale. Your rating should be one of {{A>B, B>A, A=B}}. Do not output anything else.
+Explain whether you think Response A is better or Response B is better in a paragraph starting with "SxS Rationale 0:". Ground your explanation on the competing rubric grades as well as your justification for the human enjoyment ratings. When you are finished, review your rationale in the context of the prompt, the responses, and the rubric grades and correct any mistakes you may have made, including your judgment on whether Response A was better or Response B was better. Every time you do this, increase the counter after "SxS Rationale" and output a new paragraph. Do not exceed five (5) iterations.
+
+# Final SxS Verdict:
+
+Finally, state your side-by-side (SxS) Rating on whether Response A was better or Response B was better based on your grades and rationale. Your rating should be one of {{A>B, B>A, A=B}}. Do not output anything else.
 
 Example:
-[[SxS Rationale 0: Response B scored higher on the rubric. It correctly identified the type of cuisine and was more acceptable to a human customer.]]
+[[SxS Rationale 0: Response B scored higher on the rubric. It correctly identified the type of cuisine and was more acceptable to a human customer.]]
 
-[[SxS Rationale 1: Response B scored higher on the rubric. It correctly identified the type of cuisine as Italian. The writing style was correct and professional enough and the correctness was more preferable.]]
+[[SxS Rationale 1: Response B scored higher on the rubric and human enjoyment ratings. It correctly identified the type of cuisine as Italian. The writing style was correct and professional enough and the correctness was more preferable.]]
 
-[[SxS Rationale 2: Response B scored higher on the rubric. It correctly identified the type of cuisine as Italian, where Response A mistook the cuisine to be Chinese. The writing style was correct and professional enough and the correctness was more preferable.]]
+[[SxS Rationale 2: Response B scored higher on the rubric and human enjoyment ratings. It correctly identified the type of cuisine as Italian, where Response A mistook the cuisine to be Chinese. The writing style was correct and professional enough and the correctness was more preferable.]]
 
-[[SxS Rating: B > A]]
+[[SxS Rating: B > A]]
 
-# User Inputs, AI-generated Responses, and Rubrics
-## User Inputs
-### Image
-<MM_IMAGE>
-{image}
-</MM_IMAGE>
+# User Inputs, Model Responses, and Rubrics:
 
-### Prompt
-{prompt}
+## Image
+<MM_IMAGE>{image}</MM_IMAGE>
 
-## AI-generated Response
-### Response A
-{baseline_model_response}
+## Description
+**{description}**
 
-### Response B
-{response}
+## User Prompt
+**{prompt}**
 
-## Rubrics
-{rubrics}
+## Response A
+**{baseline_model_response}**
 
-REMEMBER: Your answer will help improve the AI model. It is important to answer the question correctly. Even answering "no" will improve the model!
+## Response B
+**{response}**
+
+## Rubric Questions
+**{rubrics}**
 """
 
 TEXT_QUALITY_RUBRIC_GENERATION_PROMPT_TEMPLATE = """# Instructions
@@ -1355,7 +1360,7 @@
 }}
 ```
 
-IMPORTANT: Never respond to the prompt given. Only write rubrics.
+IMPORTANT: Do not respond to the <user_prompt>. Only generate the rubric questions for the prompt.
 
 # User prompt
 
vertexai/preview/evaluation/metrics/rubric_based_metric.py

+20 -3
@@ -13,6 +13,7 @@
 # limitations under the License.
 #
 
+import collections
 from typing import Union, TYPE_CHECKING
 
 from google.cloud.aiplatform import base
@@ -72,7 +73,7 @@ def generate_rubrics(
             )
             return eval_dataset
 
-        rubrics = _pre_eval_utils._generate_responses_from_gemini_model(
+        responses = _pre_eval_utils._generate_responses_from_gemini_model(
             model,
             eval_dataset,
             self.generation_config.prompt_template,
@@ -81,7 +82,23 @@ def generate_rubrics(
             parsing_fn = self.generation_config.parsing_fn
         else:
             parsing_fn = utils.parse_rubrics
-        parsed_rubrics = [parsing_fn(rubric) for rubric in rubrics]
         dataset_with_rubrics = eval_dataset.copy()
-        dataset_with_rubrics[constants.Dataset.RUBRICS_COLUMN] = parsed_rubrics
+        aggregated = collections.defaultdict(list)
+        for idx, response in enumerate(responses):
+            result = parsing_fn(response)
+            if isinstance(result, dict):
+                questions = result.pop("questions", None)
+                if questions is not None:
+                    aggregated[constants.Dataset.RUBRICS_COLUMN].append(
+                        (idx, questions)
+                    )
+                for key, value in result.items():
+                    aggregated[key].append((idx, value))
+            else:
+                aggregated[constants.Dataset.RUBRICS_COLUMN].append((idx, result))
+        for key, values in aggregated.items():
+            dataset_with_rubrics[key] = None
+            dataset_with_rubrics[key] = dataset_with_rubrics[key].astype(object)
+            for idx, value in values:
+                dataset_with_rubrics.at[idx, key] = value
         return dataset_with_rubrics
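
To make the new aggregation concrete, here is a small self-contained sketch (not from this commit) of how parsed results become dataset columns, mirroring the loop added above. The `"rubrics"` name stands in for `constants.Dataset.RUBRICS_COLUMN`, and the `"desc"` field follows the unit tests in this commit.

```python
# Standalone illustration of the aggregation added to generate_rubrics above.
# The parsed values mimic what the default parser or a custom parsing_fn
# might return: a dict with "questions" plus extra fields, or a plain list.
import collections

import pandas as pd

RUBRICS_COLUMN = "rubrics"  # stands in for constants.Dataset.RUBRICS_COLUMN

eval_dataset = pd.DataFrame({"prompt": ["p1", "p2"]})
parsed_results = [
    {"questions": ["test_rubric1", "test_rubric2"], "desc": "test_desc"},
    ["test_rubric1", "test_rubric2"],  # a plain list still lands in "rubrics"
]

dataset_with_rubrics = eval_dataset.copy()
aggregated = collections.defaultdict(list)
for idx, result in enumerate(parsed_results):
    if isinstance(result, dict):
        questions = result.pop("questions", None)
        if questions is not None:
            aggregated[RUBRICS_COLUMN].append((idx, questions))
        for key, value in result.items():
            aggregated[key].append((idx, value))  # extra fields become columns
    else:
        aggregated[RUBRICS_COLUMN].append((idx, result))

for key, values in aggregated.items():
    dataset_with_rubrics[key] = None
    dataset_with_rubrics[key] = dataset_with_rubrics[key].astype(object)
    for idx, value in values:
        dataset_with_rubrics.at[idx, key] = value

print(dataset_with_rubrics.columns.tolist())  # ['prompt', 'rubrics', 'desc']
```

Rows whose parsed result lacks a given extra field simply keep `None` in that column, which is why the columns are created with object dtype before the per-row assignment.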
