
Commit bb07581

vertex-sdk-bot authored and copybara-github committed

feat: GenAI Evaluation: Release GenAI Evaluation SDK rubric based evaluation to vertexai.preview module.

PiperOrigin-RevId: 742608797

1 parent be2c99f commit bb07581

19 files changed: +2812 -299 lines changed
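
The unit tests added in this commit exercise the new rubric-based metrics through EvalTask. Below is a minimal usage sketch inferred from those tests; the project ID, location, and dataset contents are illustrative assumptions, not values taken from the commit.

# Minimal sketch based on the new unit tests in this commit.
# The project/location and dataset values are placeholder assumptions.
import pandas as pd
import vertexai
from vertexai.preview.evaluation import eval_task
from vertexai.preview.evaluation.metrics import predefined_rubric_metrics

vertexai.init(project="my-project", location="us-central1")

# A small pointwise evaluation dataset with prompt/response columns,
# mirroring the shape of the test fixtures.
eval_dataset = pd.DataFrame(
    {
        "prompt": ["Write a haiku about the ocean."],
        "response": ["Waves fold into foam..."],
    }
)

# One of the predefined rubric-based metrics released in this change.
metric = predefined_rubric_metrics.PredefinedRubricMetrics.Pointwise.INSTRUCTION_FOLLOWING

eval_result = eval_task.EvalTask(
    dataset=eval_dataset,
    metrics=[metric],
).evaluate()

# Per the tests, the result table includes rubric columns such as "rubrics",
# "rb_instruction_following/score", and "rb_instruction_following/rubric_verdict_pairs".
print(eval_result.metrics_table.columns.tolist())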

tests/unit/vertexai/test_evaluation.py (+604)

Large diffs are not rendered by default.

tests/unit/vertexai/test_multimodal_utils.py (+29 -1)

@@ -33,7 +33,7 @@
 _TEST_LOCATION = "us-central1"
 
 _MODEL_BASED_METRIC_INSTANCE_INPUT = {
-    "prompt": '{"contents": [{"parts": [{"text": "test prompt"}]}]}',
+    "prompt": "test prompt",
     "response": (
         '{"contents": [{"parts": [{"file_data": {"mime_type": "image/png",'
         ' "file_uri": "gs://test-bucket/image1.png"}}]}]}'
@@ -102,3 +102,31 @@ def test_convert_multimodal_response_to_content_map(self):
                 )
             ]
         )
+
+    def test_split_metric_prompt_template(self):
+        metric_prompt_template = "This prompt uses {prompt} and {response}."
+        placeholders = ["prompt", "response", "baseline_response"]
+        split_metric_prompt_template = multimodal_utils._split_metric_prompt_template(
+            metric_prompt_template, placeholders
+        )
+        assert split_metric_prompt_template == [
+            "This prompt uses ",
+            "{prompt}",
+            " and ",
+            "{response}",
+            ".",
+        ]
+
+    def test_assemble_multi_modal_prompt_gemini_format(self):
+        prompt_template = """This {prompt} uses the {response}."""
+        prompt = multimodal_utils._assemble_multi_modal_prompt(
+            prompt_template,
+            _MODEL_BASED_METRIC_INSTANCE_INPUT,
+            1,
+            ["response", "prompt"],
+        )
+        assert prompt[0] == "This "
+        assert prompt[1] == "test prompt"
+        assert prompt[2] == " uses the "
+        assert prompt[3].file_data.file_uri == "gs://test-bucket/image1.png"
+        assert prompt[4] == "."
@@ -0,0 +1,346 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from unittest import mock
+
+from google import auth
+from google.auth import credentials as auth_credentials
+import vertexai
+from google.cloud.aiplatform import initializer
+from google.cloud.aiplatform_v1beta1.services import (
+    evaluation_service as gapic_evaluation_services,
+)
+from google.cloud.aiplatform_v1beta1.types import (
+    evaluation_service as gapic_evaluation_service_types,
+)
+from vertexai import generative_models
+from vertexai.preview.evaluation import eval_task
+from vertexai.preview.evaluation.metrics import (
+    predefined_rubric_metrics,
+)
+import pandas as pd
+import pytest
+
+
+PredefinedRubricMetrics = predefined_rubric_metrics.PredefinedRubricMetrics
+EvalTask = eval_task.EvalTask
+_TEST_PROJECT = "test-project"
+_TEST_LOCATION = "us-central1"
+_TEST_EVAL_DATASET = pd.DataFrame(
+    {
+        "prompt": ["test_prompt", "text_prompt", "test_prompt_3"],
+        "response": ["test", "text", "test_response_3"],
+    }
+)
+_TEST_PAIRWISE_EVAL_DATASET = pd.DataFrame(
+    {
+        "prompt": ["test_prompt", "text_prompt", "test_prompt_3"],
+        "response": ["test", "text", "test_response_3"],
+        "baseline_model_response": ["test", "text", "test_response_3"],
+    }
+)
+_TEST_MULTIMODAL_EVAL_DATASET = pd.DataFrame(
+    {
+        "prompt": ["test_prompt", "text_prompt"],
+        "image": [
+            (
+                '{"contents": [{"parts": [{"file_data": {"mime_type": "image/png",'
+                ' "file_uri": "gs://test-bucket/image3.png"}}]}]}'
+            ),
+            (
+                '{"contents": [{"parts": [{"file_data": {"mime_type": "image/png",'
+                ' "file_uri": "gs://test-bucket/image4.png"}}]}]}'
+            ),
+        ],
+        "response": ["test", "text"],
+    }
+)
+_TEST_PAIRWISE_MULTIMODAL_EVAL_DATASET = pd.DataFrame(
+    {
+        "prompt": ["test_prompt", "text_prompt"],
+        "image": [
+            (
+                '{"contents": [{"parts": [{"file_data": {"mime_type": "image/png",'
+                ' "file_uri": "gs://test-bucket/image3.png"}}]}]}'
+            ),
+            (
+                '{"contents": [{"parts": [{"file_data": {"mime_type": "image/png",'
+                ' "file_uri": "gs://test-bucket/image4.png"}}]}]}'
+            ),
+        ],
+        "response": ["test", "text"],
+        "baseline_model_response": ["test", "text"],
+    }
+)
+_MOCK_RUBRIC_GENERATION_RESPONSE = generative_models.GenerationResponse.from_dict(
+    {
+        "candidates": [
+            {
+                "content": {
+                    "parts": [{"text": """```json{"questions": ["test_rubric"]}```"""}]
+                },
+            }
+        ]
+    }
+)
+_MOCK_POINTWISE_RESULT = gapic_evaluation_service_types.PointwiseMetricResult(
+    custom_output=gapic_evaluation_service_types.CustomOutput(
+        raw_outputs=gapic_evaluation_service_types.RawOutput(
+            raw_output=[
+                """The appropriate rubrics for this prompt are:
+<question>
+STEP 1: ...
+Question: question 1
+Verdict: yes
+</question>
+<question>
+STEP 1: ...
+Question: question 2
+Verdict: no
+</question>""",
+                """The appropriate rubrics for this prompt are:
+<question>
+STEP 1: ...
+Question: question 1
+Verdict: yes
+</question>
+<question>
+STEP 1: ...
+Question: question 2
+Verdict: no
+</question>""",
+            ],
+        ),
+    )
+)
+_MOCK_PAIRWISE_RESULT = gapic_evaluation_service_types.PairwiseMetricResult(
+    custom_output=gapic_evaluation_service_types.CustomOutput(
+        raw_outputs=gapic_evaluation_service_types.RawOutput(
+            raw_output=[
+                """"[[Response A Answers:]]\n"
+"Response A\n"
+"[[Rubric Score:]]\n"
+"Rubric Score\n"
+"[[Response B Answers:]]\n"
+"Response A\n"
+"[[Rubric Score:]]\n"
+"Rubric Score\n"
+"[[SxS Rating: B > A]]""",
+                """"[[Response A Answers:]]\n"
+"Response A\n"
+"[[Rubric Score:]]\n"
+"Rubric Score\n"
+"[[Response B Answers:]]\n"
+"Response A\n"
+"[[Rubric Score:]]\n"
+"Rubric Score\n"
+"[[SxS Rating: B > A]]""",
+            ],
+        ),
+    )
+)
+_MOCK_PAIRWISE_RESPONSE = (
+    gapic_evaluation_service_types.EvaluateInstancesResponse(
+        pairwise_metric_result=_MOCK_PAIRWISE_RESULT
+    ),
+    gapic_evaluation_service_types.EvaluateInstancesResponse(
+        pairwise_metric_result=_MOCK_PAIRWISE_RESULT
+    ),
+)
+_MOCK_POINTWISE_RESPONSE = (
+    gapic_evaluation_service_types.EvaluateInstancesResponse(
+        pointwise_metric_result=_MOCK_POINTWISE_RESULT
+    ),
+    gapic_evaluation_service_types.EvaluateInstancesResponse(
+        pointwise_metric_result=_MOCK_POINTWISE_RESULT
+    ),
+)
+
+
+@pytest.fixture(scope="module")
+def google_auth_mock():
+    with mock.patch.object(auth, "default") as google_auth_mock:
+        google_auth_mock.return_value = (
+            auth_credentials.AnonymousCredentials(),
+            _TEST_PROJECT,
+        )
+        yield google_auth_mock
+
+
+@pytest.mark.usefixtures("google_auth_mock")
+class TestPredefinedRubricMetrics:
+    def setup_method(self):
+        vertexai.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+
+    def teardown_method(self):
+        initializer.global_pool.shutdown(wait=True)
+
+    def test_pointwise_instruction_following_metric(self):
+        metric = PredefinedRubricMetrics.Pointwise.INSTRUCTION_FOLLOWING
+        mock_model = mock.create_autospec(
+            generative_models.GenerativeModel, instance=True
+        )
+        mock_model.generate_content.return_value = _MOCK_RUBRIC_GENERATION_RESPONSE
+        mock_model._model_name = "publishers/google/model/gemini-1.0-pro"
+        with mock.patch.object(
+            target=gapic_evaluation_services.EvaluationServiceClient,
+            attribute="evaluate_instances",
+            side_effect=_MOCK_POINTWISE_RESPONSE,
+        ):
+            eval_result = EvalTask(
+                dataset=_TEST_EVAL_DATASET, metrics=[metric]
+            ).evaluate()
+        assert eval_result.metrics_table.columns.tolist() == [
+            "prompt",
+            "response",
+            "rubrics",
+            "rb_instruction_following/score",
+            "rb_instruction_following/rubric_verdict_pairs",
+            "rb_instruction_following/raw_outputs",
+        ]
+
+    def test_pairwise_instruction_following_metric(self):
+        metric = PredefinedRubricMetrics.Pairwise.INSTRUCTION_FOLLOWING
+        mock_model = mock.create_autospec(
+            generative_models.GenerativeModel, instance=True
+        )
+        mock_model.generate_content.return_value = _MOCK_RUBRIC_GENERATION_RESPONSE
+        mock_model._model_name = "publishers/google/model/gemini-1.0-pro"
+        with mock.patch.object(
+            target=gapic_evaluation_services.EvaluationServiceClient,
+            attribute="evaluate_instances",
+            side_effect=_MOCK_PAIRWISE_RESPONSE,
+        ):
+            eval_result = EvalTask(
+                dataset=_TEST_PAIRWISE_EVAL_DATASET, metrics=[metric]
+            ).evaluate()
+        assert eval_result.metrics_table.columns.tolist() == [
+            "prompt",
+            "response",
+            "baseline_model_response",
+            "rubrics",
+            "pairwise_rb_instruction_following/pairwise_choice",
+            "pairwise_rb_instruction_following/score",
+            "pairwise_rb_instruction_following/baseline_rubric_verdict_pairs",
+            "pairwise_rb_instruction_following/candidate_rubric_verdict_pairs",
+            "pairwise_rb_instruction_following/raw_outputs",
+        ]
+
+    def test_pointwise_text_quality_metric(self):
+        metric = PredefinedRubricMetrics.Pointwise.TEXT_QUALITY
+        mock_model = mock.create_autospec(
+            generative_models.GenerativeModel, instance=True
+        )
+        mock_model.generate_content.return_value = _MOCK_RUBRIC_GENERATION_RESPONSE
+        mock_model._model_name = "publishers/google/model/gemini-1.0-pro"
+        with mock.patch.object(
+            target=gapic_evaluation_services.EvaluationServiceClient,
+            attribute="evaluate_instances",
+            side_effect=_MOCK_POINTWISE_RESPONSE,
+        ):
+            eval_result = EvalTask(
+                dataset=_TEST_EVAL_DATASET, metrics=[metric]
+            ).evaluate()
+        assert eval_result.metrics_table.columns.tolist() == [
+            "prompt",
+            "response",
+            "rubrics",
+            "rb_text_quality/score",
+            "rb_text_quality/rubric_verdict_pairs",
+            "rb_text_quality/raw_outputs",
+        ]
+
+    def test_pairwise_text_quality_metric(self):
+        metric = PredefinedRubricMetrics.Pairwise.TEXT_QUALITY
+        mock_model = mock.create_autospec(
+            generative_models.GenerativeModel, instance=True
+        )
+        mock_model.generate_content.return_value = _MOCK_RUBRIC_GENERATION_RESPONSE
+        mock_model._model_name = "publishers/google/model/gemini-1.0-pro"
+        with mock.patch.object(
+            target=gapic_evaluation_services.EvaluationServiceClient,
+            attribute="evaluate_instances",
+            side_effect=_MOCK_PAIRWISE_RESPONSE,
+        ):
+            eval_result = EvalTask(
+                dataset=_TEST_PAIRWISE_EVAL_DATASET, metrics=[metric]
+            ).evaluate()
+        assert eval_result.metrics_table.columns.tolist() == [
+            "prompt",
+            "response",
+            "baseline_model_response",
+            "rubrics",
+            "pairwise_rb_text_quality/pairwise_choice",
+            "pairwise_rb_text_quality/score",
+            "pairwise_rb_text_quality/baseline_rubric_verdict_pairs",
+            "pairwise_rb_text_quality/candidate_rubric_verdict_pairs",
+            "pairwise_rb_text_quality/raw_outputs",
+        ]
+
+    def test_pointwise_multimodal_understanding_metric(self):
+        metric = PredefinedRubricMetrics.Pointwise.MULTIMODAL_UNDERSTANDING
+        mock_model = mock.create_autospec(
+            generative_models.GenerativeModel, instance=True
+        )
+        mock_model.generate_content.return_value = _MOCK_RUBRIC_GENERATION_RESPONSE
+        mock_model._model_name = "publishers/google/model/gemini-1.0-pro"
+        with mock.patch.object(
+            target=gapic_evaluation_services.EvaluationServiceClient,
+            attribute="evaluate_instances",
+            side_effect=_MOCK_POINTWISE_RESPONSE,
+        ):
+            eval_result = EvalTask(
+                dataset=_TEST_MULTIMODAL_EVAL_DATASET, metrics=[metric]
+            ).evaluate()
+        assert eval_result.metrics_table.columns.tolist() == [
+            "prompt",
+            "image",
+            "response",
+            "rubrics",
+            "rb_multimodal_understanding/score",
+            "rb_multimodal_understanding/rubric_verdict_pairs",
+            "rb_multimodal_understanding/raw_outputs",
+        ]
+
+    def test_pairwise_multimodal_understanding_metric(self):
+        metric = PredefinedRubricMetrics.Pairwise.MULTIMODAL_UNDERSTANDING
+        mock_model = mock.create_autospec(
+            generative_models.GenerativeModel, instance=True
+        )
+        mock_model.generate_content.return_value = _MOCK_RUBRIC_GENERATION_RESPONSE
+        mock_model._model_name = "publishers/google/model/gemini-1.0-pro"
+        with mock.patch.object(
+            target=gapic_evaluation_services.EvaluationServiceClient,
+            attribute="evaluate_instances",
+            side_effect=_MOCK_PAIRWISE_RESPONSE,
+        ):
+            eval_result = EvalTask(
+                dataset=_TEST_PAIRWISE_MULTIMODAL_EVAL_DATASET, metrics=[metric]
+            ).evaluate()
+        assert eval_result.metrics_table.columns.tolist() == [
+            "prompt",
+            "image",
+            "response",
+            "baseline_model_response",
+            "rubrics",
+            "pairwise_rb_multimodal_understanding/pairwise_choice",
+            "pairwise_rb_multimodal_understanding/score",
+            "pairwise_rb_multimodal_understanding/baseline_rubric_verdict_pairs",
+            "pairwise_rb_multimodal_understanding/candidate_rubric_verdict_pairs",
+            "pairwise_rb_multimodal_understanding/raw_outputs",
+        ]
