
Commit bbd4a49

happy-qiao authored and copybara-github committed
fix: Avoid throwing an error when Part.text is empty in modality content checks
PiperOrigin-RevId: 650788670
1 parent fcdcc11 · commit bbd4a49
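
For context: the previous check relied on Python truthiness of the text field, and an empty string is falsy, so a legitimate text Part whose content happened to be "" was rejected as non-text. A minimal sketch of the pitfall:

    part_text = ""  # a valid text part whose content is empty
    if not part_text:  # True for "" just as for genuinely non-text parts
        raise ValueError("Tokenizers do not support non-text content types.")

The fix below inverts the test: rather than requiring text to be non-empty, it raises only when one of the non-text fields is actually set.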

File tree: 2 files changed, +30 -7 lines changed


tests/unit/vertexai/test_tokenization.py (+22 -6)
@@ -16,8 +16,8 @@
 import hashlib
 import io
 import os
-import tempfile
 import shutil
+import tempfile
 from typing import List
 from unittest import mock
 from vertexai.generative_models import Content, Image, Part
@@ -27,8 +27,11 @@
     get_tokenizer_for_model,
 )
 import pytest
-from sentencepiece import sentencepiece_model_pb2
 import sentencepiece as spm
+from sentencepiece import sentencepiece_model_pb2
+from google.cloud.aiplatform_v1beta1.types import (
+    content as gapic_content_types,
+)
 
 _TOKENIZER_NAME = "google/gemma"
 _MODEL_NAME = "gemini-1.5-pro"
@@ -63,9 +66,14 @@
         [
             Part.from_text(_SENTENCE_1),
             Part.from_text(_SENTENCE_2),
+            Part.from_text(_EMPTY_SENTENCE),
+        ],
+        [_SENTENCE_1, _SENTENCE_2, _EMPTY_SENTENCE],
+        [
+            _TOKENS_MAP[_SENTENCE_1]["ids"],
+            _TOKENS_MAP[_SENTENCE_2]["ids"],
+            _TOKENS_MAP[_EMPTY_SENTENCE]["ids"],
         ],
-        [_SENTENCE_1, _SENTENCE_2],
-        [_TOKENS_MAP[_SENTENCE_1]["ids"], _TOKENS_MAP[_SENTENCE_2]["ids"]],
     ),
     (
         Content(role="user", parts=[Part.from_text(_SENTENCE_1)]),
@@ -78,10 +86,15 @@
             parts=[
                 Part.from_text(_SENTENCE_1),
                 Part.from_text(_SENTENCE_2),
+                Part.from_text(_EMPTY_SENTENCE),
             ],
         ),
-        [_SENTENCE_1, _SENTENCE_2],
-        [_TOKENS_MAP[_SENTENCE_1]["ids"], _TOKENS_MAP[_SENTENCE_2]["ids"]],
+        [_SENTENCE_1, _SENTENCE_2, _EMPTY_SENTENCE],
+        [
+            _TOKENS_MAP[_SENTENCE_1]["ids"],
+            _TOKENS_MAP[_SENTENCE_2]["ids"],
+            _TOKENS_MAP[_EMPTY_SENTENCE]["ids"],
+        ],
     ),
     (
         [
@@ -128,6 +141,9 @@
 
 
 _LIST_OF_UNSUPPORTED_CONTENTS = [
+    gapic_content_types.Part(
+        video_metadata=gapic_content_types.VideoMetadata(start_offset="10s")
+    ),
     Part.from_uri("gs://bucket/object", mime_type="mime_type"),
     Part.from_data(b"inline_data_bytes", mime_type="mime_type"),
     Part.from_dict({"function_call": {"name": "test_function_call"}}),
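
For illustration, the new parametrized cases run empty-text parts through tokenization alongside normal sentences. A minimal usage sketch, assuming _EMPTY_SENTENCE is defined elsewhere in the test module as the empty string and using the library's public tokenization entry point:

    from vertexai.preview import tokenization
    from vertexai.generative_models import Part

    tokenizer = tokenization.get_tokenizer_for_model("gemini-1.5-pro")
    # With this fix, an empty text part contributes zero tokens instead of
    # triggering the non-text-content ValueError.
    result = tokenizer.count_tokens(
        [Part.from_text("hello world"), Part.from_text("")]
    )
    print(result.total_tokens)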

vertexai/tokenization/_tokenizers.py (+8 -1)
@@ -33,6 +33,7 @@
 )
 from google.cloud.aiplatform_v1beta1.types import (
     content as gapic_content_types,
+    tool as gapic_tool_types,
 )
 
 
@@ -120,7 +121,13 @@ def _assert_text_only_content_types_sequence(
 
 def _assert_text_only_gapic_part(value: gapic_content_types.Part):
     """Asserts that the gapic content part is a text content type."""
-    if not value.text:
+    if (
+        gapic_content_types.FileData() != value.file_data
+        or gapic_content_types.Blob() != value.inline_data
+        or gapic_tool_types.FunctionCall() != value.function_call
+        or gapic_tool_types.FunctionResponse() != value.function_response
+        or gapic_content_types.VideoMetadata() != value.video_metadata
+    ):
         raise ValueError("Tokenizers do not support non-text content types.")