
Commit bbd4a49

happy-qiao authored and copybara-github committed
fix: Avoid throwing an error when Part.text is empty in modality content checks
PiperOrigin-RevId: 650788670
1 parent fcdcc11 · commit bbd4a49
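
For context: the previous check relied on Python truthiness of the text field, and an empty string is falsy, so a legitimate text Part whose content happened to be "" was rejected as non-text. A minimal sketch of the pitfall:

    part_text = ""  # a valid text part whose content is empty
    if not part_text:  # True for "" just as for genuinely non-text parts
        raise ValueError("Tokenizers do not support non-text content types.")

The fix below inverts the test: rather than requiring text to be non-empty, it raises only when one of the non-text fields is actually set.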

File tree: 2 files changed, +30 -7 lines changed


tests/unit/vertexai/test_tokenization.py (+22 -6)
@@ -16,8 +16,8 @@
 import hashlib
 import io
 import os
-import tempfile
 import shutil
+import tempfile
 from typing import List
 from unittest import mock
 from vertexai.generative_models import Content, Image, Part
@@ -27,8 +27,11 @@
     get_tokenizer_for_model,
 )
 import pytest
-from sentencepiece import sentencepiece_model_pb2
 import sentencepiece as spm
+from sentencepiece import sentencepiece_model_pb2
+from google.cloud.aiplatform_v1beta1.types import (
+    content as gapic_content_types,
+)
 
 _TOKENIZER_NAME = "google/gemma"
 _MODEL_NAME = "gemini-1.5-pro"
@@ -63,9 +66,14 @@
         [
             Part.from_text(_SENTENCE_1),
             Part.from_text(_SENTENCE_2),
+            Part.from_text(_EMPTY_SENTENCE),
+        ],
+        [_SENTENCE_1, _SENTENCE_2, _EMPTY_SENTENCE],
+        [
+            _TOKENS_MAP[_SENTENCE_1]["ids"],
+            _TOKENS_MAP[_SENTENCE_2]["ids"],
+            _TOKENS_MAP[_EMPTY_SENTENCE]["ids"],
         ],
-        [_SENTENCE_1, _SENTENCE_2],
-        [_TOKENS_MAP[_SENTENCE_1]["ids"], _TOKENS_MAP[_SENTENCE_2]["ids"]],
     ),
     (
         Content(role="user", parts=[Part.from_text(_SENTENCE_1)]),
@@ -78,10 +86,15 @@
             parts=[
                 Part.from_text(_SENTENCE_1),
                 Part.from_text(_SENTENCE_2),
+                Part.from_text(_EMPTY_SENTENCE),
             ],
         ),
-        [_SENTENCE_1, _SENTENCE_2],
-        [_TOKENS_MAP[_SENTENCE_1]["ids"], _TOKENS_MAP[_SENTENCE_2]["ids"]],
+        [_SENTENCE_1, _SENTENCE_2, _EMPTY_SENTENCE],
+        [
+            _TOKENS_MAP[_SENTENCE_1]["ids"],
+            _TOKENS_MAP[_SENTENCE_2]["ids"],
+            _TOKENS_MAP[_EMPTY_SENTENCE]["ids"],
+        ],
     ),
     (
         [
@@ -128,6 +141,9 @@
 
 
 _LIST_OF_UNSUPPORTED_CONTENTS = [
+    gapic_content_types.Part(
+        video_metadata=gapic_content_types.VideoMetadata(start_offset="10s")
+    ),
     Part.from_uri("gs://bucket/object", mime_type="mime_type"),
     Part.from_data(b"inline_data_bytes", mime_type="mime_type"),
     Part.from_dict({"function_call": {"name": "test_function_call"}}),
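
For illustration, the new parametrized cases run empty-text parts through tokenization alongside normal sentences. A minimal usage sketch, assuming _EMPTY_SENTENCE is defined elsewhere in the test module as the empty string and using the library's public tokenization entry point:

    from vertexai.preview import tokenization
    from vertexai.generative_models import Part

    tokenizer = tokenization.get_tokenizer_for_model("gemini-1.5-pro")
    # With this fix, an empty text part contributes zero tokens instead of
    # triggering the non-text-content ValueError.
    result = tokenizer.count_tokens(
        [Part.from_text("hello world"), Part.from_text("")]
    )
    print(result.total_tokens)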

vertexai/tokenization/_tokenizers.py (+8 -1)
@@ -33,6 +33,7 @@
 )
 from google.cloud.aiplatform_v1beta1.types import (
     content as gapic_content_types,
+    tool as gapic_tool_types,
 )
 
 
@@ -120,7 +121,13 @@ def _assert_text_only_content_types_sequence(
 
 def _assert_text_only_gapic_part(value: gapic_content_types.Part):
     """Asserts that the gapic content part is a text content type."""
-    if not value.text:
+    if (
+        gapic_content_types.FileData() != value.file_data
+        or gapic_content_types.Blob() != value.inline_data
+        or gapic_tool_types.FunctionCall() != value.function_call
+        or gapic_tool_types.FunctionResponse() != value.function_response
+        or gapic_content_types.VideoMetadata() != value.video_metadata
+    ):
         raise ValueError("Tokenizers do not support non-text content types.")