
Commit 38ec40a

sararob authored and copybara-github committed
feat: add support for providing only text to MultiModalEmbeddingModel.get_embeddings()
PiperOrigin-RevId: 553809703
1 parent ff47513 commit 38ec40a

2 files changed: 42 additions, 8 deletions

tests/unit/aiplatform/test_vision_models.py (+30)
@@ -264,3 +264,33 @@ def test_image_embedding_model_with_image_and_text(self):
 
         assert embedding_response.image_embedding == test_embeddings
         assert embedding_response.text_embedding == test_embeddings
+
+    def test_image_embedding_model_with_only_text(self):
+        aiplatform.init(
+            project=_TEST_PROJECT,
+            location=_TEST_LOCATION,
+        )
+        with mock.patch.object(
+            target=model_garden_service_client.ModelGardenServiceClient,
+            attribute="get_publisher_model",
+            return_value=gca_publisher_model.PublisherModel(
+                _IMAGE_EMBEDDING_PUBLISHER_MODEL_DICT
+            ),
+        ):
+            model = vision_models.MultiModalEmbeddingModel.from_pretrained(
+                "multimodalembedding@001"
+            )
+
+        test_embeddings = [0, 0]
+        gca_predict_response = gca_prediction_service.PredictResponse()
+        gca_predict_response.predictions.append({"textEmbedding": test_embeddings})
+
+        with mock.patch.object(
+            target=prediction_service_client.PredictionServiceClient,
+            attribute="predict",
+            return_value=gca_predict_response,
+        ):
+            embedding_response = model.get_embeddings(contextual_text="hello world")
+
+        assert not embedding_response.image_embedding
+        assert embedding_response.text_embedding == test_embeddings
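To exercise just this new test locally, a pytest invocation along these lines should work (the file path is the one shown above; the -k expression simply selects the test by name):

    pytest tests/unit/aiplatform/test_vision_models.py -k test_image_embedding_model_with_only_text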

vertexai/vision_models/_vision_models.py (+12, -8)
@@ -234,28 +234,32 @@ class MultiModalEmbeddingModel(_model_garden_models._ModelGardenModel):
     )
 
     def get_embeddings(
-        self, image: Image, contextual_text: Optional[str] = None
+        self, image: Optional[Image] = None, contextual_text: Optional[str] = None
     ) -> "MultiModalEmbeddingResponse":
         """Gets embedding vectors from the provided image.
 
         Args:
             image (Image):
-                The image to generate embeddings for.
+                Optional. The image to generate embeddings for. One of `image` or `contextual_text` is required.
             contextual_text (str):
                 Optional. Contextual text for your input image. If provided, the model will also
                 generate an embedding vector for the provided contextual text. The returned image
                 and text embedding vectors are in the same semantic space with the same dimensionality,
                 and the vectors can be used interchangeably for use cases like searching image by text
-                or searching text by image.
+                or searching text by image. One of `image` or `contextual_text` is required.
 
         Returns:
             ImageEmbeddingResponse:
                 The image and text embedding vectors.
         """
 
-        instance = {
-            "image": {"bytesBase64Encoded": image._as_base64_string()},
-        }
+        if not image and not contextual_text:
+            raise ValueError("One of `image` or `contextual_text` is required.")
+
+        instance = {}
+
+        if image:
+            instance["image"] = {"bytesBase64Encoded": image._as_base64_string()}
 
         if contextual_text:
             instance["text"] = contextual_text

@@ -280,11 +284,11 @@ class MultiModalEmbeddingResponse:
 
     Attributes:
         image_embedding (List[float]):
-            The emebedding vector generated from your image.
+            Optional. The embedding vector generated from your image.
        text_embedding (List[float]):
            Optional. The embedding vector generated from the contextual text provided for your image.
    """
 
-    image_embedding: List[float]
     _prediction_response: Any
+    image_embedding: Optional[List[float]] = None
     text_embedding: Optional[List[float]] = None
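Taken together, the change lets callers request a text-only embedding. A minimal usage sketch, assuming the public vertexai.vision_models package re-exports MultiModalEmbeddingModel and that a real project and location are available (the values below are placeholders, not part of this commit):

    from google.cloud import aiplatform
    from vertexai.vision_models import MultiModalEmbeddingModel

    # Placeholder project settings; substitute your own.
    aiplatform.init(project="my-project", location="us-central1")

    # Same publisher model name as in the unit test above.
    model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding@001")

    # New in this commit: `image` may be omitted entirely.
    response = model.get_embeddings(contextual_text="hello world")

    # With no image supplied, image_embedding is left unset (it now defaults to None).
    assert not response.image_embedding
    print(response.text_embedding)

Before this commit, omitting `image` failed with a TypeError because it was a required parameter; now either argument alone is accepted, and passing neither raises the ValueError added above.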
