
Commit 3a8348b

Ark-kun authored and copybara-github committed
feat: LLM - Support streaming prediction for code generation models
PiperOrigin-RevId: 558312759
1 parent 705e1ea commit 3a8348b

File tree

3 files changed

+132 −5 lines changed
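
For context, here is a minimal usage sketch of the streaming API this commit adds, mirroring the new system test below. The project and location values are placeholders; the model name and sampling parameters are taken from the tests.

from google.cloud import aiplatform
from vertexai import language_models

# Placeholder project/location; substitute your own.
aiplatform.init(project="my-project", location="us-central1")

model = language_models.CodeGenerationModel.from_pretrained("code-bison@001")

# predict_streaming() yields partial TextGenerationResponse objects as the
# model produces them, instead of returning one final response.
for response in model.predict_streaming(
    prefix="def reverse_string(s):",
    suffix=" return s",
    max_output_tokens=128,
    temperature=0,
):
    print(response.text, end="")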

tests/system/aiplatform/test_language_models.py

+14
@@ -22,6 +22,7 @@
     job_state as gca_job_state,
 )
 from tests.system.aiplatform import e2e_base
+from vertexai import language_models
 from vertexai.preview.language_models import (
     ChatModel,
     InputOutputTextPair,
@@ -251,3 +252,16 @@ def test_batch_prediction_for_textembedding(self):
         job.delete()

         assert gapic_job.state == gca_job_state.JobState.JOB_STATE_SUCCEEDED
+
+    def test_code_generation_streaming(self):
+        aiplatform.init(project=e2e_base._PROJECT, location=e2e_base._LOCATION)
+
+        model = language_models.CodeGenerationModel.from_pretrained("code-bison@001")
+
+        for response in model.predict_streaming(
+            prefix="def reverse_string(s):",
+            suffix=" return s",
+            max_output_tokens=128,
+            temperature=0,
+        ):
+            assert response.text

tests/unit/aiplatform/test_language_models.py

+33
@@ -2068,6 +2068,39 @@ def test_code_completion(self):
         assert "temperature" not in prediction_parameters
         assert prediction_parameters["maxOutputTokens"] == default_max_output_tokens

+    def test_code_generation_model_predict_streaming(self):
+        """Tests the TextGenerationModel.predict_streaming method."""
+        with mock.patch.object(
+            target=model_garden_service_client.ModelGardenServiceClient,
+            attribute="get_publisher_model",
+            return_value=gca_publisher_model.PublisherModel(
+                _CODE_GENERATION_BISON_PUBLISHER_MODEL_DICT
+            ),
+        ):
+            model = language_models.CodeGenerationModel.from_pretrained(
+                "code-bison@001"
+            )
+
+        response_generator = (
+            gca_prediction_service.StreamingPredictResponse(
+                outputs=[_streaming_prediction.value_to_tensor(response_dict)]
+            )
+            for response_dict in _TEST_TEXT_GENERATION_PREDICTION_STREAMING
+        )
+
+        with mock.patch.object(
+            target=prediction_service_client.PredictionServiceClient,
+            attribute="server_streaming_predict",
+            return_value=response_generator,
+        ):
+            for response in model.predict_streaming(
+                prefix="def reverse_string(s):",
+                suffix=" return s",
+                max_output_tokens=1000,
+                temperature=0,
+            ):
+                assert len(response.text) > 10
+
     def test_text_embedding(self):
         """Tests the text embedding model."""
         aiplatform.init(

vertexai/language_models/_language_models.py

+85 −5

@@ -59,6 +59,13 @@ def _get_model_id_from_tuning_model_id(tuning_model_id: str) -> str:
     return f"publishers/google/models/{model_name}@{version}"


+@dataclasses.dataclass
+class _PredictionRequest:
+    """A single-instance prediction request."""
+    instance: Dict[str, Any]
+    parameters: Optional[Dict[str, Any]] = None
+
+
 class _LanguageModel(_model_garden_models._ModelGardenModel):
     """_LanguageModel is a base class for all language models."""

@@ -1250,15 +1257,15 @@ class CodeGenerationModel(_LanguageModel):
     _LAUNCH_STAGE = _model_garden_models._SDK_GA_LAUNCH_STAGE
     _DEFAULT_MAX_OUTPUT_TOKENS = 128

-    def predict(
+    def _create_prediction_request(
         self,
         prefix: str,
         suffix: Optional[str] = None,
         *,
         max_output_tokens: Optional[int] = _DEFAULT_MAX_OUTPUT_TOKENS,
         temperature: Optional[float] = None,
-    ) -> "TextGenerationResponse":
-        """Gets model response for a single prompt.
+    ) -> _PredictionRequest:
+        """Creates a code generation prediction request.

         Args:
             prefix: Code before the current point.
@@ -1281,16 +1288,89 @@ def predict(
         if max_output_tokens:
             prediction_parameters["maxOutputTokens"] = max_output_tokens

+        return _PredictionRequest(instance=instance, parameters=prediction_parameters)
+
+    def predict(
+        self,
+        prefix: str,
+        suffix: Optional[str] = None,
+        *,
+        max_output_tokens: Optional[int] = _DEFAULT_MAX_OUTPUT_TOKENS,
+        temperature: Optional[float] = None,
+    ) -> "TextGenerationResponse":
+        """Gets model response for a single prompt.
+
+        Args:
+            prefix: Code before the current point.
+            suffix: Code after the current point.
+            max_output_tokens: Max length of the output text in tokens. Range: [1, 1000].
+            temperature: Controls the randomness of predictions. Range: [0, 1].
+
+        Returns:
+            A `TextGenerationResponse` object that contains the text produced by the model.
+        """
+        prediction_request = self._create_prediction_request(
+            prefix=prefix,
+            suffix=suffix,
+            max_output_tokens=max_output_tokens,
+            temperature=temperature,
+        )
+
         prediction_response = self._endpoint.predict(
-            instances=[instance],
-            parameters=prediction_parameters,
+            instances=[prediction_request.instance],
+            parameters=prediction_request.parameters,
         )

         return TextGenerationResponse(
             text=prediction_response.predictions[0]["content"],
             _prediction_response=prediction_response,
         )

+    def predict_streaming(
+        self,
+        prefix: str,
+        suffix: Optional[str] = None,
+        *,
+        max_output_tokens: Optional[int] = _DEFAULT_MAX_OUTPUT_TOKENS,
+        temperature: Optional[float] = None,
+    ) -> Iterator[TextGenerationResponse]:
+        """Predicts the code based on previous code.
+
+        The result is a stream (generator) of partial responses.
+
+        Args:
+            prefix: Code before the current point.
+            suffix: Code after the current point.
+            max_output_tokens: Max length of the output text in tokens. Range: [1, 1000].
+            temperature: Controls the randomness of predictions. Range: [0, 1].
+
+        Yields:
+            A stream of `TextGenerationResponse` objects that contain partial
+            responses produced by the model.
+        """
+        prediction_request = self._create_prediction_request(
+            prefix=prefix,
+            suffix=suffix,
+            max_output_tokens=max_output_tokens,
+            temperature=temperature,
+        )
+
+        prediction_service_client = self._endpoint._prediction_client
+        for prediction_dict in _streaming_prediction.predict_stream_of_dicts_from_single_dict(
+            prediction_service_client=prediction_service_client,
+            endpoint_name=self._endpoint_name,
+            instance=prediction_request.instance,
+            parameters=prediction_request.parameters,
+        ):
+            prediction_obj = aiplatform.models.Prediction(
+                predictions=[prediction_dict],
+                deployed_model_id="",
+            )
+            yield TextGenerationResponse(
+                text=prediction_dict["content"],
+                _prediction_response=prediction_obj,
+            )
+

 class _PreviewCodeGenerationModel(CodeGenerationModel, _TunableModelMixin):
     _LAUNCH_STAGE = _model_garden_models._SDK_PUBLIC_PREVIEW_LAUNCH_STAGE
0 commit comments
