Skip to content

Commit 91c2120

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: GenAI - Added audio_timestamp to GenerationConfig.
PiperOrigin-RevId: 689902378
1 parent 1f3b2d8 commit 91c2120

File tree

3 files changed

+43
-0
lines changed

3 files changed

+43
-0
lines changed

tests/system/vertexai/test_generative_models.py

+12
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,18 @@ def test_generate_content_from_text_and_remote_video(self, api_endpoint_env_name
344344
assert response.text
345345
assert "Zootopia" in response.text
346346

347+
def test_generate_content_from_text_and_remote_audio(self, api_endpoint_env_name):
348+
vision_model = generative_models.GenerativeModel(GEMINI_VISION_MODEL_NAME)
349+
audio_part = generative_models.Part.from_uri(
350+
uri="gs://cloud-samples-data/audio/speech_16k.wav",
351+
mime_type="audio/wav",
352+
)
353+
response = vision_model.generate_content(
354+
contents=["What is in the audio?", audio_part],
355+
generation_config=generative_models.GenerationConfig(audio_timestamp=True),
356+
)
357+
assert response.text
358+
347359
def test_grounding_google_search_retriever(self, api_endpoint_env_name):
348360
model = preview_generative_models.GenerativeModel(GEMINI_MODEL_NAME)
349361
google_search_retriever_tool = (

tests/unit/vertexai/test_generative_models.py

+27
Original file line numberDiff line numberDiff line change
@@ -761,6 +761,33 @@ def test_generate_content(self, generative_models: generative_models):
761761
)
762762
assert response4.text
763763

764+
model5 = generative_models.GenerativeModel("gemini-1.5-pro-002")
765+
response5 = model5.generate_content(
766+
contents=[
767+
generative_models.Part.from_uri(
768+
"gs://cloud-samples-data/generative-ai/audio/pixel.mp3",
769+
mime_type="audio/mpeg",
770+
),
771+
"What is the audio about?",
772+
],
773+
generation_config=generative_models.GenerationConfig(
774+
audio_timestamp=True,
775+
),
776+
safety_settings=[
777+
generative_models.SafetySetting(
778+
category=generative_models.SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
779+
threshold=generative_models.SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
780+
method=generative_models.SafetySetting.HarmBlockMethod.SEVERITY,
781+
),
782+
generative_models.SafetySetting(
783+
category=generative_models.SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
784+
threshold=generative_models.SafetySetting.HarmBlockThreshold.BLOCK_ONLY_HIGH,
785+
method=generative_models.SafetySetting.HarmBlockMethod.PROBABILITY,
786+
),
787+
],
788+
)
789+
assert response5.text
790+
764791
@mock.patch.object(
765792
target=prediction_service.PredictionServiceClient,
766793
attribute="generate_content",

vertexai/generative_models/_generative_models.py

+4
Original file line numberDiff line numberDiff line change
@@ -1683,6 +1683,7 @@ def __init__(
16831683
response_mime_type: Optional[str] = None,
16841684
response_schema: Optional[Dict[str, Any]] = None,
16851685
seed: Optional[int] = None,
1686+
audio_timestamp: Optional[bool] = None,
16861687
routing_config: Optional["RoutingConfig"] = None,
16871688
logprobs: Optional[int] = None,
16881689
response_logprobs: Optional[bool] = None,
@@ -1712,6 +1713,7 @@ def __init__(
17121713
The model needs to be prompted to output the appropriate
17131714
response type, otherwise the behavior is undefined.
17141715
response_schema: Output response schema of the generated candidate text.
1716+
audio_timestamp: If true, the timestamp of the audio will be included in the response.
17151717
routing_config: Model routing preference set in the request.
17161718
logprobs: Logit probabilities.
17171719
response_logprobs: If true, export the logprobs results in response.
@@ -1728,6 +1730,7 @@ def __init__(
17281730
max_output_tokens=100,
17291731
stop_sequences=["\n\n\n"],
17301732
seed=5,
1733+
audio_timestamp=True,
17311734
)
17321735
)
17331736
```
@@ -1750,6 +1753,7 @@ def __init__(
17501753
response_mime_type=response_mime_type,
17511754
response_schema=raw_schema,
17521755
seed=seed,
1756+
audio_timestamp=audio_timestamp,
17531757
logprobs=logprobs,
17541758
response_logprobs=response_logprobs,
17551759
)

0 commit comments

Comments
 (0)