@@ -19,6 +19,7 @@ package google.cloud.speech.v1;
import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
+ import "google/cloud/speech/v1/resource.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/any.proto";
import "google/protobuf/duration.proto";
@@ -181,7 +182,8 @@ message RecognitionConfig {
// a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
// recognition can be reduced if lossy codecs are used to capture or transmit
// audio, particularly if background noise is present. Lossy codecs include
- // `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, `MP3`.
+ // `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, `MP3`,
+ // and `WEBM_OPUS`.
//
// The `FLAC` and `WAV` audio file formats include a header that describes the
// included audio content. You can request recognition for `WAV` files that
@@ -236,6 +238,11 @@ message RecognitionConfig {
// is replaced with a single byte containing the block length. Only Speex
// wideband is supported. `sample_rate_hertz` must be 16000.
SPEEX_WITH_HEADER_BYTE = 7;
+
+ // Opus encoded audio frames in WebM container
+ // ([OggOpus](https://wiki.xiph.org/OggOpus)). `sample_rate_hertz` must be
+ // one of 8000, 12000, 16000, 24000, or 48000.
+ WEBM_OPUS = 9;
}

// Encoding of audio data sent in all `RecognitionAudio` messages.
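A minimal sketch of how the new `WEBM_OPUS` encoding might be used from the Python client. The package call pattern is the standard `google-cloud-speech` recognize flow; the file name `audio.webm` and the 48000 Hz rate are illustrative assumptions, not part of this change.

# Sketch: recognizing Opus-in-WebM audio with the new encoding value.
from google.cloud import speech

client = speech.SpeechClient()

with open("audio.webm", "rb") as f:  # illustrative file name
    content = f.read()

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
    sample_rate_hertz=48000,  # must be 8000, 12000, 16000, 24000, or 48000
    language_code="en-US",
)
audio = speech.RecognitionAudio(content=content)

response = client.recognize(config=config, audio=audio)
for result in response.results:
    print(result.alternatives[0].transcript)
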
@@ -279,6 +286,20 @@ message RecognitionConfig {
// of the currently supported language codes.
string language_code = 3 [(google.api.field_behavior) = REQUIRED];

+ // A list of up to 3 additional
+ // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags,
+ // listing possible alternative languages of the supplied audio.
+ // See [Language
+ // Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
+ // of the currently supported language codes. If alternative languages are
+ // listed, the recognition result will contain recognition in the most likely
+ // language detected, including the main language_code. The recognition result
+ // will include the language tag of the language detected in the audio. Note:
+ // This feature is only supported for Voice Command and Voice Search use cases,
+ // and performance may vary for other use cases (e.g., phone call
+ // transcription).
+ repeated string alternative_language_codes = 18;
+
// Maximum number of recognition hypotheses to be returned.
// Specifically, the maximum number of `SpeechRecognitionAlternative` messages
// within each `SpeechRecognitionResult`.
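A sketch of the `alternative_language_codes` field from the Python client. The GCS URI is an illustrative placeholder; `result.language_code` is the output-only detected-language field added further down in this change.

# Sketch: language detection across a main language and two alternatives.
from google.cloud import speech

client = speech.SpeechClient()
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",                          # main language
    alternative_language_codes=["es-ES", "fr-FR"],  # up to 3 alternatives
)
audio = speech.RecognitionAudio(uri="gs://my-bucket/command.wav")  # illustrative

response = client.recognize(config=config, audio=audio)
for result in response.results:
    # The result carries the BCP-47 tag of the language actually detected.
    print(result.language_code, result.alternatives[0].transcript)
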
@@ -293,6 +314,13 @@ message RecognitionConfig {
// won't be filtered out.
bool profanity_filter = 5;

+ // Speech adaptation configuration improves the accuracy of speech
+ // recognition. For more information, see the [speech
+ // adaptation](https://cloud.google.com/speech-to-text/docs/adaptation)
+ // documentation.
+ // When speech adaptation is set, it supersedes the `speech_contexts` field.
+ SpeechAdaptation adaptation = 20;
+
// Array of [SpeechContext][google.cloud.speech.v1.SpeechContext].
// A means to provide context to assist the speech recognition. For more
// information, see
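A sketch of the new `adaptation` field from the Python client, assuming the `SpeechAdaptation` and `PhraseSet` types from the newly imported `resource.proto` are exposed as shown; the phrase and boost values are illustrative.

# Sketch: speech adaptation with an inline phrase set. When `adaptation`
# is set, it supersedes the older `speech_contexts` field.
from google.cloud import speech

adaptation = speech.SpeechAdaptation(
    phrase_sets=[
        speech.PhraseSet(
            phrases=[speech.PhraseSet.Phrase(value="Yosemite", boost=10.0)],
        )
    ]
)
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
    adaptation=adaptation,
)
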
@@ -306,12 +334,33 @@ message RecognitionConfig {
// `false`.
bool enable_word_time_offsets = 8;

+ // If `true`, the top result includes a list of words and the
+ // confidence for those words. If `false`, no word-level confidence
+ // information is returned. The default is `false`.
+ bool enable_word_confidence = 15;
+
// If 'true', adds punctuation to recognition result hypotheses.
// This feature is only available in select languages. Setting this for
// requests in other languages has no effect at all.
// The default 'false' value does not add punctuation to result hypotheses.
bool enable_automatic_punctuation = 11;

+ // The spoken punctuation behavior for the call.
+ // If not set, uses default behavior based on the model of choice;
+ // e.g. command_and_search will enable spoken punctuation by default.
+ // If 'true', replaces spoken punctuation with the corresponding symbols in
+ // the request. For example, "how are you question mark" becomes "how are
+ // you?". See https://cloud.google.com/speech-to-text/docs/spoken-punctuation
+ // for support. If 'false', spoken punctuation is not replaced.
+ google.protobuf.BoolValue enable_spoken_punctuation = 22;
+
+ // The spoken emoji behavior for the call.
+ // If not set, uses default behavior based on the model of choice.
+ // If 'true', adds spoken emoji formatting for the request. This will replace
+ // spoken emojis with the corresponding Unicode symbols in the final
+ // transcript. If 'false', spoken emojis are not replaced.
+ google.protobuf.BoolValue enable_spoken_emojis = 23;
+
// Config to enable speaker diarization and set additional
// parameters to make diarization better suited for your application.
// Note: When this is enabled, we send all the words from the beginning of the
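A sketch combining the new word-confidence and spoken-formatting flags from the Python client. The `BoolValue` wrapper fields distinguish "unset" (model default) from an explicit true/false; the proto-plus based Python client is assumed to accept plain booleans for them.

# Sketch: word-level confidence plus spoken punctuation/emoji formatting.
from google.cloud import speech

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
    enable_word_confidence=True,
    enable_spoken_punctuation=True,  # "question mark" -> "?"
    enable_spoken_emojis=True,       # "smiley face" -> the Unicode emoji
)
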
@@ -537,6 +586,16 @@ message SpeechContext {
// improves the likelihood of correctly transcribing audio that includes
// months.
repeated string phrases = 1;
+
+ // Hint Boost. Positive value will increase the probability that a specific
+ // phrase will be recognized over other similar sounding phrases. The higher
+ // the boost, the higher the chance of false positive recognition as well.
+ // Negative boost values would correspond to anti-biasing. Anti-biasing is not
+ // enabled, so negative boost will simply be ignored. Though `boost` can
+ // accept a wide range of positive values, most use cases are best served with
+ // values between 0 and 20. We recommend using a binary search approach to
+ // finding the optimal value for your use case.
+ float boost = 4;
}

// Contains audio data in the encoding specified in the `RecognitionConfig`.
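A sketch of phrase biasing with the new `boost` field from the Python client; the phrases and the boost value of 10.0 are illustrative starting points within the recommended 0-20 range.

# Sketch: biasing recognition toward specific phrases with a boost value.
from google.cloud import speech

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
    speech_contexts=[
        # Most use cases do best between 0 and 20; tune by binary search.
        speech.SpeechContext(phrases=["February", "January"], boost=10.0),
    ],
)
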
@@ -587,6 +646,12 @@ message LongRunningRecognizeResponse {

// When available, billed audio seconds for the corresponding request.
google.protobuf.Duration total_billed_time = 3;
+
+ // Original output config if present in the request.
+ TranscriptOutputConfig output_config = 6;
+
+ // If the transcript output fails, this field contains the relevant error.
+ google.rpc.Status output_error = 7;
}

// Describes the progress of a long-running `LongRunningRecognize` call. It is
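A sketch of a long-running request that sets an output destination and then inspects the new response fields. It assumes `TranscriptOutputConfig` exposes a `gcs_uri` destination and that `output_config` is settable on `LongRunningRecognizeRequest`; the bucket paths are placeholders.

# Sketch: long-running recognition, then checking billed time and any
# transcript-output error surfaced by the new fields.
from google.cloud import speech

client = speech.SpeechClient()
request = speech.LongRunningRecognizeRequest(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        language_code="en-US",
    ),
    audio=speech.RecognitionAudio(uri="gs://my-bucket/long-recording.flac"),
    output_config=speech.TranscriptOutputConfig(
        gcs_uri="gs://my-bucket/transcript.json",  # assumed destination field
    ),
)

operation = client.long_running_recognize(request=request)
response = operation.result(timeout=600)

print("billed:", response.total_billed_time)
if response.output_error.code:  # non-zero google.rpc.Code means the export failed
    print("transcript output failed:", response.output_error.message)
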
@@ -723,11 +788,10 @@ message StreamingRecognitionResult {
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 5;

- // The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of
- // the language in this result. This language code was detected to have the
- // most likelihood of being spoken in the audio.
- string language_code = 6
-     [(google.api.field_behavior) = OUTPUT_ONLY];
+ // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
+ // of the language in this result. This language code was detected to have
+ // the most likelihood of being spoken in the audio.
+ string language_code = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// A speech recognition result corresponding to a portion of the audio.
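A sketch of reading the output-only `language_code` from streaming results. It follows the `StreamingRecognizeRequest` contract (config first, then audio chunks) via the Python client's `streaming_recognize(config=..., requests=...)` helper; the audio chunks are placeholders.

# Sketch: streaming recognition with the detected-language tag on each
# final StreamingRecognitionResult.
from google.cloud import speech

client = speech.SpeechClient()
streaming_config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    ),
)

audio_chunks = [b"..."]  # placeholder; real code streams raw PCM chunks
requests = (
    speech.StreamingRecognizeRequest(audio_content=chunk) for chunk in audio_chunks
)
for response in client.streaming_recognize(config=streaming_config, requests=requests):
    for result in response.results:
        if result.is_final:
            print(result.language_code, result.alternatives[0].transcript)
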
@@ -742,6 +806,15 @@ message SpeechRecognitionResult {
// recognized result for the audio from that channel.
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 2;
+
+ // Time offset of the end of this result relative to the
+ // beginning of the audio.
+ google.protobuf.Duration result_end_time = 4;
+
+ // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
+ // of the language in this result. This language code was detected to have
+ // the most likelihood of being spoken in the audio.
+ string language_code = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Alternative hypotheses (a.k.a. n-best list).
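A short sketch of the two new per-result fields, assuming `response` is a `RecognizeResponse` obtained as in the earlier sketches.

# Sketch: reading the new SpeechRecognitionResult fields.
for result in response.results:
    print(result.result_end_time)  # offset of result end from audio start
    print(result.language_code)    # output-only detected language tag
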
@@ -785,6 +858,15 @@ message WordInfo {
// The word corresponding to this set of information.
string word = 3;

+ // The confidence estimate between 0.0 and 1.0. A higher number
+ // indicates an estimated greater likelihood that the recognized words are
+ // correct. This field is set only for the top alternative of a non-streaming
+ // result or of a streaming result where `is_final=true`.
+ // This field is not guaranteed to be accurate and users should not rely on it
+ // to always be provided.
+ // The default of 0.0 is a sentinel value indicating `confidence` was not set.
+ float confidence = 4;
+
// Output only. A distinct integer value is assigned for every speaker within
// the audio. This field specifies which one of those speakers was detected to
// have spoken this word. Value ranges from '1' to diarization_speaker_count.
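A sketch of consuming the new word-level `confidence`, again assuming a `response` from a request with `enable_word_confidence=True`; the `speaker_tag` field name comes from the surrounding `WordInfo` message in the v1 proto.

# Sketch: word-level confidence (0.0 means the field was simply not set).
for result in response.results:
    top = result.alternatives[0]  # confidence is set only for the top alternative
    for word in top.words:
        print(f"{word.word}\t{word.confidence:.2f}\tspeaker={word.speaker_tag}")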