1
- // Copyright 2018 Google Inc .
1
+ // Copyright 2018 Google LLC .
2
2
//
3
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
4
// you may not use this file except in compliance with the License.
11
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
// See the License for the specific language governing permissions and
13
13
// limitations under the License.
14
+ //
14
15
15
16
syntax = "proto3" ;
16
17
@@ -20,6 +21,7 @@ import "google/api/annotations.proto";
20
21
import "google/longrunning/operations.proto" ;
21
22
import "google/protobuf/any.proto" ;
22
23
import "google/protobuf/duration.proto" ;
24
+ import "google/protobuf/empty.proto" ;
23
25
import "google/protobuf/timestamp.proto" ;
24
26
import "google/rpc/status.proto" ;
25
27
@@ -54,7 +56,8 @@ service Speech {
54
56
55
57
// Performs bidirectional streaming speech recognition: receive results while
56
58
// sending audio. This method is only available via the gRPC API (not REST).
57
- rpc StreamingRecognize (stream StreamingRecognizeRequest ) returns (stream StreamingRecognizeResponse );
59
+ rpc StreamingRecognize (stream StreamingRecognizeRequest ) returns (stream StreamingRecognizeResponse ) {
60
+ }
58
61
}
59
62
60
63
// The top-level message sent by the client for the `Recognize` method.
@@ -98,7 +101,7 @@ message StreamingRecognizeRequest {
98
101
// `audio_content` data. The audio bytes must be encoded as specified in
99
102
// `RecognitionConfig`. Note: as with all bytes fields, protocol buffers use a
100
103
// pure binary representation (not base64). See
101
- // [audio limits](https://cloud.google.com/ speech/limits #content).
104
+ // [content limits](/speech-to-text/quotas#content).
102
105
bytes audio_content = 2 ;
103
106
}
104
107
}
@@ -218,36 +221,36 @@ message RecognitionConfig {
218
221
// Valid values for OGG_OPUS are '1'-'254'.
219
222
// Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`.
220
223
// If `0` or omitted, defaults to one channel (mono).
221
- // NOTE : We only recognize the first channel by default.
224
+ // Note: We only recognize the first channel by default.
222
225
// To perform independent recognition on each channel set
223
- // enable_separate_recognition_per_channel to 'true'.
226
+ // `enable_separate_recognition_per_channel` to 'true'.
224
227
int32 audio_channel_count = 7 ;
225
228
226
- // This needs to be set to ‘true’ explicitly and audio_channel_count > 1
229
+ // This needs to be set to `true` explicitly and `audio_channel_count` > 1
227
230
// to get each channel recognized separately. The recognition result will
228
- // contain a channel_tag field to state which channel that result belongs to.
229
- // If this is not ‘ true’ , we will only recognize the first channel.
230
- // NOTE: The request is also billed cumulatively for all channels recognized:
231
- // (audio_channel_count times the audio length)
231
+ // contain a `channel_tag` field to state which channel that result belongs
232
+ // to. If this is not true, we will only recognize the first channel. The
233
+ // request is billed cumulatively for all channels recognized:
234
+ // `audio_channel_count` multiplied by the length of the audio.
232
235
bool enable_separate_recognition_per_channel = 12 ;
233
236
234
237
// *Required* The language of the supplied audio as a
235
238
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
236
239
// Example: "en-US".
237
- // See [Language Support](https://cloud.google.com/ speech/docs/languages)
240
+ // See [Language Support](/speech-to-text/docs/languages)
238
241
// for a list of the currently supported language codes.
239
242
string language_code = 3 ;
240
243
241
244
// *Optional* A list of up to 3 additional
242
245
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags,
243
246
// listing possible alternative languages of the supplied audio.
244
- // See [Language Support](https://cloud.google.com/ speech/docs/languages)
247
+ // See [Language Support](/speech-to-text/docs/languages)
245
248
// for a list of the currently supported language codes.
246
249
// If alternative languages are listed, recognition result will contain
247
250
// recognition in the most likely language detected including the main
248
251
// language_code. The recognition result will include the language tag
249
252
// of the language detected in the audio.
250
- // NOTE : This feature is only supported for Voice Command and Voice Search
253
+ // Note: This feature is only supported for Voice Command and Voice Search
251
254
// use cases and performance may vary for other use cases (e.g., phone call
252
255
// transcription).
253
256
repeated string alternative_language_codes = 18 ;
@@ -266,7 +269,9 @@ message RecognitionConfig {
266
269
// won't be filtered out.
267
270
bool profanity_filter = 5 ;
268
271
269
- // *Optional* A means to provide context to assist the speech recognition.
272
+ // *Optional* array of [SpeechContext][google.cloud.speech.v1p1beta1.SpeechContext].
273
+ // A means to provide context to assist the speech recognition. For more
274
+ // information, see [Phrase Hints](/speech-to-text/docs/basics#phrase-hints).
270
275
repeated SpeechContext speech_contexts = 6 ;
271
276
272
277
// *Optional* If `true`, the top result includes a list of words and
@@ -284,18 +289,20 @@ message RecognitionConfig {
284
289
// This feature is only available in select languages. Setting this for
285
290
// requests in other languages has no effect at all.
286
291
// The default 'false' value does not add punctuation to result hypotheses.
287
- // NOTE: " This is currently offered as an experimental service, complimentary
292
+ // Note: This is currently offered as an experimental service, complimentary
288
293
// to all users. In the future this may be exclusively available as a
289
- // premium feature."
294
+ // premium feature.
290
295
bool enable_automatic_punctuation = 11 ;
291
296
292
297
// *Optional* If 'true', enables speaker detection for each recognized word in
293
298
// the top alternative of the recognition result using a speaker_tag provided
294
299
// in the WordInfo.
295
300
// Note: When this is true, we send all the words from the beginning of the
296
- // audio for the top alternative in every consecutive responses.
301
+ // audio for the top alternative in every consecutive STREAMING response.
297
302
// This is done in order to improve our speaker tags as our models learn to
298
303
// identify the speakers in the conversation over time.
304
+ // For non-streaming requests, the diarization results will be provided only
305
+ // in the top alternative of the FINAL SpeechRecognitionResult.
299
306
bool enable_speaker_diarization = 16 ;
300
307
301
308
// *Optional*
@@ -342,14 +349,18 @@ message RecognitionConfig {
342
349
string model = 13 ;
343
350
344
351
// *Optional* Set to true to use an enhanced model for speech recognition.
345
- // You must also set the `model` field to a valid, enhanced model. If
346
- // `use_enhanced` is set to true and the `model` field is not set, then
347
- // `use_enhanced` is ignored. If `use_enhanced` is true and an enhanced
348
- // version of the specified model does not exist, then the speech is
349
- // recognized using the standard version of the specified model.
352
+ // If `use_enhanced` is set to true and the `model` field is not set, then
353
+ // an appropriate enhanced model is chosen if:
354
+ // 1. project is eligible for requesting enhanced models
355
+ // 2. an enhanced model exists for the audio
356
+ //
357
+ // If `use_enhanced` is true and an enhanced version of the specified model
358
+ // does not exist, then the speech is recognized using the standard version
359
+ // of the specified model.
350
360
//
351
- // Enhanced speech models require that you opt-in to the audio logging using
352
- // instructions in the [alpha documentation](/speech/data-sharing). If you set
361
+ // Enhanced speech models require that you opt in to data logging using
362
+ // instructions in the
363
+ // [documentation](/speech-to-text/docs/enable-data-logging). If you set
353
364
// `use_enhanced` to true and you have not enabled audio logging, then you
354
365
// will receive an error.
355
366
bool use_enhanced = 14 ;
@@ -494,14 +505,14 @@ message SpeechContext {
494
505
// to improve the accuracy for specific words and phrases, for example, if
495
506
// specific commands are typically spoken by the user. This can also be used
496
507
// to add additional words to the vocabulary of the recognizer. See
497
- // [usage limits](https://cloud.google.com/ speech/limits #content).
508
+ // [usage limits](/speech-to-text/quotas#content).
498
509
repeated string phrases = 1 ;
499
510
}
500
511
501
512
// Contains audio data in the encoding specified in the `RecognitionConfig`.
502
513
// Either `content` or `uri` must be supplied. Supplying both or neither
503
514
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
504
- // [audio limits](https://cloud.google.com/ speech/limits #content).
515
+ // [content limits](/speech-to-text/quotas#content).
505
516
message RecognitionAudio {
506
517
// The audio source, which is either inline content or a Google Cloud
507
518
// Storage uri.
@@ -512,7 +523,8 @@ message RecognitionAudio {
512
523
bytes content = 1 ;
513
524
514
525
// URI that points to a file that contains audio data bytes as specified in
515
- // `RecognitionConfig`. Currently, only Google Cloud Storage URIs are
526
+ // `RecognitionConfig`. The file must not be compressed (for example, gzip).
527
+ // Currently, only Google Cloud Storage URIs are
516
528
// supported, which must be specified in the following format:
517
529
// `gs://bucket_name/object_name` (other URI formats return
518
530
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
@@ -658,6 +670,10 @@ message StreamingRecognitionResult {
658
670
// The default of 0.0 is a sentinel value indicating `stability` was not set.
659
671
float stability = 3 ;
660
672
673
+ // Output only. Time offset of the end of this result relative to the
674
+ // beginning of the audio.
675
+ google.protobuf.Duration result_end_time = 4 ;
676
+
661
677
// For multi-channel audio, this is the channel number corresponding to the
662
678
// recognized result for the audio from that channel.
663
679
// For audio_channel_count = N, its output values can range from '1' to 'N'.
@@ -705,7 +721,7 @@ message SpeechRecognitionAlternative {
705
721
float confidence = 2 ;
706
722
707
723
// Output only. A list of word-specific information for each recognized word.
708
- // Note: When enable_speaker_diarization is true, you will see all the words
724
+ // Note: When `enable_speaker_diarization` is true, you will see all the words
709
725
// from the beginning of the audio.
710
726
repeated WordInfo words = 3 ;
711
727
}
@@ -746,5 +762,4 @@ message WordInfo {
746
762
// speaker_tag is set if enable_speaker_diarization = 'true' and only in the
747
763
// top alternative.
748
764
int32 speaker_tag = 5 ;
749
-
750
765
}
0 commit comments