@@ -19,9 +19,7 @@ package google.cloud.speech.v1;
 
 import "google/api/annotations.proto";
 import "google/longrunning/operations.proto";
-import "google/protobuf/any.proto";
 import "google/protobuf/duration.proto";
-import "google/protobuf/empty.proto";
 import "google/protobuf/timestamp.proto";
 import "google/rpc/status.proto";
 
@@ -278,6 +276,9 @@ message RecognitionConfig {
   // premium feature.
   bool enable_automatic_punctuation = 11;
 
+  // *Optional* Metadata regarding this request.
+  RecognitionMetadata metadata = 9;
+
   // *Optional* Which model to select for the given request. Select the model
   // best suited to your domain to get best results. If a model is not
   // explicitly specified, then we auto-select a model based on the parameters
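For orientation, a minimal sketch of wiring the new `metadata` field (field 9) into a request. This assumes the generated `google-cloud-speech` Python client mirrors these proto messages under their usual names; the `RecognitionMetadata` message itself is defined in the next hunk.

    from google.cloud import speech_v1 as speech  # assumed package name

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        # New in this change: request-level metadata (field 9).
        metadata=speech.RecognitionMetadata(
            interaction_type=speech.RecognitionMetadata.InteractionType.PHONE_CALL,
        ),
    )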
@@ -330,6 +331,133 @@ message RecognitionConfig {
   bool use_enhanced = 14;
 }
 
+// Description of audio data to be recognized.
+message RecognitionMetadata {
+  // Use case categories that the audio recognition request can be described
+  // by.
+  enum InteractionType {
+    // Use case is either unknown or is something other than one of the other
+    // values below.
+    INTERACTION_TYPE_UNSPECIFIED = 0;
+
+    // Multiple people in a conversation or discussion. For example, in a
+    // meeting with two or more people actively participating. Typically
+    // all the primary people speaking would be in the same room (if not,
+    // see PHONE_CALL).
+    DISCUSSION = 1;
+
+    // One or more persons lecturing or presenting to others, mostly
+    // uninterrupted.
+    PRESENTATION = 2;
+
+    // A phone call or video conference in which two or more people, who are
+    // not in the same room, are actively participating.
+    PHONE_CALL = 3;
+
+    // A recorded message intended for another person to listen to.
+    VOICEMAIL = 4;
+
+    // Professionally produced audio (e.g. a TV show or podcast).
+    PROFESSIONALLY_PRODUCED = 5;
+
+    // Transcribe spoken questions and queries into text.
+    VOICE_SEARCH = 6;
+
+    // Transcribe voice commands, such as for controlling a device.
+    VOICE_COMMAND = 7;
+
+    // Transcribe speech to text to create a written document, such as a
+    // text message, email, or report.
+    DICTATION = 8;
+  }
+
+  // The use case most closely describing the audio content to be recognized.
+  InteractionType interaction_type = 1;
+
+  // The industry vertical to which this speech recognition request most
+  // closely applies. This is most indicative of the topics contained
+  // in the audio. Use the 6-digit NAICS code to identify the industry
+  // vertical - see https://www.naics.com/search/.
+  uint32 industry_naics_code_of_audio = 3;
+
+  // Enumerates the types of capture settings describing an audio file.
+  enum MicrophoneDistance {
+    // Audio type is not known.
+    MICROPHONE_DISTANCE_UNSPECIFIED = 0;
+
+    // The audio was captured from a closely placed microphone, e.g. a phone,
+    // dictaphone, or handheld microphone. Generally, the speaker is within
+    // 1 meter of the microphone.
+    NEARFIELD = 1;
+
+    // The speaker is within 3 meters of the microphone.
+    MIDFIELD = 2;
+
+    // The speaker is more than 3 meters away from the microphone.
+    FARFIELD = 3;
+  }
+
+  // The audio type that most closely describes the audio being recognized.
+  MicrophoneDistance microphone_distance = 4;
+
+  // The original media the speech was recorded on.
+  enum OriginalMediaType {
+    // Unknown original media type.
+    ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0;
+
+    // The speech data is an audio recording.
+    AUDIO = 1;
+
+    // The speech data was originally recorded on a video.
+    VIDEO = 2;
+  }
+
+  // The original media the speech was recorded on.
+  OriginalMediaType original_media_type = 5;
+
+  // The type of device the speech was recorded with.
+  enum RecordingDeviceType {
+    // The recording device is unknown.
+    RECORDING_DEVICE_TYPE_UNSPECIFIED = 0;
+
+    // Speech was recorded on a smartphone.
+    SMARTPHONE = 1;
+
+    // Speech was recorded using a personal computer or tablet.
+    PC = 2;
+
+    // Speech was recorded over a phone line.
+    PHONE_LINE = 3;
+
+    // Speech was recorded in a vehicle.
+    VEHICLE = 4;
+
+    // Speech was recorded outdoors.
+    OTHER_OUTDOOR_DEVICE = 5;
+
+    // Speech was recorded indoors.
+    OTHER_INDOOR_DEVICE = 6;
+  }
+
+  // The type of device the speech was recorded with.
+  RecordingDeviceType recording_device_type = 6;
+
+  // The device used to make the recording. Examples: 'Nexus 5X',
+  // 'Polycom SoundStation IP 6000', 'POTS', 'VoIP', or
+  // 'Cardioid Microphone'.
+  string recording_device_name = 7;
+
+  // Mime type of the original audio file. For example `audio/m4a`,
+  // `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
+  // A list of possible audio mime types is maintained at
+  // http://www.iana.org/assignments/media-types/media-types.xhtml#audio
+  string original_mime_type = 8;
+
+  // Description of the content. For example, "Recordings of federal supreme
+  // court hearings from 2012".
+  string audio_topic = 10;
+}
+
 // Provides "hints" to the speech recognizer to favor specific words and phrases
 // in the results.
 message SpeechContext {
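To make the new message concrete, here is a hedged sketch that fills in each RecognitionMetadata field using the enums defined above. It again assumes the generated Python client surface; every value shown is illustrative, not a recommendation.

    from google.cloud import speech_v1 as speech  # assumed package name

    metadata = speech.RecognitionMetadata(
        interaction_type=speech.RecognitionMetadata.InteractionType.PHONE_CALL,
        industry_naics_code_of_audio=518210,  # illustrative 6-digit NAICS code
        microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
        original_media_type=speech.RecognitionMetadata.OriginalMediaType.AUDIO,
        recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PHONE_LINE,
        recording_device_name="POTS",
        original_mime_type="audio/x-alaw-basic",
        audio_topic="Customer support calls",  # free-form description
    )

Note that the field numbers in this revision skip 2 and 9; only the fields actually present in the diff are shown above.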
@@ -504,10 +632,20 @@ message StreamingRecognitionResult {
   // The default of 0.0 is a sentinel value indicating `stability` was not set.
   float stability = 3;
 
+  // Output only. Time offset of the end of this result relative to the
+  // beginning of the audio.
+  google.protobuf.Duration result_end_time = 4;
+
   // For multi-channel audio, this is the channel number corresponding to the
   // recognized result for the audio from that channel.
   // For audio_channel_count = N, its output values can range from '1' to 'N'.
   int32 channel_tag = 5;
+
+  // Output only. The
+  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
+  // language in this result. This is the language code detected as the most
+  // likely language spoken in the audio.
+  string language_code = 6;
 }
 
 // A speech recognition result corresponding to a portion of the audio.
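Both additions are output-only, so clients consume rather than set them. A sketch of reading them from a streaming response, assuming `responses` is the iterator returned by the Python client's `streaming_recognize` call:

    for response in responses:
        for result in response.results:
            if not result.is_final:
                continue
            # New output-only fields: end-of-result offset (a
            # google.protobuf.Duration) and the detected BCP-47 language tag.
            print(result.result_end_time, result.language_code,
                  result.channel_tag, result.alternatives[0].transcript)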