Skip to content

Commit 641d812

Browse files
yoshi-automation and JustinBeckwith
authored and committed
feat: add recognition metadata (#352)
1 parent 60c7ceb commit 641d812

File tree

3 files changed

+373
-5
lines changed

3 files changed

+373
-5
lines changed

packages/google-cloud-node/protos/google/cloud/speech/v1/cloud_speech.proto

+140-2
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,7 @@ package google.cloud.speech.v1;
1919

2020
import "google/api/annotations.proto";
2121
import "google/longrunning/operations.proto";
22-
import "google/protobuf/any.proto";
2322
import "google/protobuf/duration.proto";
24-
import "google/protobuf/empty.proto";
2523
import "google/protobuf/timestamp.proto";
2624
import "google/rpc/status.proto";
2725

@@ -278,6 +276,9 @@ message RecognitionConfig {
278276
// premium feature.
279277
bool enable_automatic_punctuation = 11;
280278

279+
// *Optional* Metadata regarding this request.
280+
RecognitionMetadata metadata = 9;
281+
281282
// *Optional* Which model to select for the given request. Select the model
282283
// best suited to your domain to get best results. If a model is not
283284
// explicitly specified, then we auto-select a model based on the parameters
@@ -330,6 +331,133 @@ message RecognitionConfig {
330331
bool use_enhanced = 14;
331332
}
332333

334+
// Description of audio data to be recognized.
// NOTE(review): field numbers 2 and 9 are unused in this message — confirm
// upstream whether they should be marked `reserved` to prevent accidental reuse.
message RecognitionMetadata {
  // Use case categories that the audio recognition request can be described
  // by.
  enum InteractionType {
    // Use case is either unknown or is something other than one of the other
    // values below.
    INTERACTION_TYPE_UNSPECIFIED = 0;

    // Multiple people in a conversation or discussion. For example in a
    // meeting with two or more people actively participating. Typically
    // all the primary people speaking would be in the same room (if not,
    // see PHONE_CALL)
    DISCUSSION = 1;

    // One or more persons lecturing or presenting to others, mostly
    // uninterrupted.
    PRESENTATION = 2;

    // A phone-call or video-conference in which two or more people, who are
    // not in the same room, are actively participating.
    PHONE_CALL = 3;

    // A recorded message intended for another person to listen to.
    VOICEMAIL = 4;

    // Professionally produced audio (eg. TV Show, Podcast).
    PROFESSIONALLY_PRODUCED = 5;

    // Transcribe spoken questions and queries into text.
    VOICE_SEARCH = 6;

    // Transcribe voice commands, such as for controlling a device.
    VOICE_COMMAND = 7;

    // Transcribe speech to text to create a written document, such as a
    // text-message, email or report.
    DICTATION = 8;
  }

  // The use case most closely describing the audio content to be recognized.
  InteractionType interaction_type = 1;

  // The industry vertical to which this speech recognition request most
  // closely applies. This is most indicative of the topics contained
  // in the audio. Use the 6-digit NAICS code to identify the industry
  // vertical - see https://www.naics.com/search/.
  uint32 industry_naics_code_of_audio = 3;

  // Enumerates the types of capture settings describing an audio file.
  enum MicrophoneDistance {
    // Audio type is not known.
    MICROPHONE_DISTANCE_UNSPECIFIED = 0;

    // The audio was captured from a closely placed microphone. Eg. phone,
    // dictaphone, or handheld microphone. Generally the speaker is within
    // 1 meter of the microphone.
    NEARFIELD = 1;

    // The speaker is within 3 meters of the microphone.
    MIDFIELD = 2;

    // The speaker is more than 3 meters away from the microphone.
    FARFIELD = 3;
  }

  // The audio type that most closely describes the audio being recognized.
  MicrophoneDistance microphone_distance = 4;

  // The original media the speech was recorded on.
  enum OriginalMediaType {
    // Unknown original media type.
    ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0;

    // The speech data is an audio recording.
    AUDIO = 1;

    // The speech data originally recorded on a video.
    VIDEO = 2;
  }

  // The original media the speech was recorded on.
  OriginalMediaType original_media_type = 5;

  // The type of device the speech was recorded with.
  enum RecordingDeviceType {
    // The recording device is unknown.
    RECORDING_DEVICE_TYPE_UNSPECIFIED = 0;

    // Speech was recorded on a smartphone.
    SMARTPHONE = 1;

    // Speech was recorded using a personal computer or tablet.
    PC = 2;

    // Speech was recorded over a phone line.
    PHONE_LINE = 3;

    // Speech was recorded in a vehicle.
    VEHICLE = 4;

    // Speech was recorded outdoors.
    OTHER_OUTDOOR_DEVICE = 5;

    // Speech was recorded indoors.
    OTHER_INDOOR_DEVICE = 6;
  }

  // The type of device the speech was recorded with.
  RecordingDeviceType recording_device_type = 6;

  // The device used to make the recording. Examples 'Nexus 5X' or
  // 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or
  // 'Cardioid Microphone'.
  string recording_device_name = 7;

  // Mime type of the original audio file. For example `audio/m4a`,
  // `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
  // A list of possible audio mime types is maintained at
  // http://www.iana.org/assignments/media-types/media-types.xhtml#audio
  string original_mime_type = 8;

  // Description of the content. Eg. "Recordings of federal supreme court
  // hearings from 2012".
  string audio_topic = 10;
}
460+
333461
// Provides "hints" to the speech recognizer to favor specific words and phrases
334462
// in the results.
335463
message SpeechContext {
@@ -504,10 +632,20 @@ message StreamingRecognitionResult {
504632
// The default of 0.0 is a sentinel value indicating `stability` was not set.
505633
float stability = 3;
506634

635+
// Output only. Time offset of the end of this result relative to the
636+
// beginning of the audio.
637+
google.protobuf.Duration result_end_time = 4;
638+
507639
// For multi-channel audio, this is the channel number corresponding to the
508640
// recognized result for the audio from that channel.
509641
// For audio_channel_count = N, its output values can range from '1' to 'N'.
510642
int32 channel_tag = 5;
643+
644+
// Output only. The
645+
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
646+
// language in this result. This language code was detected to have the most
647+
// likelihood of being spoken in the audio.
648+
string language_code = 6;
511649
}
512650

513651
// A speech recognition result corresponding to a portion of the audio.

0 commit comments

Comments (0)