
Commit f32f412

feat: add alternative_language_codes to RecognitionConfig (#824)
- [ ] Regenerate this pull request now.

PiperOrigin-RevId: 413453425

Source-Link: googleapis/googleapis@2b47b24
Source-Link: googleapis/googleapis-gen@7ffe6e0
Copy-Tag: eyJwIjoiLmdpdGh1Yi8uT3dsQm90LnlhbWwiLCJoIjoiN2ZmZTZlMGExYmY2M2Q4NTQwMDA5Y2U2OTg2NjBlYmI3MWM1NGZmMSJ9

feat: add WEBM_OPUS codec
feat: add SpeechAdaptation configuration
feat: add word confidence
feat: add spoken punctuation and spoken emojis
feat: add hint boost in SpeechContext
1 parent a5e2021 commit f32f412

8 files changed (+3086 −417 lines)

packages/google-cloud-speech/protos/google/cloud/speech/v1/cloud_speech.proto

+88 −6
@@ -19,6 +19,7 @@ package google.cloud.speech.v1;
 import "google/api/annotations.proto";
 import "google/api/client.proto";
 import "google/api/field_behavior.proto";
+import "google/cloud/speech/v1/resource.proto";
 import "google/longrunning/operations.proto";
 import "google/protobuf/any.proto";
 import "google/protobuf/duration.proto";
@@ -181,7 +182,8 @@ message RecognitionConfig {
   // a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
   // recognition can be reduced if lossy codecs are used to capture or transmit
   // audio, particularly if background noise is present. Lossy codecs include
-  // `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, `MP3`.
+  // `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, `MP3`,
+  // and `WEBM_OPUS`.
   //
   // The `FLAC` and `WAV` audio file formats include a header that describes the
   // included audio content. You can request recognition for `WAV` files that
@@ -236,6 +238,11 @@ message RecognitionConfig {
     // is replaced with a single byte containing the block length. Only Speex
     // wideband is supported. `sample_rate_hertz` must be 16000.
     SPEEX_WITH_HEADER_BYTE = 7;
+
+    // Opus encoded audio frames in WebM container
+    // ([OggOpus](https://wiki.xiph.org/OggOpus)). `sample_rate_hertz` must be
+    // one of 8000, 12000, 16000, 24000, or 48000.
+    WEBM_OPUS = 9;
   }

   // Encoding of audio data sent in all `RecognitionAudio` messages.
@@ -279,6 +286,20 @@ message RecognitionConfig {
   // of the currently supported language codes.
   string language_code = 3 [(google.api.field_behavior) = REQUIRED];

+  // A list of up to 3 additional
+  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags,
+  // listing possible alternative languages of the supplied audio.
+  // See [Language
+  // Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
+  // of the currently supported language codes. If alternative languages are
+  // listed, recognition result will contain recognition in the most likely
+  // language detected including the main language_code. The recognition result
+  // will include the language tag of the language detected in the audio. Note:
+  // This feature is only supported for Voice Command and Voice Search use cases
+  // and performance may vary for other use cases (e.g., phone call
+  // transcription).
+  repeated string alternative_language_codes = 18;
+
   // Maximum number of recognition hypotheses to be returned.
   // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
   // within each `SpeechRecognitionResult`.
@@ -293,6 +314,13 @@ message RecognitionConfig {
   // won't be filtered out.
   bool profanity_filter = 5;

+  // Speech adaptation configuration improves the accuracy of speech
+  // recognition. For more information, see the [speech
+  // adaptation](https://cloud.google.com/speech-to-text/docs/adaptation)
+  // documentation.
+  // When speech adaptation is set it supersedes the `speech_contexts` field.
+  SpeechAdaptation adaptation = 20;
+
   // Array of [SpeechContext][google.cloud.speech.v1.SpeechContext].
   // A means to provide context to assist the speech recognition. For more
   // information, see
@@ -306,12 +334,33 @@ message RecognitionConfig {
   // `false`.
   bool enable_word_time_offsets = 8;

+  // If `true`, the top result includes a list of words and the
+  // confidence for those words. If `false`, no word-level confidence
+  // information is returned. The default is `false`.
+  bool enable_word_confidence = 15;
+
   // If 'true', adds punctuation to recognition result hypotheses.
   // This feature is only available in select languages. Setting this for
   // requests in other languages has no effect at all.
   // The default 'false' value does not add punctuation to result hypotheses.
   bool enable_automatic_punctuation = 11;

+  // The spoken punctuation behavior for the call
+  // If not set, uses default behavior based on model of choice
+  // e.g. command_and_search will enable spoken punctuation by default
+  // If 'true', replaces spoken punctuation with the corresponding symbols in
+  // the request. For example, "how are you question mark" becomes "how are
+  // you?". See https://cloud.google.com/speech-to-text/docs/spoken-punctuation
+  // for support. If 'false', spoken punctuation is not replaced.
+  google.protobuf.BoolValue enable_spoken_punctuation = 22;
+
+  // The spoken emoji behavior for the call
+  // If not set, uses default behavior based on model of choice
+  // If 'true', adds spoken emoji formatting for the request. This will replace
+  // spoken emojis with the corresponding Unicode symbols in the final
+  // transcript. If 'false', spoken emojis are not replaced.
+  google.protobuf.BoolValue enable_spoken_emojis = 23;
+
   // Config to enable speaker diarization and set additional
   // parameters to make diarization better suited for your application.
   // Note: When this is enabled, we send all the words from the beginning of the
@@ -537,6 +586,16 @@ message SpeechContext {
   // improves the likelihood of correctly transcribing audio that includes
   // months.
   repeated string phrases = 1;
+
+  // Hint Boost. Positive value will increase the probability that a specific
+  // phrase will be recognized over other similar sounding phrases. The higher
+  // the boost, the higher the chance of false positive recognition as well.
+  // Negative boost values would correspond to anti-biasing. Anti-biasing is not
+  // enabled, so negative boost will simply be ignored. Though `boost` can
+  // accept a wide range of positive values, most use cases are best served with
+  // values between 0 and 20. We recommend using a binary search approach to
+  // finding the optimal value for your use case.
+  float boost = 4;
 }

 // Contains audio data in the encoding specified in the `RecognitionConfig`.
@@ -587,6 +646,12 @@ message LongRunningRecognizeResponse {

   // When available, billed audio seconds for the corresponding request.
   google.protobuf.Duration total_billed_time = 3;
+
+  // Original output config if present in the request.
+  TranscriptOutputConfig output_config = 6;
+
+  // If the transcript output fails this field contains the relevant error.
+  google.rpc.Status output_error = 7;
 }

 // Describes the progress of a long-running `LongRunningRecognize` call. It is
@@ -723,11 +788,10 @@ message StreamingRecognitionResult {
   // For audio_channel_count = N, its output values can range from '1' to 'N'.
   int32 channel_tag = 5;

-  // The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of
-  // the language in this result. This language code was detected to have the
-  // most likelihood of being spoken in the audio.
-  string language_code = 6
-      [(google.api.field_behavior) = OUTPUT_ONLY];
+  // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
+  // of the language in this result. This language code was detected to have
+  // the most likelihood of being spoken in the audio.
+  string language_code = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
 }

 // A speech recognition result corresponding to a portion of the audio.
@@ -742,6 +806,15 @@ message SpeechRecognitionResult {
   // recognized result for the audio from that channel.
   // For audio_channel_count = N, its output values can range from '1' to 'N'.
   int32 channel_tag = 2;
+
+  // Time offset of the end of this result relative to the
+  // beginning of the audio.
+  google.protobuf.Duration result_end_time = 4;
+
+  // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
+  // of the language in this result. This language code was detected to have
+  // the most likelihood of being spoken in the audio.
+  string language_code = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
 }

 // Alternative hypotheses (a.k.a. n-best list).
@@ -785,6 +858,15 @@ message WordInfo {
   // The word corresponding to this set of information.
   string word = 3;

+  // The confidence estimate between 0.0 and 1.0. A higher number
+  // indicates an estimated greater likelihood that the recognized words are
+  // correct. This field is set only for the top alternative of a non-streaming
+  // result or, of a streaming result where `is_final=true`.
+  // This field is not guaranteed to be accurate and users should not rely on it
+  // to be always provided.
+  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
+  float confidence = 4;
+
   // Output only. A distinct integer value is assigned for every speaker within
   // the audio. This field specifies which one of those speakers was detected to
   // have spoken this word. Value ranges from '1' to diarization_speaker_count.
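Taken together, the `cloud_speech.proto` changes above add several new request options. Below is a minimal, hypothetical sketch of how they might be set from the regenerated Node.js client. It assumes the `@google-cloud/speech` `SpeechClient.recognize()` surface with camelCase field names as produced by the generator, that `google.protobuf.BoolValue` fields are passed as `{value: ...}` wrappers, and that the `gs://` URI and phrase list are placeholders; treat it as illustrative rather than canonical.

```ts
import {SpeechClient} from '@google-cloud/speech';

async function recognizeWithNewFields(): Promise<void> {
  const client = new SpeechClient();

  const [response] = await client.recognize({
    config: {
      // New WEBM_OPUS codec; sample rate must be 8000, 12000, 16000, 24000, or 48000.
      encoding: 'WEBM_OPUS',
      sampleRateHertz: 48000,
      languageCode: 'en-US',
      // Up to 3 alternative BCP-47 tags; the detected tag is returned on each result.
      alternativeLanguageCodes: ['es-ES', 'fr-FR'],
      // Word-level confidence on the top alternative.
      enableWordConfidence: true,
      // BoolValue wrapper fields take {value: ...} rather than a bare boolean.
      enableSpokenPunctuation: {value: true},
      enableSpokenEmojis: {value: false},
      // Hint boost on a SpeechContext phrase list (0-20 covers most use cases).
      speechContexts: [{phrases: ['weather forecast'], boost: 15}],
    },
    audio: {uri: 'gs://my-bucket/sample.webm'}, // placeholder URI
  });

  for (const result of response.results ?? []) {
    const alt = result.alternatives?.[0];
    // languageCode on the result is the newly added detected-language tag.
    console.log(result.languageCode, alt?.transcript);
    for (const word of alt?.words ?? []) {
      // confidence is the new per-word field on WordInfo.
      console.log(`  ${word.word}: ${word.confidence}`);
    }
  }
}

recognizeWithNewFields().catch(console.error);
```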
packages/google-cloud-speech/protos/google/cloud/speech/v1/resource.proto

+140 −0
@@ -0,0 +1,140 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto3";
+
+package google.cloud.speech.v1;
+
+import "google/api/resource.proto";
+import "google/protobuf/timestamp.proto";
+import "google/api/annotations.proto";
+
+option cc_enable_arenas = true;
+option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1;speech";
+option java_multiple_files = true;
+option java_outer_classname = "SpeechResourceProto";
+option java_package = "com.google.cloud.speech.v1";
+option objc_class_prefix = "GCS";
+
+// A set of words or phrases that represents a common concept likely to appear
+// in your audio, for example a list of passenger ship names. CustomClass items
+// can be substituted into placeholders that you set in PhraseSet phrases.
+message CustomClass {
+  option (google.api.resource) = {
+    type: "speech.googleapis.com/CustomClass"
+    pattern: "projects/{project}/locations/{location}/customClasses/{custom_class}"
+  };
+
+  // An item of the class.
+  message ClassItem {
+    // The class item's value.
+    string value = 1;
+  }
+
+  // The resource name of the custom class.
+  string name = 1;
+
+  // If this custom class is a resource, the custom_class_id is the resource id
+  // of the CustomClass. Case sensitive.
+  string custom_class_id = 2;
+
+  // A collection of class items.
+  repeated ClassItem items = 3;
+}
+
+// Provides "hints" to the speech recognizer to favor specific words and phrases
+// in the results.
+message PhraseSet {
+  option (google.api.resource) = {
+    type: "speech.googleapis.com/PhraseSet"
+    pattern: "projects/{project}/locations/{location}/phraseSets/{phrase_set}"
+  };
+
+  // A phrases containing words and phrase "hints" so that
+  // the speech recognition is more likely to recognize them. This can be used
+  // to improve the accuracy for specific words and phrases, for example, if
+  // specific commands are typically spoken by the user. This can also be used
+  // to add additional words to the vocabulary of the recognizer. See
+  // [usage limits](https://cloud.google.com/speech-to-text/quotas#content).
+  //
+  // List items can also include pre-built or custom classes containing groups
+  // of words that represent common concepts that occur in natural language. For
+  // example, rather than providing a phrase hint for every month of the
+  // year (e.g. "i was born in january", "i was born in febuary", ...), use the
+  // pre-built `$MONTH` class improves the likelihood of correctly transcribing
+  // audio that includes months (e.g. "i was born in $month").
+  // To refer to pre-built classes, use the class' symbol prepended with `$`
+  // e.g. `$MONTH`. To refer to custom classes that were defined inline in the
+  // request, set the class's `custom_class_id` to a string unique to all class
+  // resources and inline classes. Then use the class' id wrapped in $`{...}`
+  // e.g. "${my-months}". To refer to custom classes resources, use the class'
+  // id wrapped in `${}` (e.g. `${my-months}`).
+  //
+  // Speech-to-Text supports three locations: `global`, `us` (US North America),
+  // and `eu` (Europe). If you are calling the `speech.googleapis.com`
+  // endpoint, use the `global` location. To specify a region, use a
+  // [regional endpoint](/speech-to-text/docs/endpoints) with matching `us` or
+  // `eu` location value.
+  message Phrase {
+    // The phrase itself.
+    string value = 1;
+
+    // Hint Boost. Overrides the boost set at the phrase set level.
+    // Positive value will increase the probability that a specific phrase will
+    // be recognized over other similar sounding phrases. The higher the boost,
+    // the higher the chance of false positive recognition as well. Negative
+    // boost will simply be ignored. Though `boost` can accept a wide range of
+    // positive values, most use cases are best served
+    // with values between 0 and 20. We recommend using a binary search approach
+    // to finding the optimal value for your use case. Speech recognition
+    // will skip PhraseSets with a boost value of 0.
+    float boost = 2;
+  }
+
+  // The resource name of the phrase set.
+  string name = 1;
+
+  // A list of word and phrases.
+  repeated Phrase phrases = 2;
+
+  // Hint Boost. Positive value will increase the probability that a specific
+  // phrase will be recognized over other similar sounding phrases. The higher
+  // the boost, the higher the chance of false positive recognition as well.
+  // Negative boost values would correspond to anti-biasing. Anti-biasing is not
+  // enabled, so negative boost will simply be ignored. Though `boost` can
+  // accept a wide range of positive values, most use cases are best served with
+  // values between 0 (exclusive) and 20. We recommend using a binary search
+  // approach to finding the optimal value for your use case. Speech recognition
+  // will skip PhraseSets with a boost value of 0.
+  float boost = 4;
+}
+
+// Speech adaptation configuration.
+message SpeechAdaptation {
+  // A collection of phrase sets. To specify the hints inline, leave the
+  // phrase set's `name` blank and fill in the rest of its fields. Any
+  // phrase set can use any custom class.
+  repeated PhraseSet phrase_sets = 1;
+
+  // A collection of phrase set resource names to use.
+  repeated string phrase_set_references = 2 [(google.api.resource_reference) = {
+    type: "speech.googleapis.com/PhraseSet"
+  }];
+
+  // A collection of custom classes. To specify the classes inline, leave the
+  // class' `name` blank and fill in the rest of its fields, giving it a unique
+  // `custom_class_id`. Refer to the inline defined class in phrase hints by its
+  // `custom_class_id`.
+  repeated CustomClass custom_classes = 3;
+}
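These new resource messages are wired into requests through the `adaptation` field added to `RecognitionConfig` above, which supersedes `speech_contexts` when set. Here is a hedged sketch of supplying an inline `PhraseSet` and `CustomClass` through the same assumed `@google-cloud/speech` client; the `shipping-class` id, phrase text, and resource name are illustrative placeholders, not values from this commit.

```ts
import {SpeechClient} from '@google-cloud/speech';

async function recognizeWithAdaptation(): Promise<void> {
  const client = new SpeechClient();

  const [response] = await client.recognize({
    config: {
      encoding: 'LINEAR16',
      sampleRateHertz: 16000,
      languageCode: 'en-US',
      // When adaptation is set it supersedes the speech_contexts field.
      adaptation: {
        // Inline phrase set: leave `name` blank and fill in the phrases.
        phraseSets: [
          {
            phrases: [
              // Reference the inline custom class below via ${custom_class_id}.
              {value: 'please ship it via ${shipping-class}', boost: 10},
              // Pre-built classes are referenced with a leading `$`.
              {value: 'my birthday is in $MONTH', boost: 5},
            ],
            boost: 8, // phrase-level boost overrides this set-level value
          },
        ],
        // Inline custom class: blank `name`, unique `customClassId`.
        customClasses: [
          {
            customClassId: 'shipping-class',
            items: [{value: 'overnight'}, {value: 'two day'}, {value: 'ground'}],
          },
        ],
        // Alternatively, reference existing PhraseSet resources by name:
        // phraseSetReferences: ['projects/PROJECT/locations/global/phraseSets/SET_ID'],
      },
    },
    audio: {uri: 'gs://my-bucket/order.wav'}, // placeholder URI
  });

  console.log(JSON.stringify(response.results, null, 2));
}

recognizeWithAdaptation().catch(console.error);
```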
