19
19
import com .google .api .gax .longrunning .OperationFuture ;
20
20
import com .google .api .gax .rpc .ApiStreamObserver ;
21
21
import com .google .api .gax .rpc .BidiStreamingCallable ;
22
+ import com .google .api .gax .rpc .ClientStream ;
23
+ import com .google .api .gax .rpc .ResponseObserver ;
24
+ import com .google .api .gax .rpc .StreamController ;
22
25
import com .google .cloud .speech .v1p1beta1 .LongRunningRecognizeMetadata ;
23
26
import com .google .cloud .speech .v1p1beta1 .LongRunningRecognizeResponse ;
24
27
import com .google .cloud .speech .v1p1beta1 .RecognitionAudio ;
47
50
import java .util .ArrayList ;
48
51
import java .util .List ;
49
52
53
+ import javax .sound .sampled .AudioFormat ;
54
+ import javax .sound .sampled .AudioInputStream ;
55
+ import javax .sound .sampled .AudioSystem ;
56
+ import javax .sound .sampled .DataLine ;
57
+ import javax .sound .sampled .DataLine .Info ;
58
+ import javax .sound .sampled .TargetDataLine ;
59
+
50
60
public class Recognize {
51
61
52
62
/** Run speech recognition tasks. */
@@ -56,9 +66,10 @@ public static void main(String... args) throws Exception {
56
66
System .out .printf (
57
67
"\t java %s \" <command>\" \" <path-to-image>\" \n "
58
68
+ "Commands:\n "
59
- + "\t syncrecognize | asyncrecognize | streamrecognize | wordoffsets\n "
60
- + "\t | model-selection | auto-punctuation | stream-punctuation | enhanced-model\n "
61
- + "\t | metadata | diarization | multi-channel | multi-language | word-level-conf"
69
+ + "\t syncrecognize | asyncrecognize | streamrecognize | micstreamrecognize \n "
70
+ + "\t | wordoffsets | model-selection | auto-punctuation | stream-punctuation \n "
71
+ + "\t | enhanced-model| metadata | diarization | multi-channel | multi-language \n "
72
+ + "\t | word-level-conf"
62
73
+ "Path:\n \t A file path (ex: ./resources/audio.raw) or a URI "
63
74
+ "for a Cloud Storage resource (gs://...)\n " ,
64
75
Recognize .class .getCanonicalName ());
@@ -88,6 +99,8 @@ public static void main(String... args) throws Exception {
88
99
}
89
100
} else if (command .equals ("streamrecognize" )) {
90
101
streamingRecognizeFile (path );
102
+ } else if (command .equals ("micstreamrecognize" )) {
103
+ streamingMicRecognize ();
91
104
} else if (command .equals ("model-selection" )) {
92
105
if (path .startsWith ("gs://" )) {
93
106
transcribeModelSelectionGcs (path );
@@ -704,6 +717,97 @@ public SettableFuture<List<T>> future() {
704
717
}
705
718
// [END speech_stream_recognize_punctuation]
706
719
720
+ // [START speech_streaming_mic_recognize]
721
+ /** Performs microphone streaming speech recognition with a duration of 1 minute. */
722
+ public static void streamingMicRecognize () throws Exception {
723
+
724
+ ResponseObserver <StreamingRecognizeResponse > responseObserver = null ;
725
+ try (SpeechClient client = SpeechClient .create ()) {
726
+
727
+ responseObserver =
728
+ new ResponseObserver <StreamingRecognizeResponse >() {
729
+ ArrayList <StreamingRecognizeResponse > responses = new ArrayList <>();
730
+
731
+ public void onStart (StreamController controller ) {}
732
+
733
+ public void onResponse (StreamingRecognizeResponse response ) {
734
+ responses .add (response );
735
+ }
736
+
737
+ public void onComplete () {
738
+ for (StreamingRecognizeResponse response : responses ) {
739
+ StreamingRecognitionResult result = response .getResultsList ().get (0 );
740
+ SpeechRecognitionAlternative alternative = result .getAlternativesList ().get (0 );
741
+ System .out .printf ("Transcript : %s\n " , alternative .getTranscript ());
742
+ }
743
+ }
744
+
745
+ public void onError (Throwable t ) {
746
+ System .out .println (t );
747
+ }
748
+ };
749
+
750
+ ClientStream <StreamingRecognizeRequest > clientStream =
751
+ client .streamingRecognizeCallable ().splitCall (responseObserver );
752
+
753
+ RecognitionConfig recognitionConfig =
754
+ RecognitionConfig .newBuilder ()
755
+ .setEncoding (RecognitionConfig .AudioEncoding .LINEAR16 )
756
+ .setLanguageCode ("en-US" )
757
+ .setSampleRateHertz (16000 )
758
+ .build ();
759
+ StreamingRecognitionConfig streamingRecognitionConfig =
760
+ StreamingRecognitionConfig .newBuilder ().setConfig (recognitionConfig ).build ();
761
+
762
+ StreamingRecognizeRequest request =
763
+ StreamingRecognizeRequest .newBuilder ()
764
+ .setStreamingConfig (streamingRecognitionConfig )
765
+ .build (); // The first request in a streaming call has to be a config
766
+
767
+ clientStream .send (request );
768
+ // SampleRate:16000Hz, SampleSizeInBits: 16, Number of channels: 1, Signed: true,
769
+ // bigEndian: false
770
+ AudioFormat audioFormat = new AudioFormat (16000 , 16 , 1 , true , false );
771
+ DataLine .Info targetInfo =
772
+ new Info (
773
+ TargetDataLine .class ,
774
+ audioFormat ); // Set the system information to read from the microphone audio stream
775
+
776
+ if (!AudioSystem .isLineSupported (targetInfo )) {
777
+ System .out .println ("Microphone not supported" );
778
+ System .exit (0 );
779
+ }
780
+ // Target data line captures the audio stream the microphone produces.
781
+ TargetDataLine targetDataLine = (TargetDataLine ) AudioSystem .getLine (targetInfo );
782
+ targetDataLine .open (audioFormat );
783
+ targetDataLine .start ();
784
+ System .out .println ("Start speaking" );
785
+ long startTime = System .currentTimeMillis ();
786
+ // Audio Input Stream
787
+ AudioInputStream audio = new AudioInputStream (targetDataLine );
788
+ while (true ) {
789
+ long estimatedTime = System .currentTimeMillis () - startTime ;
790
+ byte [] data = new byte [6400 ];
791
+ audio .read (data );
792
+ if (estimatedTime > 60000 ) { // 60 seconds
793
+ System .out .println ("Stop speaking." );
794
+ targetDataLine .stop ();
795
+ targetDataLine .close ();
796
+ break ;
797
+ }
798
+ request =
799
+ StreamingRecognizeRequest .newBuilder ()
800
+ .setAudioContent (ByteString .copyFrom (data ))
801
+ .build ();
802
+ clientStream .send (request );
803
+ }
804
+ } catch (Exception e ) {
805
+ System .out .println (e );
806
+ }
807
+ responseObserver .onComplete ();
808
+ }
809
+ // [END speech_streaming_mic_recognize]
810
+
707
811
// [START speech_transcribe_file_with_enhanced_model]
708
812
/**
709
813
* Transcribe the given audio file using an enhanced model.
@@ -833,8 +937,9 @@ public static void transcribeDiarization(String fileName) throws Exception {
833
937
SpeechRecognitionAlternative alternative = result .getAlternatives (0 );
834
938
System .out .format ("Transcript : %s\n " , alternative .getTranscript ());
835
939
// The words array contains the entire transcript up until that point.
836
- //Referencing the last spoken word to get the associated Speaker tag
837
- System .out .format ("Speaker Tag %s: %s\n " ,
940
+ // Referencing the last spoken word to get the associated Speaker tag
941
+ System .out .format (
942
+ "Speaker Tag %s: %s\n " ,
838
943
alternative .getWords ((alternative .getWordsCount () - 1 )).getSpeakerTag (),
839
944
alternative .getTranscript ());
840
945
}
@@ -877,8 +982,9 @@ public static void transcribeDiarizationGcs(String gcsUri) throws Exception {
877
982
// use the first (most likely) one here.
878
983
SpeechRecognitionAlternative alternative = result .getAlternatives (0 );
879
984
// The words array contains the entire transcript up until that point.
880
- //Referencing the last spoken word to get the associated Speaker tag
881
- System .out .format ("Speaker Tag %s:%s\n " ,
985
+ // Referencing the last spoken word to get the associated Speaker tag
986
+ System .out .format (
987
+ "Speaker Tag %s:%s\n " ,
882
988
alternative .getWords ((alternative .getWordsCount () - 1 )).getSpeakerTag (),
883
989
alternative .getTranscript ());
884
990
}
0 commit comments