@@ -97,9 +97,10 @@ def __init__(
97
97
Path to a local models directory.
98
98
If not specified, a directory "models" will be created in the
99
99
script directory.
100
- voice (str):
101
- Name to the file containing the voice to clone.
102
- Works with a 44100Hz or 22050Hz mono 32bit float WAV file.
100
+ voice (Union[str, List[str]]):
101
+ Name(s) of the file(s) containing the voice to clone.
102
+ Works with a 44100Hz or 22050Hz mono 32bit float WAV file,
103
+ or a list of such files.
103
104
language (str):
104
105
Language to use for the coqui model.
105
106
speed (float):
@@ -135,8 +136,8 @@ def __init__(
135
136
Function to prepare text for synthesis.
136
137
If not specified, a default sentence parser will be used.
137
138
add_sentence_filter (bool):
138
- Adds a own sentence filter in addition
139
- to the one coqui tts already provides.
139
+ Adds a custom sentence filter in addition
140
+ to the one coqui TTS already provides.
140
141
pretrained (bool):
141
142
Use a pretrained model for the coqui model.
142
143
comma_silence_duration (float):
@@ -212,7 +213,6 @@ def __init__(
212
213
# Start the worker process
213
214
try :
214
215
# Always request "spawn" on Linux/macOS; on other platforms only set the start method if it hasn't been set already
215
- # Check the current platform and set the start method
216
216
if sys .platform .startswith ('linux' ) or sys .platform == 'darwin' : # For Linux or macOS
217
217
mp .set_start_method ("spawn" )
218
218
elif mp .get_start_method (allow_none = True ) is None :
@@ -335,8 +335,8 @@ def _synthesize_worker(
335
335
conn (multiprocessing.Connection):
336
336
Connection to the parent process.
337
337
model_name (str): Name of the coqui model to use.
338
- cloning_reference_wav (str):
339
- Name to the file containing the voice to clone.
338
+ cloning_reference_wav (Union[ str, List[str]] ):
339
+ The file(s) containing the voice to clone.
340
340
language (str): Language to use for the coqui model.
341
341
ready_event (multiprocessing.Event):
342
342
Event to signal when the model is ready.
@@ -356,7 +356,7 @@ def _synthesize_worker(
356
356
357
357
def get_conditioning_latents (filenames : Union [str , List [str ]], tts ):
358
358
"""
359
- Method still needs some rework
359
+ Computes and/or loads speaker latents for the given filename(s).
360
360
"""
361
361
if not isinstance (filenames , list ):
362
362
filenames = [filenames ]
@@ -370,7 +370,6 @@ def get_conditioning_latents(filenames: Union[str, List[str]], tts):
370
370
if len (filenames ) == 1 :
371
371
logging .debug ("Handling of single voice file" )
372
372
373
- # verify that filename ends with .wav
374
373
filename = filenames [0 ]
375
374
if filename .endswith (".json" ):
376
375
filename_json = filename
@@ -467,7 +466,6 @@ def get_conditioning_latents(filenames: Union[str, List[str]], tts):
467
466
else :
468
467
audio_path_list = []
469
468
for filename in filenames :
470
- # verify that filename ends with .wav
471
469
if filename .endswith (".wav" ):
472
470
if voices_path :
473
471
filename_voice_wav = os .path .join (voices_path , filename )
@@ -493,8 +491,7 @@ def get_conditioning_latents(filenames: Union[str, List[str]], tts):
493
491
f"Default voice file { filename_voice_json } not found."
494
492
)
495
493
496
- # compute and write latents to json file
497
- logging .debug (f"Computing latents for { filename } " )
494
+ logging .debug (f"Computing latents for the provided list: { filenames } " )
498
495
499
496
gpt_cond_latent , speaker_embedding = tts .get_conditioning_latents (
500
497
audio_path = audio_path_list , gpt_cond_len = 30 , max_ref_length = 60
@@ -507,6 +504,7 @@ def get_conditioning_latents(filenames: Union[str, List[str]], tts):
507
504
.half ()
508
505
.tolist (),
509
506
}
507
+ # Save latents alongside the first WAV
510
508
filename_voice_json = audio_path_list [0 ][:- 3 ] + "json"
511
509
with open (filename_voice_json , "w" ) as new_file :
512
510
json .dump (latents , new_file )
@@ -720,9 +718,7 @@ def get_user_data_dir(appname):
720
718
chunk_bytes = chunk .tobytes ()
721
719
722
720
conn .send (("success" , chunk_bytes ))
723
- chunk_duration = len (chunk_bytes ) / (
724
- 4 * 24000
725
- ) # 4 bytes per sample, 24000 Hz
721
+ chunk_duration = len (chunk_bytes ) / (4 * 24000 ) # 4 bytes per sample, 24000 Hz
726
722
full_generated_seconds += chunk_duration
727
723
if i == 0 :
728
724
first_chunk_length_seconds = chunk_duration
@@ -734,8 +730,7 @@ def get_user_data_dir(appname):
734
730
chunk_production_seconds = time .time () - time_start
735
731
generated_audio_seconds = full_generated_seconds
736
732
737
- # wait only if we are faster than realtime, meaning
738
- # that chunk_production_seconds is smaller than generated_audio_seconds
733
+ # wait only if we are faster than realtime
739
734
if load_balancing :
740
735
if chunk_production_seconds < (generated_audio_seconds + load_balancing_buffer_length ):
741
736
waiting_time = generated_audio_seconds - chunk_production_seconds - load_balancing_cut_off
@@ -759,16 +754,15 @@ def get_user_data_dir(appname):
759
754
print (f"Realtime Factor: { realtime_factor } " )
760
755
print (f"Raw Inference Factor: { raw_inference_factor } " )
761
756
762
-
763
757
# Send silent audio
764
758
sample_rate = config .audio .sample_rate
765
759
766
760
end_sentence_delimeters = ".!?…。¡¿"
767
761
mid_sentence_delimeters = ";:,\n ()[]{}-“”„”—/|《》"
768
762
769
- if text [- 1 ] in end_sentence_delimeters :
763
+ if text and text [- 1 ] in end_sentence_delimeters :
770
764
silence_duration = sentence_silence_duration
771
- elif text [- 1 ] in mid_sentence_delimeters :
765
+ elif text and text [- 1 ] in mid_sentence_delimeters :
772
766
silence_duration = comma_silence_duration
773
767
else :
774
768
silence_duration = default_silence_duration
@@ -782,7 +776,7 @@ def get_user_data_dir(appname):
782
776
end_time = time .time ()
783
777
784
778
except KeyboardInterrupt :
785
- logging .info ("Keyboard interrupt received. " " Exiting worker process." )
779
+ logging .info ("Keyboard interrupt received. Exiting worker process." )
786
780
conn .send (("shutdown" , "shutdown" ))
787
781
788
782
except Exception as e :
@@ -796,7 +790,7 @@ def get_user_data_dir(appname):
796
790
print (f"Error: { e } " )
797
791
798
792
conn .send (("error" , str (e )))
799
-
793
+
800
794
sys .stdout = sys .__stdout__
801
795
sys .stderr = sys .__stderr__
802
796
@@ -807,9 +801,13 @@ def send_command(self, command, data):
807
801
message = {"command" : command , "data" : data }
808
802
self .parent_synthesize_pipe .send (message )
809
803
810
- def set_cloning_reference (self , cloning_reference_wav : str ):
804
+ def set_cloning_reference (self , cloning_reference_wav : Union [ str , List [ str ]] ):
811
805
"""
812
806
Send an 'update_reference' command and wait for a response.
807
+
808
+ Args:
809
+ cloning_reference_wav (Union[str, List[str]]):
810
+ Name(s) of the file(s) containing the voice to clone.
813
811
"""
814
812
if not isinstance (cloning_reference_wav , list ):
815
813
cloning_reference_wav = [cloning_reference_wav ]
@@ -861,17 +859,15 @@ def get_stream_info(self):
861
859
tuple: A tuple containing the audio format, number of channels,
862
860
and the sample rate.
863
861
- Format (int): The format of the audio stream.
864
- pyaudio.paInt16 represents 16-bit integers.
865
- - Channels (int): The number of audio channels.
866
- 1 represents mono audio.
867
- - Sample Rate (int): The sample rate of the audio in Hz.
868
- 16000 represents 16kHz sample rate.
862
+ pyaudio.paFloat32 represents 32-bit float samples.
863
+ - Channels (int): The number of audio channels (1 = mono).
864
+ - Sample Rate (int): The sample rate of the audio in Hz (24000).
869
865
"""
870
866
return pyaudio .paFloat32 , 1 , 24000
871
867
872
868
def _prepare_text_for_synthesis (self , text : str ):
873
869
"""
874
- Splits a text into sentences .
870
+ Splits and cleans a text for speech synthesis .
875
871
876
872
Args:
877
873
text (str): Text to prepare for synthesis.
@@ -885,19 +881,14 @@ def _prepare_text_for_synthesis(self, text: str):
885
881
if self .prepare_text_callback :
886
882
return self .prepare_text_callback (text )
887
883
888
- # A fast fix for last character, may produce weird sounds if it is with text
889
884
text = text .strip ()
890
885
text = text .replace ("</s>" , "" )
891
- # text = re.sub("```.*```", "", text, flags=re.DOTALL)
892
- # text = re.sub("`.*`", "", text, flags=re.DOTALL)
893
886
text = re .sub ("\\ (.*?\\ )" , "" , text , flags = re .DOTALL )
894
887
text = text .replace ("```" , "" )
895
888
text = text .replace ("..." , " " )
896
889
text = text .replace ("»" , "" )
897
890
text = text .replace ("«" , "" )
898
891
text = re .sub (" +" , " " , text )
899
- # text= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)",r"\1 \2\2",text)
900
- # text= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)",r"\1 \2",text)
901
892
902
893
try :
903
894
if len (text ) > 2 and text [- 1 ] in ["." ]:
@@ -908,7 +899,6 @@ def _prepare_text_for_synthesis(self, text: str):
908
899
text = text [:- 2 ]
909
900
elif len (text ) > 3 and text [- 2 ] in ["!" , "?" , "," ]:
910
901
text = text [:- 2 ] + " " + text [- 2 ]
911
-
912
902
except Exception as e :
913
903
logging .warning (
914
904
f'Error fixing sentence end punctuation: { e } , Text: "{ text } "'
@@ -1008,41 +998,56 @@ def get_voices(self):
1008
998
voices_appended = []
1009
999
1010
1000
# Add custom voices
1011
- files = os .listdir (self .voices_path )
1012
- for file in files :
1013
- # remove ending .wav or .json from filename
1014
- if file .endswith (".wav" ):
1015
- file = file [:- 4 ]
1016
- elif file .endswith (".json" ):
1017
- file = file [:- 5 ]
1018
- else :
1019
- continue
1001
+ if self . voices_path and os .path . isdir (self .voices_path ):
1002
+ files = os . listdir ( self . voices_path )
1003
+ for file in files :
1004
+ if file .endswith (".wav" ):
1005
+ file = file [:- 4 ]
1006
+ elif file .endswith (".json" ):
1007
+ file = file [:- 5 ]
1008
+ else :
1009
+ continue
1020
1010
1021
- if file in voices_appended :
1022
- continue
1011
+ if file in voices_appended :
1012
+ continue
1023
1013
1024
- voices_appended .append (file )
1025
- voice_objects .append (CoquiVoice (file ))
1014
+ voices_appended .append (file )
1015
+ voice_objects .append (CoquiVoice (file ))
1026
1016
1027
1017
# Add predefined coqui system voices
1028
1018
for voice in self .voices_list :
1029
1019
voice_objects .append (CoquiVoice (voice ))
1030
1020
1031
1021
return voice_objects
1032
1022
1033
def set_voice(self, voice: Union[str, List[str], CoquiVoice]):
    """
    Sets the voice(s) to be used for speech synthesis.

    Args:
        voice (Union[str, List[str], CoquiVoice]):
            Name of the voice, a list of voice file paths,
            or a CoquiVoice instance.

    Returns:
        The result of `set_cloning_reference` for the resolved voice,
        or None when an empty list is passed (nothing is changed then).
    """
    # A CoquiVoice instance is resolved through its name
    if isinstance(voice, CoquiVoice):
        return self.set_cloning_reference(voice.name)

    # A list is forwarded as-is; its entries are assumed to be file paths
    if isinstance(voice, list):
        if not voice:
            logging.warning("Received an empty list for set_voice.")
            return None
        return self.set_cloning_reference(voice)

    # Otherwise it's a string: prefer an exact match among the known voices.
    # NOTE(review): on an exact match the same string is passed as in the
    # fallback below, so this lookup looks redundant - kept to preserve the
    # existing get_voices() call; confirm before removing.
    for installed_voice in self.get_voices():
        if voice == installed_voice.name:
            return self.set_cloning_reference(installed_voice.name)

    # Not found among the known voices: treat as a new file or path.
    # Return the result here too, so every branch behaves the same way.
    return self.set_cloning_reference(voice)
1046
1051
1047
1052
def set_voice_parameters (self , ** voice_parameters ):
1048
1053
"""
@@ -1051,7 +1056,7 @@ def set_voice_parameters(self, **voice_parameters):
1051
1056
Args:
1052
1057
**voice_parameters: The voice parameters to be used for speech synthesis.
1053
1058
1054
- This method should be overridden by the derived class to set the desired voice parameters.
1059
+ This method can be overridden by the derived class to set the desired voice parameters.
1055
1060
"""
1056
1061
pass
1057
1062
@@ -1072,7 +1077,6 @@ def shutdown(self):
1072
1077
if "shutdown" in status :
1073
1078
logging .info ("Worker process acknowledged shutdown" )
1074
1079
except EOFError :
1075
- # Pipe was closed, meaning the process is already down
1076
1080
logging .warning (
1077
1081
"Worker process pipe was closed before shutdown acknowledgement"
1078
1082
)
0 commit comments