Skip to content

Commit 25b562c

Browse files
committed
multiple voices for set_voice method in coquiengine
1 parent 7eeee52 commit 25b562c

File tree

4 files changed

+70
-66
lines changed

4 files changed

+70
-66
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ Let me know if you need any adjustments or additional languages!
5858

5959
## Updates
6060

61-
Latest Version: v0.4.47
61+
Latest Version: v0.4.48
6262

6363
Support for more Kokoro languages. Full installation, including Japanese and Chinese language support (see updated test file):
6464
```shell

RealtimeTTS/engines/coqui_engine.py

+65-61
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,10 @@ def __init__(
9797
Path to a local models directory.
9898
If not specified, a directory "models" will be created in the
9999
script directory.
100-
voice (str):
101-
Name to the file containing the voice to clone.
102-
Works with a 44100Hz or 22050Hz mono 32bit float WAV file.
100+
voice (Union[str, List[str]]):
101+
Name(s) of the file(s) containing the voice to clone.
102+
Works with a 44100Hz or 22050Hz mono 32bit float WAV file,
103+
or a list of such files.
103104
language (str):
104105
Language to use for the coqui model.
105106
speed (float):
@@ -135,8 +136,8 @@ def __init__(
135136
Function to prepare text for synthesis.
136137
If not specified, a default sentence parser will be used.
137138
add_sentence_filter (bool):
138-
Adds a own sentence filter in addition
139-
to the one coqui tts already provides.
139+
Adds a custom sentence filter in addition
140+
to the one coqui TTS already provides.
140141
pretrained (bool):
141142
Use a pretrained model for the coqui model.
142143
comma_silence_duration (float):
@@ -212,7 +213,6 @@ def __init__(
212213
# Start the worker process
213214
try:
214215
# On Linux/macOS always set the "spawn" start method; on other platforms only when none has been set yet
215-
# Check the current platform and set the start method
216216
if sys.platform.startswith('linux') or sys.platform == 'darwin': # For Linux or macOS
217217
mp.set_start_method("spawn")
218218
elif mp.get_start_method(allow_none=True) is None:
@@ -335,8 +335,8 @@ def _synthesize_worker(
335335
conn (multiprocessing.Connection):
336336
Connection to the parent process.
337337
model_name (str): Name of the coqui model to use.
338-
cloning_reference_wav (str):
339-
Name to the file containing the voice to clone.
338+
cloning_reference_wav (Union[str, List[str]]):
339+
The file(s) containing the voice to clone.
340340
language (str): Language to use for the coqui model.
341341
ready_event (multiprocessing.Event):
342342
Event to signal when the model is ready.
@@ -356,7 +356,7 @@ def _synthesize_worker(
356356

357357
def get_conditioning_latents(filenames: Union[str, List[str]], tts):
358358
"""
359-
Method still needs some rework
359+
Computes and/or loads speaker latents for the given filename(s).
360360
"""
361361
if not isinstance(filenames, list):
362362
filenames = [filenames]
@@ -370,7 +370,6 @@ def get_conditioning_latents(filenames: Union[str, List[str]], tts):
370370
if len(filenames) == 1:
371371
logging.debug("Handling of single voice file")
372372

373-
# verify that filename ends with .wav
374373
filename = filenames[0]
375374
if filename.endswith(".json"):
376375
filename_json = filename
@@ -467,7 +466,6 @@ def get_conditioning_latents(filenames: Union[str, List[str]], tts):
467466
else:
468467
audio_path_list = []
469468
for filename in filenames:
470-
# verify that filename ends with .wav
471469
if filename.endswith(".wav"):
472470
if voices_path:
473471
filename_voice_wav = os.path.join(voices_path, filename)
@@ -493,8 +491,7 @@ def get_conditioning_latents(filenames: Union[str, List[str]], tts):
493491
f"Default voice file {filename_voice_json} not found."
494492
)
495493

496-
# compute and write latents to json file
497-
logging.debug(f"Computing latents for {filename}")
494+
logging.debug(f"Computing latents for the provided list: {filenames}")
498495

499496
gpt_cond_latent, speaker_embedding = tts.get_conditioning_latents(
500497
audio_path=audio_path_list, gpt_cond_len=30, max_ref_length=60
@@ -507,6 +504,7 @@ def get_conditioning_latents(filenames: Union[str, List[str]], tts):
507504
.half()
508505
.tolist(),
509506
}
507+
# Save latents alongside the first WAV
510508
filename_voice_json = audio_path_list[0][:-3] + "json"
511509
with open(filename_voice_json, "w") as new_file:
512510
json.dump(latents, new_file)
@@ -720,9 +718,7 @@ def get_user_data_dir(appname):
720718
chunk_bytes = chunk.tobytes()
721719

722720
conn.send(("success", chunk_bytes))
723-
chunk_duration = len(chunk_bytes) / (
724-
4 * 24000
725-
) # 4 bytes per sample, 24000 Hz
721+
chunk_duration = len(chunk_bytes) / (4 * 24000) # 4 bytes per sample, 24000 Hz
726722
full_generated_seconds += chunk_duration
727723
if i == 0:
728724
first_chunk_length_seconds = chunk_duration
@@ -734,8 +730,7 @@ def get_user_data_dir(appname):
734730
chunk_production_seconds = time.time() - time_start
735731
generated_audio_seconds = full_generated_seconds
736732

737-
# wait only if we are faster than realtime, meaning
738-
# that chunk_production_seconds is smaller than generated_audio_seconds
733+
# wait only if we are faster than realtime
739734
if load_balancing:
740735
if chunk_production_seconds < (generated_audio_seconds + load_balancing_buffer_length):
741736
waiting_time = generated_audio_seconds - chunk_production_seconds - load_balancing_cut_off
@@ -759,16 +754,15 @@ def get_user_data_dir(appname):
759754
print(f"Realtime Factor: {realtime_factor}")
760755
print(f"Raw Inference Factor: {raw_inference_factor}")
761756

762-
763757
# Send silent audio
764758
sample_rate = config.audio.sample_rate
765759

766760
end_sentence_delimeters = ".!?…。¡¿"
767761
mid_sentence_delimeters = ";:,\n()[]{}-“”„”—/|《》"
768762

769-
if text[-1] in end_sentence_delimeters:
763+
if text and text[-1] in end_sentence_delimeters:
770764
silence_duration = sentence_silence_duration
771-
elif text[-1] in mid_sentence_delimeters:
765+
elif text and text[-1] in mid_sentence_delimeters:
772766
silence_duration = comma_silence_duration
773767
else:
774768
silence_duration = default_silence_duration
@@ -782,7 +776,7 @@ def get_user_data_dir(appname):
782776
end_time = time.time()
783777

784778
except KeyboardInterrupt:
785-
logging.info("Keyboard interrupt received. " "Exiting worker process.")
779+
logging.info("Keyboard interrupt received. Exiting worker process.")
786780
conn.send(("shutdown", "shutdown"))
787781

788782
except Exception as e:
@@ -796,7 +790,7 @@ def get_user_data_dir(appname):
796790
print(f"Error: {e}")
797791

798792
conn.send(("error", str(e)))
799-
793+
800794
sys.stdout = sys.__stdout__
801795
sys.stderr = sys.__stderr__
802796

@@ -807,9 +801,13 @@ def send_command(self, command, data):
807801
message = {"command": command, "data": data}
808802
self.parent_synthesize_pipe.send(message)
809803

810-
def set_cloning_reference(self, cloning_reference_wav: str):
804+
def set_cloning_reference(self, cloning_reference_wav: Union[str, List[str]]):
811805
"""
812806
Send an 'update_reference' command and wait for a response.
807+
808+
Args:
809+
cloning_reference_wav (Union[str, List[str]]):
810+
Name(s) of the file(s) containing the voice to clone.
813811
"""
814812
if not isinstance(cloning_reference_wav, list):
815813
cloning_reference_wav = [cloning_reference_wav]
@@ -861,17 +859,15 @@ def get_stream_info(self):
861859
tuple: A tuple containing the audio format, number of channels,
862860
and the sample rate.
863861
- Format (int): The format of the audio stream.
864-
pyaudio.paInt16 represents 16-bit integers.
865-
- Channels (int): The number of audio channels.
866-
1 represents mono audio.
867-
- Sample Rate (int): The sample rate of the audio in Hz.
868-
16000 represents 16kHz sample rate.
862+
pyaudio.paFloat32 represents 32-bit float samples.
863+
- Channels (int): The number of audio channels (1 = mono).
864+
- Sample Rate (int): The sample rate of the audio in Hz (24000).
869865
"""
870866
return pyaudio.paFloat32, 1, 24000
871867

872868
def _prepare_text_for_synthesis(self, text: str):
873869
"""
874-
Splits a text into sentences.
870+
Splits and cleans a text for speech synthesis.
875871
876872
Args:
877873
text (str): Text to prepare for synthesis.
@@ -885,19 +881,14 @@ def _prepare_text_for_synthesis(self, text: str):
885881
if self.prepare_text_callback:
886882
return self.prepare_text_callback(text)
887883

888-
# A fast fix for last character, may produce weird sounds if it is with text
889884
text = text.strip()
890885
text = text.replace("</s>", "")
891-
# text = re.sub("```.*```", "", text, flags=re.DOTALL)
892-
# text = re.sub("`.*`", "", text, flags=re.DOTALL)
893886
text = re.sub("\\(.*?\\)", "", text, flags=re.DOTALL)
894887
text = text.replace("```", "")
895888
text = text.replace("...", " ")
896889
text = text.replace("»", "")
897890
text = text.replace("«", "")
898891
text = re.sub(" +", " ", text)
899-
# text= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)",r"\1 \2\2",text)
900-
# text= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)",r"\1 \2",text)
901892

902893
try:
903894
if len(text) > 2 and text[-1] in ["."]:
@@ -908,7 +899,6 @@ def _prepare_text_for_synthesis(self, text: str):
908899
text = text[:-2]
909900
elif len(text) > 3 and text[-2] in ["!", "?", ","]:
910901
text = text[:-2] + " " + text[-2]
911-
912902
except Exception as e:
913903
logging.warning(
914904
f'Error fixing sentence end punctuation: {e}, Text: "{text}"'
@@ -1008,41 +998,56 @@ def get_voices(self):
1008998
voices_appended = []
1009999

10101000
# Add custom voices
1011-
files = os.listdir(self.voices_path)
1012-
for file in files:
1013-
# remove ending .wav or .json from filename
1014-
if file.endswith(".wav"):
1015-
file = file[:-4]
1016-
elif file.endswith(".json"):
1017-
file = file[:-5]
1018-
else:
1019-
continue
1001+
if self.voices_path and os.path.isdir(self.voices_path):
1002+
files = os.listdir(self.voices_path)
1003+
for file in files:
1004+
if file.endswith(".wav"):
1005+
file = file[:-4]
1006+
elif file.endswith(".json"):
1007+
file = file[:-5]
1008+
else:
1009+
continue
10201010

1021-
if file in voices_appended:
1022-
continue
1011+
if file in voices_appended:
1012+
continue
10231013

1024-
voices_appended.append(file)
1025-
voice_objects.append(CoquiVoice(file))
1014+
voices_appended.append(file)
1015+
voice_objects.append(CoquiVoice(file))
10261016

10271017
# Add predefined coqui system voices
10281018
for voice in self.voices_list:
10291019
voice_objects.append(CoquiVoice(voice))
10301020

10311021
return voice_objects
10321022

1033-
def set_voice(self, voice: str):
1023+
def set_voice(self, voice: Union[str, List[str], CoquiVoice]):
10341024
"""
1035-
Sets the voice to be used for speech synthesis.
1025+
Sets the voice(s) to be used for speech synthesis.
1026+
1027+
Args:
1028+
voice (Union[str, List[str], CoquiVoice]):
1029+
Name of the voice, a list of voice file paths,
1030+
or a CoquiVoice instance.
10361031
"""
1032+
# If it's a CoquiVoice instance, just use its name
10371033
if isinstance(voice, CoquiVoice):
1038-
self.set_cloning_reference(voice.name)
1039-
else:
1040-
installed_voices = self.get_voices()
1041-
for installed_voice in installed_voices:
1042-
if voice in installed_voice.name:
1043-
self.set_cloning_reference(installed_voice.name)
1044-
return
1045-
self.set_cloning_reference(voice)
1034+
return self.set_cloning_reference(voice.name)
1035+
1036+
# If it's a list of strings, we assume these are file paths
1037+
if isinstance(voice, list):
1038+
if not voice:
1039+
logging.warning("Received an empty list for set_voice.")
1040+
return
1041+
return self.set_cloning_reference(voice)
1042+
1043+
# Otherwise, it's a string
1044+
installed_voices = self.get_voices()
1045+
for installed_voice in installed_voices:
1046+
if voice == installed_voice.name:
1047+
return self.set_cloning_reference(installed_voice.name)
1048+
1049+
# If not found among installed_voices, treat as a new file or path
1050+
self.set_cloning_reference(voice)
10461051

10471052
def set_voice_parameters(self, **voice_parameters):
10481053
"""
@@ -1051,7 +1056,7 @@ def set_voice_parameters(self, **voice_parameters):
10511056
Args:
10521057
**voice_parameters: The voice parameters to be used for speech synthesis.
10531058
1054-
This method should be overridden by the derived class to set the desired voice parameters.
1059+
This method can be overridden by the derived class to set the desired voice parameters.
10551060
"""
10561061
pass
10571062

@@ -1072,7 +1077,6 @@ def shutdown(self):
10721077
if "shutdown" in status:
10731078
logging.info("Worker process acknowledged shutdown")
10741079
except EOFError:
1075-
# Pipe was closed, meaning the process is already down
10761080
logging.warning(
10771081
"Worker process pipe was closed before shutdown acknowledgement"
10781082
)

requirements.txt

+3-3
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,13 @@ azure-cognitiveservices-speech==1.42.0
88
coqui_tts==0.25.3
99

1010
# elevenlabs is for ElevenlabsEngine
11-
elevenlabs==1.50.6
11+
elevenlabs==1.51.0
1212

1313
# gtts is for GTTSEngine
1414
gtts==2.5.4
1515

1616
# openai is for OpenAIEngine
17-
openai==1.61.0
17+
openai==1.63.0
1818

1919
# pyttsx3 is for SystemEngine
2020
pyttsx3==2.98
@@ -23,7 +23,7 @@ pyttsx3==2.98
2323
edge-tts==7.0.0
2424

2525
# kokoro is for KokoroEngine
26-
kokoro==0.7.3
26+
kokoro==0.7.16
2727

2828
# fugashi is to support japanese language for KokoroEngine
2929
fugashi==1.4.0

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
current_version = "0.4.47"
1+
current_version = "0.4.48"
22

33
import setuptools
44

0 commit comments

Comments
 (0)