Skip to content

Commit 9d37e35

Browse files
committed
update cli.py version for new TTS
1 parent ddfcef7 commit 9d37e35

File tree

1 file changed

+43
-58
lines changed

1 file changed

+43
-58
lines changed

cli.py

+43 -58
Original file line number | Diff line number | Diff line change
@@ -13,8 +13,7 @@
1313
from dotenv import load_dotenv
1414
from openai import OpenAI, OpenAIError
1515
from faster_whisper import WhisperModel
16-
from TTS.tts.configs.xtts_config import XttsConfig
17-
from TTS.tts.models.xtts import Xtts
16+
from TTS.api import TTS
1817
import soundfile as sf
1918
from textblob import TextBlob
2019
from pathlib import Path
@@ -41,6 +40,7 @@
4140
ELEVENLABS_API_KEY = os.getenv('ELEVENLABS_API_KEY')
4241
ELEVENLABS_TTS_VOICE = os.getenv('ELEVENLABS_TTS_VOICE')
4342
XTTS_SPEED = os.getenv('XTTS_SPEED', '1.1')
43+
os.environ["COQUI_TOS_AGREED"] = "1"
4444

4545
# Initialize OpenAI API key
4646
OpenAI.api_key = OPENAI_API_KEY
@@ -81,25 +81,16 @@
8181
character_prompt_file = os.path.join(characters_folder, f"{CHARACTER_NAME}.txt")
8282
character_audio_file = os.path.join(characters_folder, f"{CHARACTER_NAME}.wav")
8383

84-
# Load XTTS configuration
85-
xtts_config_path = os.path.join(project_dir, "XTTS-v2", "config.json")
86-
xtts_checkpoint_dir = os.path.join(project_dir, "XTTS-v2")
87-
xtts_available = os.path.exists(xtts_config_path) and os.path.exists(xtts_checkpoint_dir)
88-
89-
xtts_config = XttsConfig()
90-
91-
# Initialize XTTS model
92-
xtts_model = None
84+
# Initialize TTS model
85+
tts = None
9386
if TTS_PROVIDER == 'xtts':
94-
if xtts_available:
95-
xtts_config = XttsConfig()
96-
xtts_config.load_json(xtts_config_path)
97-
xtts_model = Xtts.init_from_config(xtts_config)
98-
xtts_model.load_checkpoint(xtts_config, checkpoint_dir=xtts_checkpoint_dir, eval=True)
99-
xtts_model.cuda() # Move to GPU if available
100-
else:
101-
print("XTTS files not found. Please download the XTTS-v2 checkpoints to use local TTS.")
102-
TTS_PROVIDER = 'openai' # Fallback to a default provider
87+
print("Initializing XTTS model (may download on first run)...")
88+
try:
89+
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
90+
print("XTTS model loaded successfully.")
91+
except Exception as e:
92+
print(f"Failed to load XTTS model: {e}")
93+
TTS_PROVIDER = 'openai' # Fallback to OpenAI
10394
print("Switched to default TTS provider: openai")
10495

10596
# Function to display ElevenLabs quota
@@ -187,26 +178,23 @@ def process_and_play(prompt, audio_file_pth):
187178
play_audio(temp_wav_path)
188179
else:
189180
print("Error: Audio file not found.")
190-
else:
191-
tts_model = xtts_model
192-
try:
193-
outputs = tts_model.synthesize(
194-
prompt,
195-
xtts_config,
196-
speaker_wav=audio_file_pth,
197-
gpt_cond_len=24,
198-
temperature=0.2,
199-
language='en',
200-
speed=float(XTTS_SPEED)
201-
)
202-
synthesized_audio = outputs['wav']
203-
src_path = os.path.join(output_dir, 'output.wav')
204-
sample_rate = xtts_config.audio.sample_rate
205-
sf.write(src_path, synthesized_audio, sample_rate)
206-
print("Audio generated successfully with XTTS.")
207-
play_audio(src_path)
208-
except Exception as e:
209-
print(f"Error during XTTS audio generation: {e}")
181+
elif TTS_PROVIDER == 'xtts':
182+
if tts is not None:
183+
try:
184+
wav = tts.tts(
185+
text=prompt,
186+
speaker_wav=audio_file_pth, # For voice cloning
187+
language="en",
188+
speed=float(XTTS_SPEED)
189+
)
190+
src_path = os.path.join(output_dir, 'output.wav')
191+
sf.write(src_path, wav, tts.synthesizer.tts_config.audio["sample_rate"])
192+
print("Audio generated successfully with XTTS.")
193+
play_audio(src_path)
194+
except Exception as e:
195+
print(f"Error during XTTS audio generation: {e}")
196+
else:
197+
print("XTTS model is not loaded. Please ensure initialization succeeded.")
210198

211199
def save_pcm_as_wav(pcm_data: bytes, file_path: str, sample_rate: int = 24000, channels: int = 1, sample_width: int = 2):
212200
""" Saves PCM data as a WAV file. """
@@ -706,24 +694,21 @@ def generate_speech(text, temp_audio_path):
706694
print(f"Failed to generate speech: {response.status_code} - {response.text}")
707695
elif TTS_PROVIDER == 'elevenlabs':
708696
elevenlabs_text_to_speech(text, temp_audio_path)
709-
else:
710-
tts_model = xtts_model
711-
try:
712-
outputs = tts_model.synthesize(
713-
text,
714-
xtts_config,
715-
speaker_wav=character_audio_file,
716-
gpt_cond_len=24,
717-
temperature=0.2,
718-
language='en',
719-
speed=float(XTTS_SPEED)
720-
)
721-
synthesized_audio = outputs['wav']
722-
sample_rate = xtts_config.audio.sample_rate
723-
sf.write(temp_audio_path, synthesized_audio, sample_rate)
724-
print("Audio generated successfully with XTTS.")
725-
except Exception as e:
726-
print(f"Error during XTTS audio generation: {e}")
697+
else: # XTTS
698+
if tts is not None:
699+
try:
700+
wav = tts.tts(
701+
text=text,
702+
speaker_wav=character_audio_file,
703+
language="en",
704+
speed=float(XTTS_SPEED)
705+
)
706+
sf.write(temp_audio_path, wav, tts.synthesizer.tts_config.audio["sample_rate"])
707+
print("Audio generated successfully with XTTS.")
708+
except Exception as e:
709+
print(f"Error during XTTS audio generation: {e}")
710+
else:
711+
print("XTTS model is not loaded.")
727712

728713
def user_chatbot_conversation():
729714
conversation_history = []

0 commit comments

Comments
 (0)