|
13 | 13 | from dotenv import load_dotenv
|
14 | 14 | from openai import OpenAI, OpenAIError
|
15 | 15 | from faster_whisper import WhisperModel
|
16 |
| -from TTS.tts.configs.xtts_config import XttsConfig |
17 |
| -from TTS.tts.models.xtts import Xtts |
| 16 | +from TTS.api import TTS |
18 | 17 | import soundfile as sf
|
19 | 18 | from textblob import TextBlob
|
20 | 19 | from pathlib import Path
|
|
41 | 40 | ELEVENLABS_API_KEY = os.getenv('ELEVENLABS_API_KEY')
|
42 | 41 | ELEVENLABS_TTS_VOICE = os.getenv('ELEVENLABS_TTS_VOICE')
|
43 | 42 | XTTS_SPEED = os.getenv('XTTS_SPEED', '1.1')
|
| 43 | +os.environ["COQUI_TOS_AGREED"] = "1" |
44 | 44 |
|
45 | 45 | # Initialize OpenAI API key
|
46 | 46 | OpenAI.api_key = OPENAI_API_KEY
|
|
81 | 81 | character_prompt_file = os.path.join(characters_folder, f"{CHARACTER_NAME}.txt")
|
82 | 82 | character_audio_file = os.path.join(characters_folder, f"{CHARACTER_NAME}.wav")
|
83 | 83 |
|
84 |
| -# Load XTTS configuration |
85 |
| -xtts_config_path = os.path.join(project_dir, "XTTS-v2", "config.json") |
86 |
| -xtts_checkpoint_dir = os.path.join(project_dir, "XTTS-v2") |
87 |
| -xtts_available = os.path.exists(xtts_config_path) and os.path.exists(xtts_checkpoint_dir) |
88 |
| - |
89 |
| -xtts_config = XttsConfig() |
90 |
| - |
91 |
| -# Initialize XTTS model |
92 |
| -xtts_model = None |
| 84 | +# Initialize TTS model |
| 85 | +tts = None |
93 | 86 | if TTS_PROVIDER == 'xtts':
|
94 |
| - if xtts_available: |
95 |
| - xtts_config = XttsConfig() |
96 |
| - xtts_config.load_json(xtts_config_path) |
97 |
| - xtts_model = Xtts.init_from_config(xtts_config) |
98 |
| - xtts_model.load_checkpoint(xtts_config, checkpoint_dir=xtts_checkpoint_dir, eval=True) |
99 |
| - xtts_model.cuda() # Move to GPU if available |
100 |
| - else: |
101 |
| - print("XTTS files not found. Please download the XTTS-v2 checkpoints to use local TTS.") |
102 |
| - TTS_PROVIDER = 'openai' # Fallback to a default provider |
| 87 | + print("Initializing XTTS model (may download on first run)...") |
| 88 | + try: |
| 89 | + tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) |
| 90 | + print("XTTS model loaded successfully.") |
| 91 | + except Exception as e: |
| 92 | + print(f"Failed to load XTTS model: {e}") |
| 93 | + TTS_PROVIDER = 'openai' # Fallback to OpenAI |
103 | 94 | print("Switched to default TTS provider: openai")
|
104 | 95 |
|
105 | 96 | # Function to display ElevenLabs quota
|
@@ -187,26 +178,23 @@ def process_and_play(prompt, audio_file_pth):
|
187 | 178 | play_audio(temp_wav_path)
|
188 | 179 | else:
|
189 | 180 | print("Error: Audio file not found.")
|
190 |
| - else: |
191 |
| - tts_model = xtts_model |
192 |
| - try: |
193 |
| - outputs = tts_model.synthesize( |
194 |
| - prompt, |
195 |
| - xtts_config, |
196 |
| - speaker_wav=audio_file_pth, |
197 |
| - gpt_cond_len=24, |
198 |
| - temperature=0.2, |
199 |
| - language='en', |
200 |
| - speed=float(XTTS_SPEED) |
201 |
| - ) |
202 |
| - synthesized_audio = outputs['wav'] |
203 |
| - src_path = os.path.join(output_dir, 'output.wav') |
204 |
| - sample_rate = xtts_config.audio.sample_rate |
205 |
| - sf.write(src_path, synthesized_audio, sample_rate) |
206 |
| - print("Audio generated successfully with XTTS.") |
207 |
| - play_audio(src_path) |
208 |
| - except Exception as e: |
209 |
| - print(f"Error during XTTS audio generation: {e}") |
| 181 | + elif TTS_PROVIDER == 'xtts': |
| 182 | + if tts is not None: |
| 183 | + try: |
| 184 | + wav = tts.tts( |
| 185 | + text=prompt, |
| 186 | + speaker_wav=audio_file_pth, # For voice cloning |
| 187 | + language="en", |
| 188 | + speed=float(XTTS_SPEED) |
| 189 | + ) |
| 190 | + src_path = os.path.join(output_dir, 'output.wav') |
| 191 | + sf.write(src_path, wav, tts.synthesizer.tts_config.audio["sample_rate"]) |
| 192 | + print("Audio generated successfully with XTTS.") |
| 193 | + play_audio(src_path) |
| 194 | + except Exception as e: |
| 195 | + print(f"Error during XTTS audio generation: {e}") |
| 196 | + else: |
| 197 | + print("XTTS model is not loaded. Please ensure initialization succeeded.") |
210 | 198 |
|
211 | 199 | def save_pcm_as_wav(pcm_data: bytes, file_path: str, sample_rate: int = 24000, channels: int = 1, sample_width: int = 2):
|
212 | 200 | """ Saves PCM data as a WAV file. """
|
@@ -706,24 +694,21 @@ def generate_speech(text, temp_audio_path):
|
706 | 694 | print(f"Failed to generate speech: {response.status_code} - {response.text}")
|
707 | 695 | elif TTS_PROVIDER == 'elevenlabs':
|
708 | 696 | elevenlabs_text_to_speech(text, temp_audio_path)
|
709 |
| - else: |
710 |
| - tts_model = xtts_model |
711 |
| - try: |
712 |
| - outputs = tts_model.synthesize( |
713 |
| - text, |
714 |
| - xtts_config, |
715 |
| - speaker_wav=character_audio_file, |
716 |
| - gpt_cond_len=24, |
717 |
| - temperature=0.2, |
718 |
| - language='en', |
719 |
| - speed=float(XTTS_SPEED) |
720 |
| - ) |
721 |
| - synthesized_audio = outputs['wav'] |
722 |
| - sample_rate = xtts_config.audio.sample_rate |
723 |
| - sf.write(temp_audio_path, synthesized_audio, sample_rate) |
724 |
| - print("Audio generated successfully with XTTS.") |
725 |
| - except Exception as e: |
726 |
| - print(f"Error during XTTS audio generation: {e}") |
| 697 | + else: # XTTS |
| 698 | + if tts is not None: |
| 699 | + try: |
| 700 | + wav = tts.tts( |
| 701 | + text=text, |
| 702 | + speaker_wav=character_audio_file, |
| 703 | + language="en", |
| 704 | + speed=float(XTTS_SPEED) |
| 705 | + ) |
| 706 | + sf.write(temp_audio_path, wav, tts.synthesizer.tts_config.audio["sample_rate"]) |
| 707 | + print("Audio generated successfully with XTTS.") |
| 708 | + except Exception as e: |
| 709 | + print(f"Error during XTTS audio generation: {e}") |
| 710 | + else: |
| 711 | + print("XTTS model is not loaded.") |
727 | 712 |
|
728 | 713 | def user_chatbot_conversation():
|
729 | 714 | conversation_history = []
|
|
0 commit comments