18
18
EndFrame ,
19
19
ErrorFrame ,
20
20
Frame ,
21
+ LLMFullResponseEndFrame ,
21
22
StartFrame ,
22
23
StartInterruptionFrame ,
23
24
TTSAudioRawFrame ,
@@ -508,6 +509,9 @@ def __init__(
508
509
self ._cumulative_time = 0
509
510
self ._started = False
510
511
512
+ # Store previous text for context within a turn
513
+ self ._previous_text = ""
514
+
511
515
def language_to_service_language (self , language : Language ) -> Optional [str ]:
512
516
"""Convert pipecat Language to ElevenLabs language code."""
513
517
return language_to_elevenlabs_language (language )
@@ -519,22 +523,32 @@ def can_generate_metrics(self) -> bool:
519
523
def _set_voice_settings (self ):
520
524
return build_elevenlabs_voice_settings (self ._settings )
521
525
526
+ def _reset_state (self ):
527
+ """Reset internal state variables."""
528
+ self ._cumulative_time = 0
529
+ self ._started = False
530
+ self ._previous_text = ""
531
+ logger .debug (f"{ self } : Reset internal state" )
532
+
522
533
async def start (self , frame : StartFrame ):
523
534
"""Initialize the service upon receiving a StartFrame."""
524
535
await super ().start (frame )
525
536
self ._output_format = output_format_from_sample_rate (self .sample_rate )
526
- self ._cumulative_time = 0
527
- self ._started = False
537
+ self ._reset_state ()
528
538
529
539
async def push_frame (self , frame : Frame , direction : FrameDirection = FrameDirection .DOWNSTREAM ):
530
540
await super ().push_frame (frame , direction )
531
541
if isinstance (frame , (StartInterruptionFrame , TTSStoppedFrame )):
532
542
# Reset timing on interruption or stop
533
- self ._started = False
534
- self . _cumulative_time = 0
543
+ self ._reset_state ()
544
+
535
545
if isinstance (frame , TTSStoppedFrame ):
536
546
await self .add_word_timestamps ([("LLMFullResponseEndFrame" , 0 ), ("Reset" , 0 )])
537
547
548
+ elif isinstance (frame , LLMFullResponseEndFrame ):
549
+ # End of turn - reset previous text
550
+ self ._previous_text = ""
551
+
538
552
def calculate_word_times (self , alignment_info : Mapping [str , Any ]) -> List [Tuple [str , float ]]:
539
553
"""Calculate word timing from character alignment data.
540
554
@@ -597,6 +611,10 @@ def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[
597
611
async def run_tts (self , text : str ) -> AsyncGenerator [Frame , None ]:
598
612
"""Generate speech from text using ElevenLabs streaming API with timestamps.
599
613
614
+ Makes a request to the ElevenLabs API to generate audio and timing data.
615
+ Tracks the duration of each utterance to ensure correct sequencing.
616
+ Includes previous text as context for better prosody continuity.
617
+
600
618
Args:
601
619
text: Text to convert to speech
602
620
@@ -613,6 +631,10 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
613
631
"model_id" : self ._model_name ,
614
632
}
615
633
634
+ # Include previous text as context if available
635
+ if self ._previous_text :
636
+ payload ["previous_text" ] = self ._previous_text
637
+
616
638
if self ._voice_settings :
617
639
payload ["voice_settings" ] = self ._voice_settings
618
640
@@ -701,6 +723,13 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
701
723
if utterance_duration > 0 :
702
724
self ._cumulative_time += utterance_duration
703
725
726
+ # Append the current text to previous_text for context continuity
727
+ # Only add a space if there's already text
728
+ if self ._previous_text :
729
+ self ._previous_text += " " + text
730
+ else :
731
+ self ._previous_text = text
732
+
704
733
except Exception as e :
705
734
logger .error (f"Error in run_tts: { e } " )
706
735
yield ErrorFrame (error = str (e ))
0 commit comments