19
19
EndFrame ,
20
20
ErrorFrame ,
21
21
Frame ,
22
+ LLMFullResponseEndFrame ,
22
23
StartFrame ,
23
24
StartInterruptionFrame ,
24
25
TTSAudioRawFrame ,
@@ -509,6 +510,9 @@ def __init__(
509
510
self ._cumulative_time = 0
510
511
self ._started = False
511
512
513
+ # Store previous text for context within a turn
514
+ self ._previous_text = ""
515
+
512
516
def language_to_service_language(self, language: Language) -> Optional[str]:
    """Translate a pipecat ``Language`` into an ElevenLabs language code.

    Args:
        language: The pipecat language value to translate.

    Returns:
        The result of ``language_to_elevenlabs_language`` for ``language``.
    """
    # Pure delegation: the mapping itself lives in the module-level helper.
    service_language = language_to_elevenlabs_language(language)
    return service_language
@@ -526,16 +530,23 @@ async def start(self, frame: StartFrame):
526
530
self ._output_format = output_format_from_sample_rate (self .sample_rate )
527
531
self ._cumulative_time = 0
528
532
self ._started = False
533
+ self ._previous_text = ""
529
534
530
535
async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
    """Push a frame through the parent class, then reset per-turn state.

    The frame is always forwarded first. Afterwards:

    * On ``StartInterruptionFrame`` or ``TTSStoppedFrame`` the timing state
      (``_started``, ``_cumulative_time``) and the accumulated
      ``_previous_text`` context are cleared. For ``TTSStoppedFrame`` the
      two marker word timestamps are queued as well.
    * On ``LLMFullResponseEndFrame`` only ``_previous_text`` is cleared.

    Args:
        frame: The frame being pushed.
        direction: Direction in which the frame is pushed.
    """
    await super().push_frame(frame, direction)

    tts_stopped = isinstance(frame, TTSStoppedFrame)
    if tts_stopped or isinstance(frame, StartInterruptionFrame):
        # Interruption or stop: drop timing and previous-text context.
        self._started = False
        self._cumulative_time = 0
        self._previous_text = ""

        if tts_stopped:
            markers = [("LLMFullResponseEndFrame", 0), ("Reset", 0)]
            await self.add_word_timestamps(markers)
        return

    if isinstance(frame, LLMFullResponseEndFrame):
        # End of turn: the next turn must not inherit this turn's text.
        self._previous_text = ""
549
+
539
550
def calculate_word_times (self , alignment_info : Mapping [str , Any ]) -> List [Tuple [str , float ]]:
540
551
"""Calculate word timing from character alignment data.
541
552
@@ -598,6 +609,10 @@ def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[
598
609
async def run_tts (self , text : str ) -> AsyncGenerator [Frame , None ]:
599
610
"""Generate speech from text using ElevenLabs streaming API with timestamps.
600
611
612
+ Makes a request to the ElevenLabs API to generate audio and timing data.
613
+ Tracks the duration of each utterance to ensure correct sequencing.
614
+ Includes previous text as context for better prosody continuity.
615
+
601
616
Args:
602
617
text: Text to convert to speech
603
618
@@ -614,6 +629,11 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
614
629
"model_id" : self ._model_name ,
615
630
}
616
631
632
+ # Include previous text as context if available
633
+ if self ._previous_text :
634
+ payload ["previous_text" ] = self ._previous_text
635
+ print (f"Previous text: { self ._previous_text } " )
636
+
617
637
if self ._voice_settings :
618
638
payload ["voice_settings" ] = self ._voice_settings
619
639
@@ -702,6 +722,13 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
702
722
if utterance_duration > 0 :
703
723
self ._cumulative_time += utterance_duration
704
724
725
+ # Append the current text to previous_text for context continuity
726
+ # Only add a space if there's already text
727
+ if self ._previous_text :
728
+ self ._previous_text += " " + text
729
+ else :
730
+ self ._previous_text = text
731
+
705
732
except Exception as e :
706
733
logger .error (f"Error in run_tts: { e } " )
707
734
yield ErrorFrame (error = str (e ))
0 commit comments