12
12
import aiohttp
13
13
from loguru import logger
14
14
from pydantic import BaseModel , model_validator
15
- from sentry_sdk import push_scope
16
15
17
16
from pipecat .frames .frames import (
18
17
CancelFrame ,
19
18
EndFrame ,
20
19
ErrorFrame ,
21
20
Frame ,
21
+ LLMFullResponseEndFrame ,
22
22
StartFrame ,
23
23
StartInterruptionFrame ,
24
24
TTSAudioRawFrame ,
@@ -509,6 +509,9 @@ def __init__(
509
509
self ._cumulative_time = 0
510
510
self ._started = False
511
511
512
+ # Store previous text for context within a turn
513
+ self ._previous_text = ""
514
+
512
515
def language_to_service_language(self, language: Language) -> Optional[str]:
    """Translate a pipecat ``Language`` into the language code used by ElevenLabs.

    Args:
        language: The pipecat language value to translate.

    Returns:
        The result of ``language_to_elevenlabs_language`` for ``language``
        (annotated as an optional string).
    """
    service_language = language_to_elevenlabs_language(language)
    return service_language
@@ -526,16 +529,23 @@ async def start(self, frame: StartFrame):
526
529
self ._output_format = output_format_from_sample_rate (self .sample_rate )
527
530
self ._cumulative_time = 0
528
531
self ._started = False
532
+ self ._previous_text = ""
529
533
530
534
async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
    """Push a frame through the parent class, then clear per-utterance state.

    After the frame has been forwarded:

    * On ``StartInterruptionFrame`` or ``TTSStoppedFrame`` the start flag,
      the cumulative time and the stored previous text are all reset. For
      ``TTSStoppedFrame`` the ``LLMFullResponseEndFrame`` / ``Reset`` marker
      entries are additionally queued via ``add_word_timestamps``.
    * On ``LLMFullResponseEndFrame`` only the stored previous text is
      cleared, since the turn is over.

    Args:
        frame: The frame to forward.
        direction: Direction to push the frame in; defaults to downstream.
    """
    await super().push_frame(frame, direction)

    ends_speech = isinstance(frame, (StartInterruptionFrame, TTSStoppedFrame))
    if not ends_speech:
        # End of turn - drop the context accumulated from earlier utterances.
        if isinstance(frame, LLMFullResponseEndFrame):
            self._previous_text = ""
        return

    # Interruption or stop: timing and context both start over.
    self._previous_text = ""
    self._cumulative_time = 0
    self._started = False

    if isinstance(frame, TTSStoppedFrame):
        end_markers = [("LLMFullResponseEndFrame", 0), ("Reset", 0)]
        await self.add_word_timestamps(end_markers)
548
+
539
549
def calculate_word_times (self , alignment_info : Mapping [str , Any ]) -> List [Tuple [str , float ]]:
540
550
"""Calculate word timing from character alignment data.
541
551
@@ -598,6 +608,10 @@ def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[
598
608
async def run_tts (self , text : str ) -> AsyncGenerator [Frame , None ]:
599
609
"""Generate speech from text using ElevenLabs streaming API with timestamps.
600
610
611
+ Makes a request to the ElevenLabs API to generate audio and timing data.
612
+ Tracks the duration of each utterance to ensure correct sequencing.
613
+ Includes previous text as context for better prosody continuity.
614
+
601
615
Args:
602
616
text: Text to convert to speech
603
617
@@ -614,6 +628,11 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
614
628
"model_id" : self ._model_name ,
615
629
}
616
630
631
+ # Include previous text as context if available
632
+ if self ._previous_text :
633
+ payload ["previous_text" ] = self ._previous_text
634
+ print (f"Previous text: { self ._previous_text } " )
635
+
617
636
if self ._voice_settings :
618
637
payload ["voice_settings" ] = self ._voice_settings
619
638
@@ -702,6 +721,13 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
702
721
if utterance_duration > 0 :
703
722
self ._cumulative_time += utterance_duration
704
723
724
+ # Append the current text to previous_text for context continuity
725
+ # Only add a space if there's already text
726
+ if self ._previous_text :
727
+ self ._previous_text += " " + text
728
+ else :
729
+ self ._previous_text = text
730
+
705
731
except Exception as e :
706
732
logger .error (f"Error in run_tts: { e } " )
707
733
yield ErrorFrame (error = str (e ))
0 commit comments