Add previous_text context to ElevenLabsHttpTTSService

markbackman · markbackman · commit 4b21dab80148 · 2025-04-15T22:39:36.000-04:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Added `previous_text` context support to `ElevenLabsHttpTTSService`,
+  improving speech consistency across sentences within an LLM response.
+
 - Added word/timestamp pairs to `ElevenLabsHttpTTSService`.
 
 - It is now possible to disable `SoundfileMixer` when created. You can then use
diff --git a/src/pipecat/services/elevenlabs/tts.py b/src/pipecat/services/elevenlabs/tts.py
@@ -19,6 +19,7 @@
     EndFrame,
     ErrorFrame,
     Frame,
+    LLMFullResponseEndFrame,
     StartFrame,
     StartInterruptionFrame,
     TTSAudioRawFrame,
@@ -509,6 +510,9 @@ def __init__(
         self._cumulative_time = 0
         self._started = False
 
+        # Store previous text for context within a turn
+        self._previous_text = ""
+
     def language_to_service_language(self, language: Language) -> Optional[str]:
         """Convert pipecat Language to ElevenLabs language code."""
         return language_to_elevenlabs_language(language)
@@ -526,16 +530,23 @@ async def start(self, frame: StartFrame):
         self._output_format = output_format_from_sample_rate(self.sample_rate)
         self._cumulative_time = 0
         self._started = False
+        self._previous_text = ""
 
     async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
         await super().push_frame(frame, direction)
         if isinstance(frame, (StartInterruptionFrame, TTSStoppedFrame)):
             # Reset timing on interruption or stop
             self._started = False
             self._cumulative_time = 0
+            self._previous_text = ""
+
             if isinstance(frame, TTSStoppedFrame):
                 await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)])
 
+        elif isinstance(frame, LLMFullResponseEndFrame):
+            # End of turn - reset previous text
+            self._previous_text = ""
+
     def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[str, float]]:
         """Calculate word timing from character alignment data.
 
@@ -598,6 +609,10 @@ def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         """Generate speech from text using ElevenLabs streaming API with timestamps.
 
+        Makes a request to the ElevenLabs API to generate audio and timing data.
+        Tracks the duration of each utterance to ensure correct sequencing.
+        Includes previous text as context for better prosody continuity.
+
         Args:
             text: Text to convert to speech
 
@@ -614,6 +629,11 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
             "model_id": self._model_name,
         }
 
+        # Include previous text as context if available
+        if self._previous_text:
+            payload["previous_text"] = self._previous_text
+            print(f"Previous text: {self._previous_text}")
+
         if self._voice_settings:
             payload["voice_settings"] = self._voice_settings
 
@@ -702,6 +722,13 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
                 if utterance_duration > 0:
                     self._cumulative_time += utterance_duration
 
+                # Append the current text to previous_text for context continuity
+                # Only add a space if there's already text
+                if self._previous_text:
+                    self._previous_text += " " + text
+                else:
+                    self._previous_text = text
+
         except Exception as e:
             logger.error(f"Error in run_tts: {e}")
             yield ErrorFrame(error=str(e))