Merge pull request #1600 from pipecat-ai/mb/11labs-previous-text

markbackman · web-flow · commit 7ff748defdb5 · 2025-04-16T22:33:38.000-04:00
Add previous_text context to ElevenLabsHttpTTSService
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   you to control aggregator settings. You can now pass these arguments when
   creating aggregator pairs with `create_context_aggregator()`.
 
+- Added `previous_text` context support to `ElevenLabsHttpTTSService`,
+  improving speech consistency across sentences within an LLM response.
+
 - Added word/timestamp pairs to `ElevenLabsHttpTTSService`.
 
 - It is now possible to disable `SoundfileMixer` when created. You can then use
diff --git a/src/pipecat/services/elevenlabs/tts.py b/src/pipecat/services/elevenlabs/tts.py
@@ -18,6 +18,7 @@
     EndFrame,
     ErrorFrame,
     Frame,
+    LLMFullResponseEndFrame,
     StartFrame,
     StartInterruptionFrame,
     TTSAudioRawFrame,
@@ -508,6 +509,9 @@ def __init__(
         self._cumulative_time = 0
         self._started = False
 
+        # Store previous text for context within a turn
+        self._previous_text = ""
+
     def language_to_service_language(self, language: Language) -> Optional[str]:
         """Convert pipecat Language to ElevenLabs language code."""
         return language_to_elevenlabs_language(language)
@@ -519,22 +523,32 @@ def can_generate_metrics(self) -> bool:
     def _set_voice_settings(self):
         return build_elevenlabs_voice_settings(self._settings)
 
+    def _reset_state(self):
+        """Reset cumulative time, the started flag, and the previous-text context."""
+        self._cumulative_time = 0
+        self._started = False
+        self._previous_text = ""
+        logger.debug(f"{self}: Reset internal state")
+
     async def start(self, frame: StartFrame):
         """Initialize the service upon receiving a StartFrame."""
         await super().start(frame)
         self._output_format = output_format_from_sample_rate(self.sample_rate)
-        self._cumulative_time = 0
-        self._started = False
+        self._reset_state()
 
     async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
         await super().push_frame(frame, direction)
         if isinstance(frame, (StartInterruptionFrame, TTSStoppedFrame)):
             # Reset timing on interruption or stop
-            self._started = False
-            self._cumulative_time = 0
+            self._reset_state()
+
             if isinstance(frame, TTSStoppedFrame):
                 await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)])
 
+        elif isinstance(frame, LLMFullResponseEndFrame):
+            # End of turn - reset previous text
+            self._previous_text = ""
+
     def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[str, float]]:
         """Calculate word timing from character alignment data.
 
@@ -597,6 +611,10 @@ def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         """Generate speech from text using ElevenLabs streaming API with timestamps.
 
+        Makes a request to the ElevenLabs API to generate audio and timing data.
+        Tracks the duration of each utterance to ensure correct sequencing.
+        Includes previous text as context for better prosody continuity.
+
         Args:
             text: Text to convert to speech
 
@@ -613,6 +631,10 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
             "model_id": self._model_name,
         }
 
+        # Include previous text as context if available
+        if self._previous_text:
+            payload["previous_text"] = self._previous_text
+
         if self._voice_settings:
             payload["voice_settings"] = self._voice_settings
 
@@ -701,6 +723,13 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
                 if utterance_duration > 0:
                     self._cumulative_time += utterance_duration
 
+                # Append the current text to previous_text for context continuity
+                # Only add a space if there's already text
+                if self._previous_text:
+                    self._previous_text += " " + text
+                else:
+                    self._previous_text = text
+
         except Exception as e:
             logger.error(f"Error in run_tts: {e}")
             yield ErrorFrame(error=str(e))