Skip to content

Commit 7ff748d

Browse files
authored
Merge pull request #1600 from pipecat-ai/mb/11labs-previous-text
Add previous_text context to ElevenLabsHttpTTSService
2 parents 384f809 + 2dafbee commit 7ff748d

File tree

2 files changed

+36
-4
lines changed

2 files changed

+36
-4
lines changed

CHANGELOG.md

+3
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1616
you to control aggregator settings. You can now pass these arguments when
1717
creating aggregator pairs with `create_context_aggregator()`.
1818

19+
- Added `previous_text` context support to `ElevenLabsHttpTTSService`, improving
20+
speech consistency across sentences within an LLM response.
21+
1922
- Added word/timestamp pairs to `ElevenLabsHttpTTSService`.
2023

2124
- It is now possible to disable `SoundfileMixer` when created. You can then use

src/pipecat/services/elevenlabs/tts.py

+33-4
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
EndFrame,
1919
ErrorFrame,
2020
Frame,
21+
LLMFullResponseEndFrame,
2122
StartFrame,
2223
StartInterruptionFrame,
2324
TTSAudioRawFrame,
@@ -508,6 +509,9 @@ def __init__(
508509
self._cumulative_time = 0
509510
self._started = False
510511

512+
# Store previous text for context within a turn
513+
self._previous_text = ""
514+
511515
def language_to_service_language(self, language: Language) -> Optional[str]:
512516
"""Convert pipecat Language to ElevenLabs language code."""
513517
return language_to_elevenlabs_language(language)
@@ -519,22 +523,32 @@ def can_generate_metrics(self) -> bool:
519523
def _set_voice_settings(self):
520524
return build_elevenlabs_voice_settings(self._settings)
521525

526+
def _reset_state(self):
527+
"""Reset internal state variables."""
528+
self._cumulative_time = 0
529+
self._started = False
530+
self._previous_text = ""
531+
logger.debug(f"{self}: Reset internal state")
532+
522533
async def start(self, frame: StartFrame):
523534
"""Initialize the service upon receiving a StartFrame."""
524535
await super().start(frame)
525536
self._output_format = output_format_from_sample_rate(self.sample_rate)
526-
self._cumulative_time = 0
527-
self._started = False
537+
self._reset_state()
528538

529539
async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
530540
await super().push_frame(frame, direction)
531541
if isinstance(frame, (StartInterruptionFrame, TTSStoppedFrame)):
532542
# Reset timing and previous-text context on interruption or stop
533-
self._started = False
534-
self._cumulative_time = 0
543+
self._reset_state()
544+
535545
if isinstance(frame, TTSStoppedFrame):
536546
await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)])
537547

548+
elif isinstance(frame, LLMFullResponseEndFrame):
549+
# End of turn - reset previous text
550+
self._previous_text = ""
551+
538552
def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[str, float]]:
539553
"""Calculate word timing from character alignment data.
540554
@@ -597,6 +611,10 @@ def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[
597611
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
598612
"""Generate speech from text using ElevenLabs streaming API with timestamps.
599613
614+
Makes a request to the ElevenLabs API to generate audio and timing data.
615+
Tracks the duration of each utterance to ensure correct sequencing.
616+
Includes previous text as context for better prosody continuity.
617+
600618
Args:
601619
text: Text to convert to speech
602620
@@ -613,6 +631,10 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
613631
"model_id": self._model_name,
614632
}
615633

634+
# Include previous text as context if available
635+
if self._previous_text:
636+
payload["previous_text"] = self._previous_text
637+
616638
if self._voice_settings:
617639
payload["voice_settings"] = self._voice_settings
618640

@@ -701,6 +723,13 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
701723
if utterance_duration > 0:
702724
self._cumulative_time += utterance_duration
703725

726+
# Append the current text to previous_text for context continuity
727+
# Only add a space if there's already text
728+
if self._previous_text:
729+
self._previous_text += " " + text
730+
else:
731+
self._previous_text = text
732+
704733
except Exception as e:
705734
logger.error(f"Error in run_tts: {e}")
706735
yield ErrorFrame(error=str(e))

0 commit comments

Comments
 (0)