Skip to content

Commit 4b21dab

Browse files
committed
Add previous_text context to ElevenLabsHttpTTSService
1 parent 3bb1ff8 commit 4b21dab

File tree

2 files changed

+30
-0
lines changed

2 files changed

+30
-0
lines changed

CHANGELOG.md

+3
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
99

1010
### Added
1111

12+
- Added `previous_text` context support to `ElevenLabsHttpTTSService`, improving
13+
speech consistency across sentences within an LLM response.
14+
1215
- Added word/timestamp pairs to `ElevenLabsHttpTTSService`.
1316

1417
- It is now possible to disable `SoundfileMixer` when created. You can then use

src/pipecat/services/elevenlabs/tts.py

+27
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
EndFrame,
2020
ErrorFrame,
2121
Frame,
22+
LLMFullResponseEndFrame,
2223
StartFrame,
2324
StartInterruptionFrame,
2425
TTSAudioRawFrame,
@@ -509,6 +510,9 @@ def __init__(
509510
self._cumulative_time = 0
510511
self._started = False
511512

513+
# Store previous text for context within a turn
514+
self._previous_text = ""
515+
512516
def language_to_service_language(self, language: Language) -> Optional[str]:
513517
"""Convert pipecat Language to ElevenLabs language code."""
514518
return language_to_elevenlabs_language(language)
@@ -526,16 +530,23 @@ async def start(self, frame: StartFrame):
526530
self._output_format = output_format_from_sample_rate(self.sample_rate)
527531
self._cumulative_time = 0
528532
self._started = False
533+
self._previous_text = ""
529534

530535
async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirection.DOWNSTREAM):
531536
await super().push_frame(frame, direction)
532537
if isinstance(frame, (StartInterruptionFrame, TTSStoppedFrame)):
533538
# Reset timing on interruption or stop
534539
self._started = False
535540
self._cumulative_time = 0
541+
self._previous_text = ""
542+
536543
if isinstance(frame, TTSStoppedFrame):
537544
await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)])
538545

546+
elif isinstance(frame, LLMFullResponseEndFrame):
547+
# End of turn - reset previous text
548+
self._previous_text = ""
549+
539550
def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[str, float]]:
540551
"""Calculate word timing from character alignment data.
541552
@@ -598,6 +609,10 @@ def calculate_word_times(self, alignment_info: Mapping[str, Any]) -> List[Tuple[
598609
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
599610
"""Generate speech from text using ElevenLabs streaming API with timestamps.
600611
612+
Makes a request to the ElevenLabs API to generate audio and timing data.
613+
Tracks the duration of each utterance to ensure correct sequencing.
614+
Includes previous text as context for better prosody continuity.
615+
601616
Args:
602617
text: Text to convert to speech
603618
@@ -614,6 +629,11 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
614629
"model_id": self._model_name,
615630
}
616631

632+
# Include previous text as context if available
633+
if self._previous_text:
634+
payload["previous_text"] = self._previous_text
635+
print(f"Previous text: {self._previous_text}")
636+
617637
if self._voice_settings:
618638
payload["voice_settings"] = self._voice_settings
619639

@@ -702,6 +722,13 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
702722
if utterance_duration > 0:
703723
self._cumulative_time += utterance_duration
704724

725+
# Append the current text to previous_text for context continuity
726+
# Only add a space if there's already text
727+
if self._previous_text:
728+
self._previous_text += " " + text
729+
else:
730+
self._previous_text = text
731+
705732
except Exception as e:
706733
logger.error(f"Error in run_tts: {e}")
707734
yield ErrorFrame(error=str(e))

0 commit comments

Comments
 (0)