Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/other/text-to-speech/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ livekit-plugins-openai>=0.12.2
livekit-plugins-cartesia>=0.4.11
livekit-plugins-elevenlabs>=0.8.1
livekit-plugins-speechify>=0.1.0
livekit-plugins-typecast>=0.1.0
python-dotenv~=1.0
131 changes: 131 additions & 0 deletions examples/other/text-to-speech/typecast_tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import asyncio
import logging

from dotenv import load_dotenv

from livekit import rtc
from livekit.agents import AutoSubscribe, JobContext, WorkerOptions, cli
from livekit.plugins import typecast

load_dotenv()

logger = logging.getLogger("typecast-tts-demo")
logger.setLevel(logging.INFO)


async def entrypoint(job: JobContext):
    """Demo agent showcasing the Typecast TTS plugin.

    Walks through voice listing, emotion presets, audio output tuning, and
    seeded (reproducible) synthesis, streaming every synthesized frame into
    the room over a published microphone track.
    """
    logger.info("starting Typecast TTS example agent")

    # Example 0: List available voices (optional)
    logger.info("Listing available Typecast voices...")

    # Create TTS instance with default voice
    # You can also specify a voice ID: typecast.TTS(voice="tc_your_voice_id")
    tts = typecast.TTS(language="eng")  # Uses DEFAULT_VOICE_ID

    try:
        voices = await tts.list_voices()
        logger.info(f"Found {len(voices)} available voices")
        # Show the first three voices as a sample of what is available.
        for idx, voice in enumerate(voices[:3], 1):
            logger.info(
                f" {idx}. {voice.name} ({voice.id}) - Emotions: {', '.join(voice.emotions[:3])}..."
            )
        # You can filter by model
        # voices_filtered = await tts.list_voices(model="ssfm-v21")
    except Exception as e:
        # Voice listing is optional for the demo, so failures are non-fatal.
        logger.warning(f"Could not list voices: {e}")

    # Optionally, select a specific voice from the list
    # For this demo, we'll use the default voice
    logger.info(f"Using default voice: {typecast.DEFAULT_VOICE_ID}")

    # Publish an audio track the synthesized frames will be written to.
    source = rtc.AudioSource(tts.sample_rate, tts.num_channels)
    track = rtc.LocalAudioTrack.create_audio_track("agent-mic", source)
    options = rtc.TrackPublishOptions()
    options.source = rtc.TrackSource.SOURCE_MICROPHONE

    await job.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_NONE)
    publication = await job.room.local_participant.publish_track(track, options)
    await publication.wait_for_subscription()

    async def speak(text: str) -> None:
        # Synthesize `text` and push each resulting audio frame into the room.
        async for output in tts.synthesize(text):
            await source.capture_frame(output.frame)

    # Example 1: Basic synthesis
    logger.info("Example 1: Basic synthesis")
    await speak("Hello! Welcome to Typecast text-to-speech demonstration.")
    await asyncio.sleep(1)

    # Example 2: Happy emotion
    logger.info("Example 2: Synthesizing with happy emotion")
    tts.update_options(
        prompt_options=typecast.PromptOptions(emotion_preset="happy", emotion_intensity=1.5)
    )
    await speak("This is great! I'm so excited to demonstrate emotional expression!")
    await asyncio.sleep(1)

    # Example 3: Sad emotion
    logger.info("Example 3: Synthesizing with sad emotion")
    tts.update_options(
        prompt_options=typecast.PromptOptions(emotion_preset="sad", emotion_intensity=1.2)
    )
    await speak("Sometimes things don't go as planned, and that's okay.")
    await asyncio.sleep(1)

    # Example 4: Back to normal with audio adjustments
    logger.info("Example 4: Normal emotion with audio adjustments")
    tts.update_options(
        prompt_options=typecast.PromptOptions(emotion_preset="normal", emotion_intensity=1.0),
        output_options=typecast.OutputOptions(
            volume=110,  # Louder volume
            audio_pitch=1,  # Higher pitch
            audio_tempo=1.1,  # Faster tempo
        ),
    )
    await speak("Now I'm speaking faster with a higher pitch!")
    await asyncio.sleep(1)

    # Example 5: Reproducible synthesis with seed
    logger.info("Example 5: Using seed for reproducible synthesis")
    tts.update_options(
        seed=42,  # Same seed will produce the same output
        prompt_options=typecast.PromptOptions(emotion_preset="normal", emotion_intensity=1.0),
        output_options=typecast.OutputOptions(volume=100, audio_pitch=0, audio_tempo=1.0),
    )
    await speak("This synthesis can be reproduced with the same seed value.")

    logger.info("Typecast TTS demonstration completed!")


if __name__ == "__main__":
    # Launch the LiveKit agent worker using this module's entrypoint.
    worker_options = WorkerOptions(entrypoint_fnc=entrypoint)
    cli.run_app(worker_options)
15 changes: 15 additions & 0 deletions livekit-plugins/livekit-plugins-typecast/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Typecast plugin for LiveKit Agents

Support for voice synthesis with [Typecast](https://typecast.ai/).

## Installation

```bash
pip install livekit-plugins-typecast
```

## Prerequisites

You'll need an API key from Typecast. Visit the [Typecast API](https://typecast.ai/developers/api) page to get started.

The API key can be set as an environment variable: `TYPECAST_API_KEY`
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Copyright 2023 LiveKit, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Typecast TTS plugin for LiveKit Agents

Typecast provides high-quality, emotionally expressive text-to-speech synthesis
with support for multiple languages and voice styles.

See https://typecast.ai for more information.
"""

from .models import (
DEFAULT_VOICE_ID,
AudioFormat,
OutputOptions,
PromptOptions,
TTSLanguages,
TTSModels,
Voice,
)
from .tts import TTS
from .version import __version__

# Public API of the livekit.plugins.typecast package.
__all__ = [
    "TTS",
    "Voice",
    "DEFAULT_VOICE_ID",
    "TTSModels",
    "TTSLanguages",
    "AudioFormat",
    "PromptOptions",
    "OutputOptions",
    "__version__",
]

from livekit.agents import Plugin

from .log import logger


class TypecastPlugin(Plugin):
    """Plugin descriptor that registers Typecast TTS with the LiveKit Agents framework."""

    def __init__(self) -> None:
        super().__init__(__name__, __version__, __package__, logger)


# Register at import time so the agents framework discovers the plugin
# as soon as the package is imported.
Plugin.register_plugin(TypecastPlugin())

# Cleanup docs of unexported modules: hide every module-level name that is
# not part of __all__ from pdoc-generated documentation.
_module = dir()
NOT_IN_ALL = [name for name in _module if name not in __all__]

__pdoc__ = {name: False for name in NOT_IN_ALL}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import logging

# Shared logger for all modules in the Typecast plugin package.
logger = logging.getLogger("livekit.plugins.typecast")
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# Copyright 2023 LiveKit, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import Literal

# Typecast TTS models supported by this plugin (Literal alias for static checking)
TTSModels = Literal["ssfm-v21"]

# Default voice ID (Olivia - supports multiple emotions)
DEFAULT_VOICE_ID = "tc_62a8975e695ad26f7fb514d1"

# Audio format options for synthesized output
AudioFormat = Literal["wav", "mp3"]

# Supported languages (ISO 639-3 codes)
TTSLanguages = Literal[
    "eng",  # English
    "kor",  # Korean
    "jpn",  # Japanese
    "zho",  # Chinese
    "spa",  # Spanish
    "deu",  # German
    "fra",  # French
    "ita",  # Italian
    "rus",  # Russian
    "ara",  # Arabic
    "por",  # Portuguese
    "nld",  # Dutch
    "pol",  # Polish
    "swe",  # Swedish
    "tur",  # Turkish
    "hin",  # Hindi
    "tha",  # Thai
    "vie",  # Vietnamese
    "ind",  # Indonesian
]


@dataclass
class Voice:
    """
    Typecast voice model information.

    Presumably mirrors one entry of the Typecast voice-catalog API response
    (see TTS.list_voices) — confirm against the API schema.

    Attributes:
        id: Unique voice identifier (e.g., "tc_62a8975e695ad26f7fb514d1")
        name: Human-readable voice name (e.g., "Olivia")
        model: TTS model type (e.g., "ssfm-v21")
        emotions: List of supported emotions (e.g., ["normal", "happy", "sad", "angry"])
    """

    id: str
    name: str
    model: str
    emotions: list[str]


@dataclass
class PromptOptions:
    """Emotion controls for Typecast TTS synthesis.

    Attributes:
        emotion_preset: Emotion type (e.g., "normal", "happy", "sad", "angry")
        emotion_intensity: Intensity of the emotion (0.0 ~ 2.0, default: 1.0)
    """

    emotion_preset: str = "normal"
    emotion_intensity: float = 1.0

    def to_dict(self) -> dict:
        """Serialize to the payload shape expected by the Typecast API."""
        return dict(
            emotion_preset=self.emotion_preset,
            emotion_intensity=self.emotion_intensity,
        )


@dataclass
class OutputOptions:
    """Audio output characteristics for Typecast TTS synthesis.

    Attributes:
        volume: Volume level (0 ~ 200, default: 100)
        audio_pitch: Pitch adjustment in semitones (-12 ~ +12, default: 0)
        audio_tempo: Speed multiplier (0.5x ~ 2.0x, default: 1.0)
        audio_format: Output format ("wav" or "mp3", default: "wav")
    """

    volume: int = 100
    audio_pitch: int = 0
    audio_tempo: float = 1.0
    audio_format: AudioFormat = "wav"

    def to_dict(self) -> dict:
        """Serialize to the payload shape expected by the Typecast API."""
        keys = ("volume", "audio_pitch", "audio_tempo", "audio_format")
        return {key: getattr(self, key) for key in keys}
Loading