feat(sample): add new sample for speech to speech #47

Status: Open · wants to merge 3 commits into `main`
10 changes: 10 additions & 0 deletions 04-UX-demos/04-speech-to-speech-assistant/.gitignore
@@ -0,0 +1,10 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Virtual environments
.venv
45 changes: 45 additions & 0 deletions 04-UX-demos/04-speech-to-speech-assistant/README.md
@@ -0,0 +1,45 @@
# Speech-to-speech assistant

A speech-to-speech agent powered by [FastRTC](https://fastrtc.org/) and the [Strands SDK](https://github.com/strands-agents/sdk-python).

- [Moonshine](https://huggingface.co/UsefulSensors/moonshine) for speech-to-text (runs locally)
- Claude 3.5 Haiku for the Strands agent (called via Amazon Bedrock)
- [Kokoro](https://huggingface.co/spaces/hexgrad/Kokoro-TTS) for text-to-speech (runs locally)

![Agent architecture](img/architecture.png)

## Prerequisites

1. Enable model access to Anthropic Claude 3.5 Haiku (`anthropic.claude-3-5-haiku-20241022-v1:0`) in the Amazon Bedrock console, in every region covered by the system-defined cross-region inference profile this sample uses. To use a different model, update the line containing `model_id="us.anthropic.claude-3-5-haiku-20241022-v1:0"` in [main.py](./main.py); the sketch after this list shows the relevant configuration.
2. Chrome, Safari, or Edge browser environment (Firefox is currently not supported)
3. Microphone and speakers
4. [Python](https://www.python.org/downloads/) 3.11 or higher
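
Swapping models only requires changing the `model_id` passed to `BedrockModel`. A minimal sketch mirroring the configuration in [main.py](./main.py):

```python
from strands.models import BedrockModel

# Default model for this sample; any Bedrock model enabled in your
# account works, as long as the model_id and region match your access.
bedrock_model = BedrockModel(
    model_id="us.anthropic.claude-3-5-haiku-20241022-v1:0",
    region_name="us-west-2",
    temperature=0.3,
)
```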

## Getting started

1. Install [uv](https://docs.astral.sh/uv/getting-started/installation/).

2. Run `uv run main.py` from this directory. uv creates a virtual environment and installs the dependencies declared in [pyproject.toml](./pyproject.toml) automatically.
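
By default the UI is served on port 7860, set at the bottom of [main.py](./main.py). A one-line sketch of the change if that port is taken, with 8080 as an arbitrary example:

```python
# Bottom of main.py: 8080 is just an example of an alternative free port
stream.ui.launch(server_port=8080)
```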

## Usage

1. Once the sample is running, open `http://127.0.0.1:7860` in a web browser.

2. Grant microphone access when prompted, then hit `Record`. Once the connection is established, you can talk to the agent.
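
To sanity-check the Strands agent without the audio pipeline, you can call it directly from Python. A minimal sketch, assuming the same Bedrock model access as [main.py](./main.py):

```python
from strands import Agent
from strands.models import BedrockModel

# Same model configuration as main.py
bedrock_model = BedrockModel(
    model_id="us.anthropic.claude-3-5-haiku-20241022-v1:0",
    region_name="us-west-2",
)
agent = Agent(model=bedrock_model, system_prompt="You are a helpful assistant named Olaf.")

# Calling the agent with text returns an AgentResult; str() yields the reply text
result = agent("What is your name?")
print(str(result))
```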

## Agent description

### Agent Details

| Feature           | Description               |
|-------------------|---------------------------|
| Native tools used | None                      |
| Agent Structure   | Single agent architecture |
| Model Provider    | Amazon Bedrock            |

## Sample queries

Here are some sample queries you can try with this agent:

* What is your name?
* How old are you?
`img/architecture.png` (binary file; not displayed in the diff view)
94 changes: 94 additions & 0 deletions 04-UX-demos/04-speech-to-speech-assistant/main.py
@@ -0,0 +1,94 @@
"""
main.py
Speech-to-speech assistant demo using Strands, Bedrock, and FastRTC.
"""
import logging
from strands import Agent
from strands.models import BedrockModel
from fastrtc import (
    get_stt_model,  # moonshine/base or moonshine/tiny
    get_tts_model,  # kokoro
    Stream,
    ReplyOnPause,
    KokoroTTSOptions,
)

# Configure logging
logging.getLogger("strands").setLevel(logging.INFO)

logging.basicConfig(
    format="%(levelname)s | %(name)s | %(message)s",
    handlers=[logging.StreamHandler()],
)

# System prompt for agent
system_prompt = """
You are a helpful assistant named Olaf, answering questions from the user.
- Be clear, concise, and specific in your answers.
- If you don't know the answer, say "I don't know".
- If you don't have the information you need to answer a question, say "I don't know".
"""

# Initialize models
stt_model = get_stt_model() # Speech-to-text model
tts_model = get_tts_model(model="kokoro") # Text-to-speech model

# Configure the TTS model options
# KokoroTTSOptions allows you to set the voice, speed, and language

tts_options = KokoroTTSOptions(
    voice="af_heart",
    speed=1.0,
    lang="en-us",
)

def process_response(audio):
    """
    Process audio input and generate the agent's response as TTS audio.

    Args:
        audio: Audio input from the FastRTC stream.
    Yields:
        Audio chunks of the synthesized TTS response.
    """
    # Convert speech to text using the STT model
    text = stt_model.stt(audio)
    if not text.strip():
        return

    # Get a response from the Strands agent (`agent` is defined at module
    # scope below; the name is resolved when this handler is invoked)
    response = agent(text)
    # Extract the text content from the AgentResult, falling back to str()
    response_content = getattr(response, "content", str(response))

    # Convert the response to audio and stream it back chunk by chunk
    for audio_chunk in tts_model.stream_tts_sync(response_content or "", options=tts_options):
        yield audio_chunk

# BedrockModel is used to access Anthropic Claude 3.5 Haiku via AWS Bedrock
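# (The "us." prefix in the model_id selects Bedrock's US cross-region inference profile.)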
bedrock_model = BedrockModel(
    model_id="us.anthropic.claude-3-5-haiku-20241022-v1:0",
    region_name="us-west-2",
    temperature=0.3,
)

# Agent wraps the LLM and system prompt
agent = Agent(model=bedrock_model, system_prompt=system_prompt)

# Stream handles the audio input/output and UI
stream = Stream(
    handler=ReplyOnPause(process_response, input_sample_rate=16000),
    additional_outputs_handler=lambda a, b: b,  # Pass-through handler for additional outputs
    modality="audio",
    mode="send-receive",
    ui_args={
        "pulse_color": "rgb(255, 255, 255)",
        "icon_button_color": "rgb(255, 255, 255)",
        "title": "🔊 Audio Assistant",
    },
)

if __name__ == "__main__":
    # Launch the UI on port 7860
    stream.ui.launch(server_port=7860)
11 changes: 11 additions & 0 deletions 04-UX-demos/04-speech-to-speech-assistant/pyproject.toml
@@ -0,0 +1,11 @@
[project]
name = "speech-to-speech-assistant"
version = "0.1.0"
description = "Speech-to-speech assistant demo using FastRTC and the Strands SDK"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "fastrtc[stt,tts,vad]>=0.0.25",
    "strands-agents>=0.1.2",
    "strands-agents-tools>=0.1.1",
]