Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 25 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,17 @@ On the new component panel, copy and paste the following attribute template into

```json
{
"speech_provider": "google|elevenlabs",
"speech_provider": "google|elevenlabs|local",
"speech_provider_key": "<SECRET-KEY>",
"speech_service_name": "<service name>",
"speech_voice": "<VOICE-OPTION>",
"completion_provider": "openai",
"completion_model": "gpt-4|gpt-3.5-turbo",
"completion_provider_org": "<org-abc123>",
"completion_provider_key": "<sk-mykey>",
"completion_persona": "<PERSONA>",
"listen": true,
"listen_provider": "google",
"listen_provider": "google|<service name>",
"listen_trigger_say": "<TRIGGER-PHRASE>",
"listen_trigger_completion": "<COMPLETION-PHRASE>",
"listen_trigger_command": "<COMMAND-TO-RETRIEVE-STORED-TEXT>",
Expand All @@ -69,8 +70,9 @@ The following attributes are available for the `viam-labs:speech:speechio` speec

| Name | Type | Inclusion | Description |
| ------- | ------ | ------------ | ----------- |
| `speech_provider` | string | Optional | The speech provider for the voice service: `"google"` or `"elevenlabs"`. Default: `"google"`. |
| `speech_provider` | string | Optional | The speech provider for the voice service: `"google"`, `"elevenlabs"`, or `"local"`. Default: `"google"`. |
| `speech_provider_key` | string | **Required** | The secret key for the provider - only required for elevenlabs. Default: `""`. |
| `speech_service_name` | string | **Required** | The name of the service that provides text-to-speech (TTS) capabilities - only required when the `speech_provider` field is set to `"local"`. Default: `""`. |
| `speech_voice` | string | Optional | If the speech_provider (example: elevenlabs) provides voice options, you can select the voice here. Default: `"Josh"`. |
| `completion_provider` | string | Optional | `"openai"`. Other providers may be supported in the future. [completion_provider_org](#completion_provider_org) and [completion_provider_key](#completion_provider_key) must also be provided. Default: `"openai"`. |
| `completion_model` | string | Optional | `gpt-4` or `gpt-3.5-turbo`. Other models may be supported in the future. [completion_provider_org](#completion_provider_org) and [completion_provider_key](#completion_provider_key) must also be provided. Default: `"gpt-4"`. |
Expand Down Expand Up @@ -108,6 +110,26 @@ The following attributes are available for the `viam-labs:speech:speechio` speec
```
In the above case, the `listen_provider` and `depends_on` values are both set to the name of the `viam-labs:speech:stt-vosk` service configured in the robot config.

\*If the `speech_provider` is another speech service (set to `"local"`), it should be set as a dependency for the "speechio" service. This must be done using the "Raw JSON" editor within the robot configuration by setting the `"depends_on"` field for the service:

```json
{
"name": "speechio",
"type": "speech",
"namespace": "viam-labs",
"model": "viam-labs:speech:speechio",
"attributes": {
"speech_provider": "local",
"speech_service_name": "piper",
/* other configuration for the service */
},
"depends_on": [
"piper"
]
}
```
In the above case, the `speech_service_name` and `depends_on` values are both set to the name of the [`viam-labs:speech:tts-piper` service](https://github.com/viam-labs/tts-piper) configured in the robot config.

### Example configuration

The following configuration sets up listening mode with local speech-to-text, uses an ElevenLabs voice "Antoni", makes AI completions available, and uses a 'Gollum' persona for AI completions:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "speech"
version = "0.5.2"
version = "0.6.0"
authors = [
{ name="Matt Vella", email="[email protected]" },
]
Expand Down
6 changes: 4 additions & 2 deletions src/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

from viam.module.module import Module

from .speech import SpeechIOService, SpeechService
from speech_service_api import SpeechService
from .speech import SpeechIOService


async def main():
"""This function creates and starts a new module, after adding all desired resources.
Expand All @@ -15,4 +17,4 @@ async def main():


if __name__ == "__main__":
asyncio.run(main())
asyncio.run(main())
27 changes: 24 additions & 3 deletions src/speech/speechio.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
# Text-to-speech backends selectable through the `speech_provider` config
# attribute; reconfigure looks the configured string up by member name.
class SpeechProvider(str, Enum):
google = "google"
elevenlabs = "elevenlabs"
# "local" means speech is delegated to another configured speech service,
# named by the `speech_service_name` attribute.
local = "local"


class CompletionProvider(str, Enum):
Expand All @@ -51,6 +52,7 @@ class RecState:

rec_state = RecState()


class SpeechIOService(SpeechService, Reconfigurable):
"""This is the specific implementation of a ``SpeechService`` (defined in api.py)

Expand Down Expand Up @@ -95,6 +97,9 @@ async def say(self, text: str, blocking: bool, cache_only: bool = False) -> str:
if str == "":
raise ValueError("No text provided")

if self.tts is not None:
return await self.tts.say(text=text, blocking=blocking)

LOGGER.info("Generating audio...")
if not os.path.isdir(CACHEDIR):
os.mkdir(CACHEDIR)
Expand All @@ -109,7 +114,7 @@ async def say(self, text: str, blocking: bool, cache_only: bool = False) -> str:
)
try:
if not os.path.isfile(file): # read from cache if it exists
if self.speech_provider == "elevenlabs":
if self.speech_provider == SpeechProvider.elevenlabs:
audio = eleven.generate(text=text, voice=self.speech_voice)
eleven.save(audio=audio, filename=file)
else:
Expand Down Expand Up @@ -153,6 +158,8 @@ async def listen_trigger(self, type: str) -> str:
return "OK"

async def is_speaking(self) -> bool:
if self.tts is not None:
return await self.tts.is_speaking()
return mixer.music.get_busy()

async def completion(
Expand Down Expand Up @@ -245,9 +252,13 @@ async def to_text(self, speech: bytes, format: str = "mp3"):
return ""

async def to_speech(self, text):
if self.speech_provider == "elevenlabs":
if self.speech_provider == SpeechProvider.elevenlabs:
audio = eleven.generate(text=text, voice=self.speech_voice)
return audio

if self.speech_provider == SpeechProvider.local and self.tts is not None:
audio = await self.tts.to_speech(text=text)
return audio
else:
mp3_fp = BytesIO()
sp = gTTS(text=text, lang="en", slow=False)
Expand Down Expand Up @@ -322,6 +333,7 @@ def reconfigure(
str(attrs.get("speech_provider", "google"))
]
self.speech_provider_key = str(attrs.get("speech_provider_key", ""))
self.speech_service_name = str(attrs.get("speech_service_name", ""))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I went with this approach of the "local" value for the speech provider field paired with this field to keep the Enum usage for the SpeechProvider. If we want this configuration to match the listen_provider setting just the service name, I can refactor for consistency and remove the Enum.

self.speech_voice = str(attrs.get("speech_voice", "Josh"))
self.completion_provider = CompletionProvider[
str(attrs.get("completion_provider", "openaigpt35turbo"))
Expand Down Expand Up @@ -350,13 +362,22 @@ def reconfigure(
self.command_list = []
self.trigger_active = False
self.active_trigger_type = ""
self.stt = None
self.stt: None | SpeechService = None
self.tts: None | SpeechService = None

if (
self.speech_provider == SpeechProvider.elevenlabs
and self.speech_provider_key != ""
):
eleven.set_api_key(self.speech_provider_key)
elif (
self.speech_provider == SpeechProvider.local
and self.speech_service_name != ""
):
tts = dependencies[
SpeechService.get_resource_name(self.speech_service_name)
]
self.tts = cast(SpeechService, tts)
else:
self.speech_provider = SpeechProvider.google

Expand Down