From b509bc1e13e08c0141dd2d4aaf63285fce84b412 Mon Sep 17 00:00:00 2001 From: HipsterBrown Date: Fri, 29 Mar 2024 09:58:35 -0400 Subject: [PATCH] feat: add local speech provider support --- README.md | 28 +++++++++++++++++++++++++--- pyproject.toml | 2 +- src/__main__.py | 6 ++++-- src/speech/speechio.py | 27 ++++++++++++++++++++++++--- 4 files changed, 54 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 07ceceb..a3df50a 100644 --- a/README.md +++ b/README.md @@ -39,8 +39,9 @@ On the new component panel, copy and paste the following attribute template into ```json { - "speech_provider": "google|elevenlabs", + "speech_provider": "google|elevenlabs|local", "speech_provider_key": "", + "speech_service_name": "", "speech_voice": "", "completion_provider": "openai", "completion_model": "gpt-4|gpt-3.5-turbo", @@ -48,7 +49,7 @@ On the new component panel, copy and paste the following attribute template into "completion_provider_key": "", "completion_persona": "", "listen": true, - "listen_provider": "google", + "listen_provider": "google|", "listen_trigger_say": "", "listen_trigger_completion": "", "listen_trigger_command": "", @@ -69,8 +70,9 @@ The following attributes are available for the `viam-labs:speech:speechio` speec | Name | Type | Inclusion | Description | | ------- | ------ | ------------ | ----------- | -| `speech_provider` | string | Optional | The speech provider for the voice service: `"google"` or `"elevenlabs"`. Default: `"google"`. | +| `speech_provider` | string | Optional | The speech provider for the voice service: `"google"`, `"elevenlabs"`, or `"local"`. Default: `"google"`. | | `speech_provider_key` | string | **Required** | The secret key for the provider - only required for elevenlabs. Default: `""`. | +| `speech_service_name` | string | **Required** | The name of the service to provide text-to-speech (TTS) capabilities - only required when `"local"` is set for the `speech_provider` field. Default: `""`. 
| | `speech_voice` | string | Optional | If the speech_provider (example: elevenlabs) provides voice options, you can select the voice here. Default: `"Josh"`. | | `completion_provider` | string | Optional | `"openai"`. Other providers may be supported in the future. [completion_provider_org](#completion_provider_org) and [completion_provider_key](#completion_provider_key) must also be provided. Default: `"openai"`. | | `completion_model` | string | Optional | `gpt-4` or `gpt-3.5-turbo`. Other models may be supported in the future. [completion_provider_org](#completion_provider_org) and [completion_provider_key](#completion_provider_key) must also be provided. Default: `"gpt-4"`. | @@ -108,6 +110,26 @@ The following attributes are available for the `viam-labs:speech:speechio` speec ``` In the above case, the `listen_provider` and `depends_on` value are set to the name of the configured `viam-labs:speech:stt-vosk` service for the robot config. +\*If the `speech_provider` is another speech service (set to `"local"`), it should be set as a dependency for the "speechio" service. This must be done using the "Raw JSON" editor within the robot configuration by setting the `"depends_on"` field for the service: + +```json +{ + "name": "speechio", + "type": "speech", + "namespace": "viam-labs", + "model": "viam-labs:speech:speechio", + "attributes": { + "speech_provider": "local", + "speech_service_name": "piper", + /* other configuration for the service */ + }, + "depends_on": [ + "piper" + ] +} +``` +In the above case, the `speech_service_name` and `depends_on` value are set to the name of the configured [`viam-labs:speech:tts-piper` service](https://github.com/viam-labs/tts-piper) for the robot config. 
+ ### Example configuration The following configuration sets up listening mode with local speech-to-text, uses an ElevenLabs voice "Antoni", makes AI completions available, and uses a 'Gollum' persona for AI completions: diff --git a/pyproject.toml b/pyproject.toml index 2cfa7e1..b346407 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "speech" -version = "0.5.2" +version = "0.6.0" authors = [ { name="Matt Vella", email="mcvella@gmail.com" }, ] diff --git a/src/__main__.py b/src/__main__.py index d2e04a5..09b8335 100644 --- a/src/__main__.py +++ b/src/__main__.py @@ -2,7 +2,9 @@ from viam.module.module import Module -from .speech import SpeechIOService, SpeechService +from speech_service_api import SpeechService +from .speech import SpeechIOService + async def main(): """This function creates and starts a new module, after adding all desired resources. @@ -15,4 +17,4 @@ async def main(): if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) diff --git a/src/speech/speechio.py b/src/speech/speechio.py index fce9775..a225f49 100644 --- a/src/speech/speechio.py +++ b/src/speech/speechio.py @@ -30,6 +30,7 @@ class SpeechProvider(str, Enum): google = "google" elevenlabs = "elevenlabs" + local = "local" class CompletionProvider(str, Enum): @@ -51,6 +52,7 @@ class RecState: rec_state = RecState() + class SpeechIOService(SpeechService, Reconfigurable): """This is the specific implementation of a ``SpeechService`` (defined in api.py) @@ -95,6 +97,9 @@ async def say(self, text: str, blocking: bool, cache_only: bool = False) -> str: if str == "": raise ValueError("No text provided") + if self.tts is not None: + return await self.tts.say(text=text, blocking=blocking) + LOGGER.info("Generating audio...") if not os.path.isdir(CACHEDIR): os.mkdir(CACHEDIR) @@ -109,7 +114,7 @@ async def say(self, text: str, blocking: bool, cache_only: bool = False) -> str: ) try: if not os.path.isfile(file): # read from 
cache if it exists - if self.speech_provider == "elevenlabs": + if self.speech_provider == SpeechProvider.elevenlabs: audio = eleven.generate(text=text, voice=self.speech_voice) eleven.save(audio=audio, filename=file) else: @@ -153,6 +158,8 @@ async def listen_trigger(self, type: str) -> str: return "OK" async def is_speaking(self) -> bool: + if self.tts is not None: + return await self.tts.is_speaking() return mixer.music.get_busy() async def completion( @@ -245,9 +252,13 @@ async def to_text(self, speech: bytes, format: str = "mp3"): return "" async def to_speech(self, text): - if self.speech_provider == "elevenlabs": + if self.speech_provider == SpeechProvider.elevenlabs: audio = eleven.generate(text=text, voice=self.speech_voice) return audio + + if self.speech_provider == SpeechProvider.local and self.tts is not None: + audio = await self.tts.to_speech(text=text) + return audio else: mp3_fp = BytesIO() sp = gTTS(text=text, lang="en", slow=False) @@ -322,6 +333,7 @@ def reconfigure( str(attrs.get("speech_provider", "google")) ] self.speech_provider_key = str(attrs.get("speech_provider_key", "")) + self.speech_service_name = str(attrs.get("speech_service_name", "")) self.speech_voice = str(attrs.get("speech_voice", "Josh")) self.completion_provider = CompletionProvider[ str(attrs.get("completion_provider", "openaigpt35turbo")) @@ -350,13 +362,22 @@ def reconfigure( self.command_list = [] self.trigger_active = False self.active_trigger_type = "" - self.stt = None + self.stt: None | SpeechService = None + self.tts: None | SpeechService = None if ( self.speech_provider == SpeechProvider.elevenlabs and self.speech_provider_key != "" ): eleven.set_api_key(self.speech_provider_key) + elif ( + self.speech_provider == SpeechProvider.local + and self.speech_service_name != "" + ): + tts = dependencies[ + SpeechService.get_resource_name(self.speech_service_name) + ] + self.tts = cast(SpeechService, tts) else: self.speech_provider = SpeechProvider.google