GPT-SoVITS #113

Merged: 5 commits, Mar 29, 2024
1 change: 1 addition & 0 deletions .env.example
@@ -1,5 +1,6 @@
UBERDUCK_API_HOST=localhost:8000/audio/response
ML_API_URL=http://openduck_ml_1:8000
GPT_SOVITS_API_URL=http://openduck-gpt-sovits-1:9880
IS_DEV=True
LOGGING_BUCKET=fill-me-in-optional
AUDIO_UPLOAD_BUCKET=fill-me-in
1 change: 0 additions & 1 deletion clients/daily/daily_bot.py
@@ -15,7 +15,6 @@


class PyAudioApp:

def __init__(self):
self.__app_quit = False

5 changes: 5 additions & 0 deletions docker-compose.yml
@@ -16,3 +16,8 @@ services:
env_file:
- .env.dev
runtime: nvidia
gpt-sovits:
image: docker.io/breakstring/gpt-sovits
command: python api.py
volumes:
- .:/openduck-py
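
The new gpt-sovits service runs the upstream docker.io/breakstring/gpt-sovits image with python api.py; the rest of the stack expects to reach it at http://openduck-gpt-sovits-1:9880 via GPT_SOVITS_API_URL (see the .env.example change above). As a sanity check, a request like the following sketch can be sent from inside the compose network. It is not part of this PR; the reference wav path and prompt text are copied from other files in this diff and may need adjusting for your checkout.

import os

import httpx

GPT_SOVITS_API_URL = os.environ.get(
    "GPT_SOVITS_API_URL", "http://openduck-gpt-sovits-1:9880"
)

# Ask the GPT-SoVITS server to synthesize a short line and save the wav it returns.
response = httpx.get(
    GPT_SOVITS_API_URL,
    params={
        "refer_wav_path": "/openduck-py/openduck-py/models/styletts2/cartoon-boy-upbeat.wav",
        "prompt_text": "Abandon all aspirations for any kind of cohesive architecture,",
        "prompt_language": "en",
        "text": "Hello from the smoke test.",
        "text_language": "en",
    },
    timeout=60.0,
)
response.raise_for_status()
with open("smoke_test.wav", "wb") as f:
    f.write(response.content)
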
8 changes: 6 additions & 2 deletions openduck-py/openduck_py/configs/tts_config.py
@@ -1,11 +1,15 @@
from typing import Literal, Optional

TTSProviders = Literal["local", "azure", "elevenlabs", "openai"]
TTSProviders = Literal["styletts2", "gptsovits", "azure", "elevenlabs", "openai"]


class TTSConfig:
def __init__(
self, provider: TTSProviders = "local", voice_id: Optional[str] = None
self,
provider: TTSProviders = "gptsovits",
voice_id: Optional[
str
] = "/openduck-py/openduck-py/models/styletts2/cartoon-boy-upbeat.wav",
):
self.provider = provider
self.voice_id = voice_id
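
With these defaults, constructing TTSConfig without arguments now selects GPT-SoVITS and the bundled cartoon-boy-upbeat.wav reference clip, while the other providers stay available by name. An illustrative usage sketch, not part of the diff:

from openduck_py.configs.tts_config import TTSConfig

# Defaults now point at the GPT-SoVITS provider and its bundled reference wav.
default_config = TTSConfig()
assert default_config.provider == "gptsovits"
assert default_config.voice_id.endswith("cartoon-boy-upbeat.wav")

# Any other member of TTSProviders can still be requested explicitly.
azure_config = TTSConfig(provider="azure", voice_id=None)
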
1 change: 0 additions & 1 deletion openduck-py/openduck_py/logging/slack.py
@@ -13,7 +13,6 @@


def log_audio_to_slack(audio_path):

print("log_audio_to_slack", audio_path, LOGGING_BUCKET, SLACK_LOGS_CHANNEL_ID)
assert os.path.exists(audio_path), f"{audio_path} does not exist"

10 changes: 6 additions & 4 deletions openduck-py/openduck_py/response_agent.py
@@ -33,6 +33,7 @@
from openduck_py.utils.third_party_tts import (
aio_azure_tts,
aio_elevenlabs_tts,
aio_gptsovits_tts,
aio_openai_tts,
)

@@ -41,7 +42,6 @@


async def _completion_with_retry(chat_model, messages):

# NOTE(zach): retries
response = None
for _retry in range(3):
@@ -291,7 +291,6 @@ async def interrupt(self, task: asyncio.Task):
self.is_responding = False

async def receive_audio(self, message: bytes):

if ASR_METHOD == "deepgram":
self.dg_connection.send(message)

@@ -476,7 +475,7 @@ async def speak_response(
print("Echo detected, not sending response.")
return

if self.tts_config.provider == "local":
if self.tts_config.provider == "styletts2":
normalized = await _normalize_text(response_text)
t_normalize = time()
await log_event(
@@ -486,7 +485,6 @@
meta={"text": normalized},
latency=t_normalize - t_chat,
)

audio_bytes_iter = _inference(normalized)
else:
t_normalize = time()
@@ -506,6 +504,10 @@
audio_bytes_iter = aio_openai_tts(response_text)
elif self.tts_config.provider == "azure":
audio_bytes_iter = aio_azure_tts(response_text)
elif self.tts_config.provider == "gptsovits":
audio_bytes_iter = aio_gptsovits_tts(
response_text, voice_ref=self.tts_config.voice_id
)

audio_chunk_bytes = bytes()
_idx = 0
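
Whichever branch runs, speak_response ends up with audio_bytes_iter, an async generator of raw audio chunks; the loop that drains it sits outside the visible hunk. A minimal sketch of consuming such an iterator with the new provider, assuming the helper defined in third_party_tts.py below and a placeholder reference wav path:

import asyncio

from openduck_py.utils.third_party_tts import aio_gptsovits_tts


async def collect_audio(text: str, voice_ref: str) -> bytes:
    # Drain the generator into a single buffer of 16-bit, 24 kHz PCM.
    audio_chunk_bytes = bytes()
    async for chunk in aio_gptsovits_tts(text, voice_ref=voice_ref):
        audio_chunk_bytes += chunk
    return audio_chunk_bytes


# pcm = asyncio.run(collect_audio("Hello!", "/openduck-py/openduck-py/models/styletts2/cartoon-boy-upbeat.wav"))
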
1 change: 0 additions & 1 deletion openduck-py/openduck_py/routers/voice.py
@@ -64,7 +64,6 @@ def _check_for_exceptions(response_task: Optional[asyncio.Task]):
async def daily_consumer(
queue: asyncio.Queue, interrupt: asyncio.Event, mic: VirtualMicrophoneDevice
):

buffer_estimate = 0
buffer_estimate_t0 = None

23 changes: 23 additions & 0 deletions openduck-py/openduck_py/utils/third_party_tts.py
@@ -4,6 +4,9 @@
"""

import asyncio
import io
import librosa
import numpy as np
import os
from typing import AsyncGenerator

@@ -12,6 +15,7 @@
import openai

elevenlabs_api_key = os.environ.get("ELEVENLABS_API_KEY")
GPT_SOVITS_API_URL = os.environ.get("GPT_SOVITS_API_URL")


def elevenlabs_tts():
@@ -24,6 +28,25 @@ def elevenlabs_tts():
CHUNK_SIZE = 8192


async def aio_gptsovits_tts(text, voice_ref) -> AsyncGenerator[bytes, None]:
    # Query the GPT-SoVITS api.py server asynchronously so synthesis does not
    # block the event loop.
    async with httpx.AsyncClient() as client:
        result = await client.get(
            GPT_SOVITS_API_URL,
            params={
                "refer_wav_path": voice_ref,
                "prompt_text": "Abandon all aspirations for any kind of cohesive architecture,",
                "prompt_language": "en",
                "text": text,
                "text_language": "en",
            },
        )
    result.raise_for_status()
    # Decode the returned audio at 24 kHz, convert to 16-bit PCM, and stream it in chunks.
    wav, _ = librosa.load(io.BytesIO(result.content), sr=24000)
    pcm_bytes = np.int16(wav * 32767).tobytes()
    chunk_size = 16384
    for i in range(0, len(pcm_bytes), chunk_size):
        yield pcm_bytes[i : i + chunk_size]


async def aio_elevenlabs_tts(
text, voice_id=ELEVENLABS_VIKRAM
) -> AsyncGenerator[bytes, None]:
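
For a quick manual listen outside the agent, the generator's output (16-bit mono PCM at 24 kHz, per the code above) can be written to a wav file with the standard library. This is illustrative only and assumes GPT_SOVITS_API_URL is set and the service from docker-compose.yml is running; the reference wav path is a placeholder:

import asyncio
import wave

from openduck_py.utils.third_party_tts import aio_gptsovits_tts


async def save_sample(path: str = "gptsovits_sample.wav") -> None:
    pcm = b""
    async for chunk in aio_gptsovits_tts(
        "Testing the GPT-SoVITS endpoint.",
        voice_ref="/openduck-py/openduck-py/models/styletts2/cartoon-boy-upbeat.wav",
    ):
        pcm += chunk
    with wave.open(path, "wb") as wav_file:
        wav_file.setnchannels(1)      # librosa.load returns mono by default
        wav_file.setsampwidth(2)      # int16 samples
        wav_file.setframerate(24000)  # matches sr=24000 in aio_gptsovits_tts
        wav_file.writeframes(pcm)


# asyncio.run(save_sample())
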