GPT-SoVITS #113

Merged: 5 commits, Mar 29, 2024
1 change: 1 addition & 0 deletions .env.example
@@ -1,5 +1,6 @@
UBERDUCK_API_HOST=localhost:8000/audio/response
ML_API_URL=http://openduck_ml_1:8000
GPT_SOVITS_API_URL=http://openduck-gpt-sovits-1:9880
IS_DEV=True
LOGGING_BUCKET=fill-me-in-optional
AUDIO_UPLOAD_BUCKET=fill-me-in
1 change: 0 additions & 1 deletion clients/daily/daily_bot.py
@@ -15,7 +15,6 @@


class PyAudioApp:

def __init__(self):
self.__app_quit = False

5 changes: 5 additions & 0 deletions docker-compose.yml
@@ -16,3 +16,8 @@ services:
env_file:
- .env.dev
runtime: nvidia
gpt-sovits:
image: docker.io/breakstring/gpt-sovits
command: python api.py
volumes:
- .:/openduck-py
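
The new gpt-sovits service runs the upstream docker.io/breakstring/gpt-sovits image with python api.py; the rest of the stack expects to reach it at http://openduck-gpt-sovits-1:9880 via GPT_SOVITS_API_URL (see the .env.example change above). As a sanity check, a request like the following sketch can be sent from inside the compose network. It is not part of this PR; the reference wav path and prompt text are copied from other files in this diff and may need adjusting for your checkout.

import os

import httpx

GPT_SOVITS_API_URL = os.environ.get(
    "GPT_SOVITS_API_URL", "http://openduck-gpt-sovits-1:9880"
)

# Ask the GPT-SoVITS server to synthesize a short line and save the wav it returns.
response = httpx.get(
    GPT_SOVITS_API_URL,
    params={
        "refer_wav_path": "/openduck-py/openduck-py/models/styletts2/cartoon-boy-upbeat.wav",
        "prompt_text": "Abandon all aspirations for any kind of cohesive architecture,",
        "prompt_language": "en",
        "text": "Hello from the smoke test.",
        "text_language": "en",
    },
    timeout=60.0,
)
response.raise_for_status()
with open("smoke_test.wav", "wb") as f:
    f.write(response.content)
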
8 changes: 6 additions & 2 deletions openduck-py/openduck_py/configs/tts_config.py
@@ -1,11 +1,15 @@
from typing import Literal, Optional

TTSProviders = Literal["local", "azure", "elevenlabs", "openai"]
TTSProviders = Literal["styletts2", "gptsovits", "azure", "elevenlabs", "openai"]


class TTSConfig:
def __init__(
self, provider: TTSProviders = "local", voice_id: Optional[str] = None
self,
provider: TTSProviders = "gptsovits",
voice_id: Optional[
str
] = "/openduck-py/openduck-py/models/styletts2/cartoon-boy-upbeat.wav",
):
self.provider = provider
self.voice_id = voice_id
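
With these defaults, constructing TTSConfig without arguments now selects GPT-SoVITS and the bundled cartoon-boy-upbeat.wav reference clip, while the other providers stay available by name. An illustrative usage sketch, not part of the diff:

from openduck_py.configs.tts_config import TTSConfig

# Defaults now point at the GPT-SoVITS provider and its bundled reference wav.
default_config = TTSConfig()
assert default_config.provider == "gptsovits"
assert default_config.voice_id.endswith("cartoon-boy-upbeat.wav")

# Any other member of TTSProviders can still be requested explicitly.
azure_config = TTSConfig(provider="azure", voice_id=None)
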
1 change: 0 additions & 1 deletion openduck-py/openduck_py/logging/slack.py
@@ -13,7 +13,6 @@


def log_audio_to_slack(audio_path):

print("log_audio_to_slack", audio_path, LOGGING_BUCKET, SLACK_LOGS_CHANNEL_ID)
assert os.path.exists(audio_path), f"{audio_path} does not exist"

10 changes: 6 additions & 4 deletions openduck-py/openduck_py/response_agent.py
@@ -33,6 +33,7 @@
from openduck_py.utils.third_party_tts import (
aio_azure_tts,
aio_elevenlabs_tts,
aio_gptsovits_tts,
aio_openai_tts,
)

@@ -41,7 +42,6 @@


async def _completion_with_retry(chat_model, messages):

# NOTE(zach): retries
response = None
for _retry in range(3):
@@ -291,7 +291,6 @@ async def interrupt(self, task: asyncio.Task):
self.is_responding = False

async def receive_audio(self, message: bytes):

if ASR_METHOD == "deepgram":
self.dg_connection.send(message)

@@ -476,7 +475,7 @@ async def speak_response(
print("Echo detected, not sending response.")
return

if self.tts_config.provider == "local":
if self.tts_config.provider == "styletts2":
normalized = await _normalize_text(response_text)
t_normalize = time()
await log_event(
@@ -486,7 +485,6 @@
meta={"text": normalized},
latency=t_normalize - t_chat,
)

audio_bytes_iter = _inference(normalized)
else:
t_normalize = time()
@@ -506,6 +504,10 @@
audio_bytes_iter = aio_openai_tts(response_text)
elif self.tts_config.provider == "azure":
audio_bytes_iter = aio_azure_tts(response_text)
elif self.tts_config.provider == "gptsovits":
audio_bytes_iter = aio_gptsovits_tts(
response_text, voice_ref=self.tts_config.voice_id
)

audio_chunk_bytes = bytes()
_idx = 0
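
Whichever branch runs, speak_response ends up with audio_bytes_iter, an async generator of raw audio chunks; the loop that drains it sits outside the visible hunk. A minimal sketch of consuming such an iterator with the new provider, assuming the helper defined in third_party_tts.py below and a placeholder reference wav path:

import asyncio

from openduck_py.utils.third_party_tts import aio_gptsovits_tts


async def collect_audio(text: str, voice_ref: str) -> bytes:
    # Drain the generator into a single buffer of 16-bit, 24 kHz PCM.
    audio_chunk_bytes = bytes()
    async for chunk in aio_gptsovits_tts(text, voice_ref=voice_ref):
        audio_chunk_bytes += chunk
    return audio_chunk_bytes


# pcm = asyncio.run(collect_audio("Hello!", "/openduck-py/openduck-py/models/styletts2/cartoon-boy-upbeat.wav"))
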
1 change: 0 additions & 1 deletion openduck-py/openduck_py/routers/voice.py
@@ -64,7 +64,6 @@ def _check_for_exceptions(response_task: Optional[asyncio.Task]):
async def daily_consumer(
queue: asyncio.Queue, interrupt: asyncio.Event, mic: VirtualMicrophoneDevice
):

buffer_estimate = 0
buffer_estimate_t0 = None

23 changes: 23 additions & 0 deletions openduck-py/openduck_py/utils/third_party_tts.py
@@ -4,6 +4,9 @@
"""

import asyncio
import io
import librosa
import numpy as np
import os
from typing import AsyncGenerator

@@ -12,6 +15,7 @@
import openai

elevenlabs_api_key = os.environ.get("ELEVENLABS_API_KEY")
GPT_SOVITS_API_URL = os.environ.get("GPT_SOVITS_API_URL")


def elevenlabs_tts():
@@ -24,6 +28,25 @@ def elevenlabs_tts():
CHUNK_SIZE = 8192


async def aio_gptsovits_tts(text, voice_ref) -> AsyncGenerator[bytes, None]:
    # Query the GPT-SoVITS api.py server asynchronously so synthesis does not
    # block the event loop.
    async with httpx.AsyncClient() as client:
        result = await client.get(
            GPT_SOVITS_API_URL,
            params={
                "refer_wav_path": voice_ref,
                "prompt_text": "Abandon all aspirations for any kind of cohesive architecture,",
                "prompt_language": "en",
                "text": text,
                "text_language": "en",
            },
        )
    result.raise_for_status()
    # Decode the returned audio at 24 kHz, convert to 16-bit PCM, and stream it in chunks.
    wav, _ = librosa.load(io.BytesIO(result.content), sr=24000)
    pcm_bytes = np.int16(wav * 32767).tobytes()
    chunk_size = 16384
    for i in range(0, len(pcm_bytes), chunk_size):
        yield pcm_bytes[i : i + chunk_size]


async def aio_elevenlabs_tts(
text, voice_id=ELEVENLABS_VIKRAM
) -> AsyncGenerator[bytes, None]:
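
For a quick manual listen outside the agent, the generator's output (16-bit mono PCM at 24 kHz, per the code above) can be written to a wav file with the standard library. This is illustrative only and assumes GPT_SOVITS_API_URL is set and the service from docker-compose.yml is running; the reference wav path is a placeholder:

import asyncio
import wave

from openduck_py.utils.third_party_tts import aio_gptsovits_tts


async def save_sample(path: str = "gptsovits_sample.wav") -> None:
    pcm = b""
    async for chunk in aio_gptsovits_tts(
        "Testing the GPT-SoVITS endpoint.",
        voice_ref="/openduck-py/openduck-py/models/styletts2/cartoon-boy-upbeat.wav",
    ):
        pcm += chunk
    with wave.open(path, "wb") as wav_file:
        wav_file.setnchannels(1)      # librosa.load returns mono by default
        wav_file.setsampwidth(2)      # int16 samples
        wav_file.setframerate(24000)  # matches sr=24000 in aio_gptsovits_tts
        wav_file.writeframes(pcm)


# asyncio.run(save_sample())
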