Chunk the StyleTTS2 response into two-sentence segments and switch to the bertie-chipper style prompt

uberduck-ai · Feb 19, 2024 · 2531b1f · 2531b1f
1 parent 81434ae
commit 2531b1f
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 11 deletions.
diff --git a/openduck-py/openduck_py/routers/voice.py b/openduck-py/openduck_py/routers/voice.py
@@ -1,4 +1,5 @@
 import io
+import re
 from tempfile import NamedTemporaryFile
 from uuid import uuid4
 from fastapi import APIRouter, Depends, UploadFile, File, Form
@@ -107,11 +108,13 @@ async def audio_response(
     chat.history_json["messages"] = messages
     await db.commit()
 
-    # TODO: Process styletts2 in chunks of text, and return one chunk at a time in a streaming fashion
-    audio = styletts2.styletts2_inference(
-        # TODO: better way to deal with long responses. chunk them
-        text=response_message.content[:500],
-    )
+    audio_chunks = []
+    sentences = re.split(r"(?<=[.!?]) +", response_message.content)
+    for i in range(0, len(sentences), 2):
+        chunk_text = " ".join(sentences[i : i + 2])
+        audio_chunk = styletts2.styletts2_inference(text=chunk_text)
+        audio_chunks.append(audio_chunk)
+    audio = np.concatenate(audio_chunks)
     audio = np.int16(audio * 32767)  # Scale to 16-bit integer values
     output = StreamingResponse(io.BytesIO(audio), media_type="application/octet-stream")
     return output
diff --git a/openduck-py/openduck_py/voices/styletts2.py b/openduck-py/openduck_py/voices/styletts2.py
@@ -294,7 +294,7 @@ def resize_array(input_array, new_size):
     loader=lambda x: load_plbert(plbert_config, x),
 )
 
-model_path = "styletts2/rap_v1.pt"
+model_path = "styletts2/prototype_voice.pth"
 model_bucket = "uberduck-models-us-west-2"
 model, sampler = load_model(
     cache=cache,
@@ -306,7 +306,7 @@ def resize_array(input_array, new_size):
     model_params=model_params,
 )
 
-style_prompt_path = "511f17d1-8a30-4be8-86aa-4cdd8b0aed70.wav"
+style_prompt_path = "bertie-chipper.wav"
 style_prompt_bucket = "uberduck-audio-files"
 
 ref_s = load_object_from_s3(
@@ -316,10 +316,7 @@ def resize_array(input_array, new_size):
 )
 
 
-def styletts2_inference(
-    text: str,
-    language: str = "english",
-):
+def styletts2_inference(text: str, language: str = "english"):
     print("styletts2.run started")
 
     # NOTE (Sam): to deal with short inference issue https://github.com/yl4579/StyleTTS2/issues/46.