💥 breaking improve #91

- add tts pipeline - add tn pipeline - refactor ssml gen - add 增加 stream_chunk_size 参数 - add 简单的多线程 - add tn_config - refactor batch generate - add 增加 wetext tn （处理 en） - TODO 适配 api - TODO 适配 webui - TODO 支持其他model
lenML · Jul 11, 2024 · 3d62765 · 3d62765
1 parent 7e54bb7
commit 3d62765
Show file tree

Hide file tree

Showing 85 changed files with 1,891 additions and 462 deletions.
diff --git a/data/load_json_spk.py b/data/load_json_spk.py
@@ -1,6 +1,6 @@
 import json
 
-from modules.speaker import speaker_mgr
+from modules.core.speaker import speaker_mgr
 
 # 出处: https://github.com/2noise/ChatTTS/issues/238
 data = json.load(open("./data/slct_voice_240605.json", "r"))

diff --git a/modules/api/impl/google_api.py b/modules/api/impl/google_api.py
@@ -5,12 +5,12 @@
 
 from modules.api import utils as api_utils
 from modules.api.Api import APIManager
-from modules.api.impl.handler.SSMLHandler import SSMLHandler
-from modules.api.impl.handler.TTSHandler import TTSHandler
-from modules.api.impl.model.audio_model import AdjustConfig, AudioFormat
-from modules.api.impl.model.chattts_model import ChatTTSConfig, InferConfig
-from modules.api.impl.model.enhancer_model import EnhancerConfig
-from modules.speaker import Speaker, speaker_mgr
+from modules.core.handler.datacls.audio_model import AdjustConfig, AudioFormat
+from modules.core.handler.datacls.chattts_model import ChatTTSConfig, InferConfig
+from modules.core.handler.datacls.enhancer_model import EnhancerConfig
+from modules.core.handler.SSMLHandler import SSMLHandler
+from modules.core.handler.TTSHandler import TTSHandler
+from modules.core.speaker import Speaker, speaker_mgr
 
 
 class SynthesisInput(BaseModel):

diff --git a/modules/api/impl/handler/SSMLHandler.py b/modules/api/impl/handler/SSMLHandler.py
diff --git a/modules/api/impl/openai_api.py b/modules/api/impl/openai_api.py
@@ -7,12 +7,12 @@
 
 from modules.api import utils as api_utils
 from modules.api.Api import APIManager
-from modules.api.impl.handler.TTSHandler import TTSHandler
-from modules.api.impl.model.audio_model import AdjustConfig, AudioFormat
-from modules.api.impl.model.chattts_model import ChatTTSConfig, InferConfig
-from modules.api.impl.model.enhancer_model import EnhancerConfig
+from modules.core.handler.datacls.audio_model import AdjustConfig, AudioFormat
+from modules.core.handler.datacls.chattts_model import ChatTTSConfig, InferConfig
+from modules.core.handler.datacls.enhancer_model import EnhancerConfig
+from modules.core.handler.TTSHandler import TTSHandler
+from modules.core.speaker import Speaker, speaker_mgr
 from modules.data import styles_mgr
-from modules.speaker import Speaker, speaker_mgr
 
 
 class AudioSpeechRequest(BaseModel):

diff --git a/modules/api/impl/speaker_api.py b/modules/api/impl/speaker_api.py
@@ -4,7 +4,7 @@
 
 from modules.api import utils as api_utils
 from modules.api.Api import APIManager
-from modules.speaker import speaker_mgr
+from modules.core.speaker import speaker_mgr
 
 
 class CreateSpeaker(BaseModel):

diff --git a/modules/api/impl/ssml_api.py b/modules/api/impl/ssml_api.py
@@ -3,10 +3,10 @@
 from pydantic import BaseModel
 
 from modules.api.Api import APIManager
-from modules.api.impl.handler.SSMLHandler import SSMLHandler
-from modules.api.impl.model.audio_model import AdjustConfig, AudioFormat
-from modules.api.impl.model.chattts_model import InferConfig
-from modules.api.impl.model.enhancer_model import EnhancerConfig
+from modules.core.handler.datacls.audio_model import AdjustConfig, AudioFormat
+from modules.core.handler.datacls.chattts_model import InferConfig
+from modules.core.handler.datacls.enhancer_model import EnhancerConfig
+from modules.core.handler.SSMLHandler import SSMLHandler
 
 
 class SSMLRequest(BaseModel):

diff --git a/modules/api/impl/tts_api.py b/modules/api/impl/tts_api.py
@@ -7,11 +7,11 @@
 
 from modules.api import utils as api_utils
 from modules.api.Api import APIManager
-from modules.api.impl.handler.TTSHandler import TTSHandler
-from modules.api.impl.model.audio_model import AdjustConfig, AudioFormat
-from modules.api.impl.model.chattts_model import ChatTTSConfig, InferConfig
-from modules.api.impl.model.enhancer_model import EnhancerConfig
-from modules.speaker import Speaker
+from modules.core.handler.datacls.audio_model import AdjustConfig, AudioFormat
+from modules.core.handler.datacls.chattts_model import ChatTTSConfig, InferConfig
+from modules.core.handler.datacls.enhancer_model import EnhancerConfig
+from modules.core.handler.TTSHandler import TTSHandler
+from modules.core.speaker import Speaker
 
 logger = logging.getLogger(__name__)
 

diff --git a/modules/api/impl/xtts_v2_api.py b/modules/api/impl/xtts_v2_api.py
@@ -5,11 +5,11 @@
 from pydantic import BaseModel
 
 from modules.api.Api import APIManager
-from modules.api.impl.handler.TTSHandler import TTSHandler
-from modules.api.impl.model.audio_model import AdjustConfig, AudioFormat
-from modules.api.impl.model.chattts_model import ChatTTSConfig, InferConfig
-from modules.api.impl.model.enhancer_model import EnhancerConfig
-from modules.speaker import speaker_mgr
+from modules.core.handler.datacls.chattts_model import ChatTTSConfig, InferConfig
+from modules.core.handler.datacls.enhancer_model import EnhancerConfig
+from modules.core.handler.datacls.audio_model import AdjustConfig, AudioFormat
+from modules.core.handler.TTSHandler import TTSHandler
+from modules.core.speaker import speaker_mgr
 
 logger = logging.getLogger(__name__)
 

diff --git a/modules/api/utils.py b/modules/api/utils.py
@@ -1,11 +1,10 @@
-from typing import Any, Union
+from typing import Any, Dict, Union
 
 from pydantic import BaseModel
 from pydub import AudioSegment
 
+from modules.core.speaker import speaker_mgr
 from modules.data import styles_mgr
-from modules.speaker import speaker_mgr
-from modules.ssml import merge_prompt
 
 
 class ParamsTypeError(Exception):
@@ -36,6 +35,27 @@ def to_number(value, t, default=0):
         return default
 
 
+def merge_prompt(attrs: dict, elem: Dict[str, Any]):
+
+    def attr_num(attrs: Dict[str, Any], k: str, min_value: int, max_value: int):
+        val = elem.get(k, attrs.get(k, ""))
+        if val == "":
+            return
+        if val == "max":
+            val = max_value
+        if val == "min":
+            val = min_value
+        val = np.clip(int(val), min_value, max_value)
+        if "prefix" not in attrs or attrs["prefix"] == None:
+            attrs["prefix"] = ""
+        attrs["prefix"] += " " + f"[{k}_{val}]"
+
+    attr_num(attrs, "oral", 0, 9)
+    attr_num(attrs, "speed", 0, 9)
+    attr_num(attrs, "laugh", 0, 2)
+    attr_num(attrs, "break", 0, 7)
+
+
 def calc_spk_style(spk: Union[str, int], style: Union[str, int]):
     voice_attrs = {
         "spk": None,

diff --git a/modules/ssml_parser/__init__.py → modules/core/__init__.py b/modules/ssml_parser/__init__.py → modules/core/__init__.py
diff --git a/modules/api/impl/handler/AudioHandler.py → modules/core/handler/AudioHandler.py b/modules/api/impl/handler/AudioHandler.py → modules/core/handler/AudioHandler.py
@@ -7,9 +7,9 @@
 from fastapi import Request
 from pydub import AudioSegment
 
-from modules.api.impl.model.audio_model import AudioFormat
-from modules.ChatTTSInfer import ChatTTSInfer
-from modules.utils.audio import ndarray_to_segment
+from modules.core.handler.datacls.audio_model import AudioFormat
+from modules.core.models.zoo.ChatTTSInfer import ChatTTSInfer
+from modules.utils.audio_utils import ndarray_to_segment
 
 
 def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):

diff --git a/modules/core/handler/SSMLHandler.py b/modules/core/handler/SSMLHandler.py
@@ -0,0 +1,77 @@
+from typing import Generator
+
+import numpy as np
+from fastapi import HTTPException
+
+from modules.core.handler.AudioHandler import AudioHandler
+from modules.core.handler.datacls.audio_model import AdjustConfig
+from modules.core.handler.datacls.chattts_model import ChatTTSConfig, InferConfig
+from modules.core.handler.datacls.enhancer_model import EnhancerConfig
+from modules.core.pipeline.factory import PipelineFactory
+from modules.core.pipeline.pipeline import TTSPipeline
+from modules.core.pipeline.processor import TTSPipelineContext
+from modules.core.ssml.SSMLParser import create_ssml_v01_parser
+from modules.core.ssml.SynthesizeSSML import SynthesizeSSML
+from modules.Enhancer.ResembleEnhance import apply_audio_enhance_full
+from modules.normalization import text_normalize
+from modules.utils import audio_utils
+
+
+class SSMLHandler(AudioHandler):
+    def __init__(
+        self,
+        ssml_content: str,
+        infer_config: InferConfig,
+        adjust_config: AdjustConfig,
+        enhancer_config: EnhancerConfig,
+    ) -> None:
+        assert isinstance(ssml_content, str), "ssml_content must be a string."
+        assert isinstance(
+            infer_config, InferConfig
+        ), "infer_config must be an InferConfig object."
+        assert isinstance(
+            adjust_config, AdjustConfig
+        ), "adjest_config should be AdjustConfig"
+        assert isinstance(
+            enhancer_config, EnhancerConfig
+        ), "enhancer_config must be an EnhancerConfig object."
+
+        self.ssml_content = ssml_content
+        self.infer_config = infer_config
+        self.adjest_config = adjust_config
+        self.enhancer_config = enhancer_config
+
+        self.validate()
+
+    def validate(self):
+        # TODO params checker
+        pass
+
+    def create_pipeline(self):
+        ssml_content = self.ssml_content
+        infer_config = self.infer_config
+        adjust_config = self.adjest_config
+        enhancer_config = self.enhancer_config
+
+        ctx = TTSPipelineContext(
+            ssml=ssml_content,
+            tts_config=ChatTTSConfig(),
+            infer_config=infer_config,
+            adjust_config=adjust_config,
+            enhancer_config=enhancer_config,
+        )
+        pipeline = PipelineFactory.create(ctx)
+        return pipeline
+
+    def enqueue(self) -> tuple[np.ndarray, int]:
+        pipeline = self.create_pipeline()
+        results = pipeline.generate()
+
+        sample_rate = results[0][0]
+        audio_data = np.concatenate([r[1] for r in results], axis=0)
+
+        return audio_data, sample_rate
+
+    def enqueue_stream(self) -> Generator[tuple[np.ndarray, int], None, None]:
+        pipeline = self.create_pipeline()
+        # TODO
diff --git a/modules/api/impl/handler/TTSHandler.py → modules/core/handler/TTSHandler.py b/modules/api/impl/handler/TTSHandler.py → modules/core/handler/TTSHandler.py
@@ -3,16 +3,16 @@
 
 import numpy as np
 
-from modules.api.impl.handler.AudioHandler import AudioHandler
-from modules.api.impl.model.audio_model import AdjustConfig
-from modules.api.impl.model.chattts_model import ChatTTSConfig, InferConfig
-from modules.api.impl.model.enhancer_model import EnhancerConfig
+from modules.core.handler.AudioHandler import AudioHandler
+from modules.core.handler.datacls.audio_model import AdjustConfig
+from modules.core.handler.datacls.chattts_model import ChatTTSConfig, InferConfig
+from modules.core.handler.datacls.enhancer_model import EnhancerConfig
+from modules.core.speaker import Speaker
 from modules.Enhancer.ResembleEnhance import apply_audio_enhance_full
 from modules.normalization import text_normalize
-from modules.speaker import Speaker
 from modules.synthesize_audio import synthesize_audio
 from modules.synthesize_stream import synthesize_stream
-from modules.utils.audio import apply_normalize, apply_prosody_to_audio_data
+from modules.utils.audio_utils import apply_normalize, apply_prosody_to_audio_data
 
 logger = logging.getLogger(__name__)
 

diff --git a/modules/api/impl/model/audio_model.py → modules/core/handler/datacls/audio_model.py b/modules/api/impl/model/audio_model.py → modules/core/handler/datacls/audio_model.py
diff --git a/modules/api/impl/model/chattts_model.py → ...les/core/handler/datacls/chattts_model.py b/modules/api/impl/model/chattts_model.py → ...les/core/handler/datacls/chattts_model.py
@@ -2,10 +2,14 @@
 
 
 class ChatTTSConfig(BaseModel):
+    # model id
+    mid: str = "chat-tts"
+
     style: str = ""
     temperature: float = 0.3
     top_p: float = 0.7
     top_k: int = 20
+    prompt: str = ""
     prompt1: str = ""
     prompt2: str = ""
     prefix: str = ""
@@ -17,3 +21,6 @@ class InferConfig(BaseModel):
     # end_of_sentence
     eos: str = "[uv_break]"
     seed: int = 42
+
+    stream: bool = False
+    stream_chunk_size: int = 96
diff --git a/modules/api/impl/model/enhancer_model.py → ...es/core/handler/datacls/enhancer_model.py b/modules/api/impl/model/enhancer_model.py → ...es/core/handler/datacls/enhancer_model.py
diff --git a/modules/core/handler/datacls/tn_model.py b/modules/core/handler/datacls/tn_model.py
@@ -0,0 +1,8 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class TNConfig(BaseModel):
+    enabled: Optional[list[str]] = None
+    disabled: Optional[list[str]] = None
diff --git a/modules/core/models/RefinerModel.py b/modules/core/models/RefinerModel.py