diff --git a/.gitignore b/.gitignore index 4e6a2e2..02c2ac8 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ **/GPT_SoVITS/GPT_SoVITS/pretrained_models/ **/GPT_SoVITS/GPT_SoVITS/text/ **/GPT_SoVITS/GPT_SoVITS/tools/i18n/locale/ +**/GPT_SoVITS/GPT_SoVITS/inference_webui.py **/GPT_SoVITS/GPT_SoVITS/my_utils.py **/GPT_SoVITS/GPT_SoVITS/onnx_export.py **/GPT_SoVITS/GPT_SoVITS/process_ckpt.py diff --git a/EVT_Core/Dataset/GPT_SoVITS/utils/Creating_Directories.py b/EVT_Core/Dataset/GPT_SoVITS/utils/Creating_Directories.py index 32e0c3f..15716fc 100644 --- a/EVT_Core/Dataset/GPT_SoVITS/utils/Creating_Directories.py +++ b/EVT_Core/Dataset/GPT_SoVITS/utils/Creating_Directories.py @@ -2,26 +2,26 @@ def create_directories( - wav_dir_prepared, wav_dir_split, + csv_dir_prepared, csv_dir_merged, csv_dir_final ): ''' Create csv directory ''' - if not os.path.exists(wav_dir_prepared): - try: - os.makedirs(wav_dir_prepared, exist_ok = True) - except OSError: - print('Creation of directory %s failed' %wav_dir_prepared) - if not os.path.exists(wav_dir_split): try: os.makedirs(wav_dir_split, exist_ok = True) except OSError: print('Creation of directory %s failed' %wav_dir_split) + if not os.path.exists(csv_dir_prepared): + try: + os.makedirs(csv_dir_prepared, exist_ok = True) + except OSError: + print('Creation of directory %s failed' %csv_dir_prepared) + if not os.path.exists(csv_dir_merged): try: os.makedirs(csv_dir_merged, exist_ok = True) diff --git a/EVT_Core/Dataset/VITS/utils/Creating_Directories.py b/EVT_Core/Dataset/VITS/utils/Creating_Directories.py index 32e0c3f..927ebd7 100644 --- a/EVT_Core/Dataset/VITS/utils/Creating_Directories.py +++ b/EVT_Core/Dataset/VITS/utils/Creating_Directories.py @@ -4,6 +4,7 @@ def create_directories( wav_dir_prepared, wav_dir_split, + csv_dir_prepared, csv_dir_merged, csv_dir_final ): @@ -22,6 +23,12 @@ def create_directories( except OSError: print('Creation of directory %s failed' %wav_dir_split) + if not os.path.exists(csv_dir_prepared): + try: + os.makedirs(csv_dir_prepared, exist_ok = True) + except OSError: + print('Creation of directory %s failed' %csv_dir_prepared) + if not os.path.exists(csv_dir_merged): try: os.makedirs(csv_dir_merged, exist_ok = True) diff --git a/EVT_Core/TTS/GPT_SoVITS/Convert.py b/EVT_Core/TTS/GPT_SoVITS/Convert.py index 4da2374..1487763 100644 --- a/EVT_Core/TTS/GPT_SoVITS/Convert.py +++ b/EVT_Core/TTS/GPT_SoVITS/Convert.py @@ -124,7 +124,7 @@ def change_tts_inference( os.environ["infer_ttswebui"]=str(webui_port_infer_tts) os.environ["is_share"]=str(is_share) os.environ['USE_WEBUI']=str(use_webui) - cmd = f'"{python_exec}" "GPT_SoVITS/inference.py"' + cmd = f'"{python_exec}" "GPT_SoVITS/inference_gui.py"' print("TTS推理进程已开启") print(cmd) p_tts_inference = subprocess.Popen(cmd, shell=True) diff --git a/EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference_gui.py b/EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference_gui.py new file mode 100644 index 0000000..2059155 --- /dev/null +++ b/EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference_gui.py @@ -0,0 +1,310 @@ +import os +import sys +from PyQt5.QtCore import QEvent +from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushButton, QTextEdit +from PyQt5.QtWidgets import QGridLayout, QVBoxLayout, QWidget, QFileDialog, QStatusBar, QComboBox +import soundfile as sf + +from tools.i18n.i18n import I18nAuto +i18n = I18nAuto() + +from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav + + +class GPTSoVITSGUI(QMainWindow): + GPT_Path = 
gpt_path + SoVITS_Path = sovits_path + + def __init__(self): + super().__init__() + + self.setWindowTitle('GPT-SoVITS GUI') + self.setGeometry(800, 450, 950, 850) + + self.setStyleSheet(""" + QWidget { + background-color: #a3d3b1; + } + + QTabWidget::pane { + background-color: #a3d3b1; + } + + QTabWidget::tab-bar { + alignment: left; + } + + QTabBar::tab { + background: #8da4bf; + color: #ffffff; + padding: 8px; + } + + QTabBar::tab:selected { + background: #2a3f54; + } + + QLabel { + color: #000000; + } + + QPushButton { + background-color: #4CAF50; + color: white; + padding: 8px; + border: 1px solid #4CAF50; + border-radius: 4px; + } + + QPushButton:hover { + background-color: #45a049; + border: 1px solid #45a049; + box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.1); + } + """) + + license_text = ( + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. " + "如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + license_label = QLabel(license_text) + license_label.setWordWrap(True) + + self.GPT_model_label = QLabel("选择GPT模型:") + self.GPT_model_input = QLineEdit() + self.GPT_model_input.setPlaceholderText("拖拽或选择文件") + self.GPT_model_input.setText(self.GPT_Path) + self.GPT_model_input.setReadOnly(True) + self.GPT_model_button = QPushButton("选择GPT模型文件") + self.GPT_model_button.clicked.connect(self.select_GPT_model) + + self.SoVITS_model_label = QLabel("选择SoVITS模型:") + self.SoVITS_model_input = QLineEdit() + self.SoVITS_model_input.setPlaceholderText("拖拽或选择文件") + self.SoVITS_model_input.setText(self.SoVITS_Path) + self.SoVITS_model_input.setReadOnly(True) + self.SoVITS_model_button = QPushButton("选择SoVITS模型文件") + self.SoVITS_model_button.clicked.connect(self.select_SoVITS_model) + + self.ref_audio_label = QLabel("上传参考音频:") + self.ref_audio_input = QLineEdit() + self.ref_audio_input.setPlaceholderText("拖拽或选择文件") + self.ref_audio_input.setReadOnly(True) + self.ref_audio_button = QPushButton("选择音频文件") + self.ref_audio_button.clicked.connect(self.select_ref_audio) + + self.ref_text_label = QLabel("参考音频文本:") + self.ref_text_input = QLineEdit() + self.ref_text_input.setPlaceholderText("直接输入文字或上传文本") + self.ref_text_button = QPushButton("上传文本") + self.ref_text_button.clicked.connect(self.upload_ref_text) + + self.ref_language_label = QLabel("参考音频语言:") + self.ref_language_combobox = QComboBox() + self.ref_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"]) + self.ref_language_combobox.setCurrentText("多语种混合") + + self.target_text_label = QLabel("合成目标文本:") + self.target_text_input = QLineEdit() + self.target_text_input.setPlaceholderText("直接输入文字或上传文本") + self.target_text_button = QPushButton("上传文本") + self.target_text_button.clicked.connect(self.upload_target_text) + + self.target_language_label = QLabel("合成音频语言:") + self.target_language_combobox = QComboBox() + self.target_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"]) + self.target_language_combobox.setCurrentText("多语种混合") + + self.output_label = QLabel("输出音频路径:") + self.output_input = QLineEdit() + self.output_input.setPlaceholderText("拖拽或选择文件") + self.output_input.setReadOnly(True) + self.output_button = QPushButton("选择文件夹") + self.output_button.clicked.connect(self.select_output_path) + + self.output_text = QTextEdit() + self.output_text.setReadOnly(True) + + self.add_drag_drop_events([ + self.GPT_model_input, + self.SoVITS_model_input, + self.ref_audio_input, + self.ref_text_input, + self.target_text_input, + self.output_input, + ]) + + self.synthesize_button = QPushButton("合成") + 
self.synthesize_button.clicked.connect(self.synthesize) + + self.clear_output_button = QPushButton("清空输出") + self.clear_output_button.clicked.connect(self.clear_output) + + self.status_bar = QStatusBar() + + main_layout = QVBoxLayout() + + input_layout = QGridLayout(self) + input_layout.setSpacing(10) + + input_layout.addWidget(license_label, 0, 0, 1, 3) + + input_layout.addWidget(self.GPT_model_label, 1, 0) + input_layout.addWidget(self.GPT_model_input, 2, 0, 1, 2) + input_layout.addWidget(self.GPT_model_button, 2, 2) + + input_layout.addWidget(self.SoVITS_model_label, 3, 0) + input_layout.addWidget(self.SoVITS_model_input, 4, 0, 1, 2) + input_layout.addWidget(self.SoVITS_model_button, 4, 2) + + input_layout.addWidget(self.ref_audio_label, 5, 0) + input_layout.addWidget(self.ref_audio_input, 6, 0, 1, 2) + input_layout.addWidget(self.ref_audio_button, 6, 2) + + input_layout.addWidget(self.ref_language_label, 7, 0) + input_layout.addWidget(self.ref_language_combobox, 8, 0, 1, 1) + input_layout.addWidget(self.ref_text_label, 9, 0) + input_layout.addWidget(self.ref_text_input, 10, 0, 1, 2) + input_layout.addWidget(self.ref_text_button, 10, 2) + + input_layout.addWidget(self.target_language_label, 11, 0) + input_layout.addWidget(self.target_language_combobox, 12, 0, 1, 1) + input_layout.addWidget(self.target_text_label, 13, 0) + input_layout.addWidget(self.target_text_input, 14, 0, 1, 2) + input_layout.addWidget(self.target_text_button, 14, 2) + + input_layout.addWidget(self.output_label, 15, 0) + input_layout.addWidget(self.output_input, 16, 0, 1, 2) + input_layout.addWidget(self.output_button, 16, 2) + + main_layout.addLayout(input_layout) + + output_layout = QVBoxLayout() + output_layout.addWidget(self.output_text) + main_layout.addLayout(output_layout) + + main_layout.addWidget(self.synthesize_button) + + main_layout.addWidget(self.clear_output_button) + + main_layout.addWidget(self.status_bar) + + self.central_widget = QWidget() + self.central_widget.setLayout(main_layout) + self.setCentralWidget(self.central_widget) + + def dragEnterEvent(self, event): + if event.mimeData().hasUrls(): + event.acceptProposedAction() + + def dropEvent(self, event): + if event.mimeData().hasUrls(): + file_paths = [url.toLocalFile() for url in event.mimeData().urls()] + if len(file_paths) == 1: + self.update_ref_audio(file_paths[0]) + else: + self.update_ref_audio(", ".join(file_paths)) + + def add_drag_drop_events(self, widgets): + for widget in widgets: + widget.setAcceptDrops(True) + widget.installEventFilter(self) + + def eventFilter(self, obj, event): + if event.type() in (QEvent.DragEnter, QEvent.Drop): + mime_data = event.mimeData() + if mime_data.hasUrls(): + event.acceptProposedAction() + + return super().eventFilter(obj, event) + + def select_GPT_model(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择GPT模型文件", "", "GPT Files (*.ckpt)") + if file_path: + self.GPT_model_input.setText(file_path) + + def select_SoVITS_model(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择SoVITS模型文件", "", "SoVITS Files (*.pth)") + if file_path: + self.SoVITS_model_input.setText(file_path) + + def select_ref_audio(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择参考音频文件", "", "Audio Files (*.wav *.mp3)") + if file_path: + self.update_ref_audio(file_path) + + def upload_ref_text(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") + if file_path: + with open(file_path, 'r', encoding='utf-8') as file: + content = file.read() + 
self.ref_text_input.setText(content) + + def upload_target_text(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") + if file_path: + with open(file_path, 'r', encoding='utf-8') as file: + content = file.read() + self.target_text_input.setText(content) + + def select_output_path(self): + options = QFileDialog.Options() + options |= QFileDialog.DontUseNativeDialog + options |= QFileDialog.ShowDirsOnly + + folder_dialog = QFileDialog() + folder_dialog.setOptions(options) + folder_dialog.setFileMode(QFileDialog.Directory) + + if folder_dialog.exec_(): + folder_path = folder_dialog.selectedFiles()[0] + self.output_input.setText(folder_path) + + def update_ref_audio(self, file_path): + self.ref_audio_input.setText(file_path) + + def clear_output(self): + self.output_text.clear() + + def synthesize(self): + GPT_model_path = self.GPT_model_input.text() + SoVITS_model_path = self.SoVITS_model_input.text() + ref_audio_path = self.ref_audio_input.text() + language_combobox = self.ref_language_combobox.currentText() + language_combobox = i18n(language_combobox) + ref_text = self.ref_text_input.text() + target_language_combobox = self.target_language_combobox.currentText() + target_language_combobox = i18n(target_language_combobox) + target_text = self.target_text_input.text() + output_path = self.output_input.text() + + if GPT_model_path != self.GPT_Path: + change_gpt_weights(gpt_path=GPT_model_path) + self.GPT_Path = GPT_model_path + if SoVITS_model_path != self.SoVITS_Path: + change_sovits_weights(sovits_path=SoVITS_model_path) + self.SoVITS_Path = SoVITS_model_path + + synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path, + prompt_text=ref_text, + prompt_language=language_combobox, + text=target_text, + text_language=target_language_combobox) + + result_list = list(synthesis_result) + + if result_list: + last_sampling_rate, last_audio_data = result_list[-1] + output_wav_path = os.path.join(output_path, "output.wav") + sf.write(output_wav_path, last_audio_data, last_sampling_rate) + + result = "Audio saved to " + output_wav_path + + self.status_bar.showMessage("合成完成!输出路径:" + output_wav_path, 5000) + self.output_text.append("处理结果:\n" + result) + + +if __name__ == '__main__': + app = QApplication(sys.argv) + mainWin = GPTSoVITSGUI() + mainWin.show() + sys.exit(app.exec_()) \ No newline at end of file diff --git a/EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference.py b/EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference_webui.py similarity index 67% rename from EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference.py rename to EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference_webui.py index 0958a6e..b21b954 100644 --- a/EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference.py +++ b/EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference_webui.py @@ -70,14 +70,6 @@ i18n = I18nAuto() -import sys -from PyQt5.QtCore import QEvent -from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushButton, QTextEdit -from PyQt5.QtWidgets import QGridLayout, QVBoxLayout, QWidget, QFileDialog, QStatusBar, QComboBox -import soundfile as sf - -use_webui = eval(os.environ.get('USE_WEBUI', "True")) - # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。 if torch.cuda.is_available(): @@ -516,7 +508,7 @@ def cut4(inp): return "\n".join(opts) -# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference.py +# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py def cut5(inp): # if not re.search(r'[^\w\s]', inp[-1]): 
# inp += '。' @@ -648,305 +640,7 @@ def get_weights_names(): gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。")) -class GPTSoVITSGUI(QMainWindow): - gpt_path = gpt_path - sovits_path = sovits_path - - def __init__(self): - super().__init__() - - self.setWindowTitle('GPT-SoVITS GUI') - self.setGeometry(800, 450, 950, 850) - - self.setStyleSheet(""" - QWidget { - background-color: #a3d3b1; - } - - QTabWidget::pane { - background-color: #a3d3b1; - } - - QTabWidget::tab-bar { - alignment: left; - } - - QTabBar::tab { - background: #8da4bf; - color: #ffffff; - padding: 8px; - } - - QTabBar::tab:selected { - background: #2a3f54; - } - - QLabel { - color: #000000; - } - - QPushButton { - background-color: #4CAF50; - color: white; - padding: 8px; - border: 1px solid #4CAF50; - border-radius: 4px; - } - - QPushButton:hover { - background-color: #45a049; - border: 1px solid #45a049; - box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.1); - } - """) - - license_text = ( - "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. " - "如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") - license_label = QLabel(license_text) - license_label.setWordWrap(True) - - self.GPT_model_label = QLabel("选择GPT模型:") - self.GPT_model_input = QLineEdit() - self.GPT_model_input.setPlaceholderText("拖拽或选择文件") - self.GPT_model_input.setText(self.gpt_path) - self.GPT_model_input.setReadOnly(True) - self.GPT_model_button = QPushButton("选择GPT模型文件") - self.GPT_model_button.clicked.connect(self.select_GPT_model) - - self.SoVITS_model_label = QLabel("选择SoVITS模型:") - self.SoVITS_model_input = QLineEdit() - self.SoVITS_model_input.setPlaceholderText("拖拽或选择文件") - self.SoVITS_model_input.setText(self.sovits_path) - self.SoVITS_model_input.setReadOnly(True) - self.SoVITS_model_button = QPushButton("选择SoVITS模型文件") - self.SoVITS_model_button.clicked.connect(self.select_SoVITS_model) - - self.ref_audio_label = QLabel("上传参考音频:") - self.ref_audio_input = QLineEdit() - self.ref_audio_input.setPlaceholderText("拖拽或选择文件") - self.ref_audio_input.setReadOnly(True) - self.ref_audio_button = QPushButton("选择音频文件") - self.ref_audio_button.clicked.connect(self.select_ref_audio) - - self.ref_text_label = QLabel("参考音频文本:") - self.ref_text_input = QLineEdit() - self.ref_text_input.setPlaceholderText("直接输入文字或上传文本") - self.ref_text_button = QPushButton("上传文本") - self.ref_text_button.clicked.connect(self.upload_ref_text) - - self.ref_language_label = QLabel("参考音频语言:") - self.ref_language_combobox = QComboBox() - self.ref_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"]) - self.ref_language_combobox.setCurrentText("多语种混合") - - self.target_text_label = QLabel("合成目标文本:") - self.target_text_input = QLineEdit() - self.target_text_input.setPlaceholderText("直接输入文字或上传文本") - self.target_text_button = QPushButton("上传文本") - self.target_text_button.clicked.connect(self.upload_target_text) - - self.target_language_label = QLabel("合成音频语言:") - self.target_language_combobox = QComboBox() - self.target_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"]) - self.target_language_combobox.setCurrentText("多语种混合") - - self.output_label = QLabel("输出音频路径:") - self.output_input = QLineEdit() - self.output_input.setPlaceholderText("拖拽或选择文件") - self.output_input.setReadOnly(True) - self.output_button = QPushButton("选择文件夹") - self.output_button.clicked.connect(self.select_output_path) - - self.output_text = QTextEdit() - self.output_text.setReadOnly(True) - - self.add_drag_drop_events([ - self.GPT_model_input, - self.SoVITS_model_input, - self.ref_audio_input, - 
self.ref_text_input, - self.target_text_input, - self.output_input, - ]) - - self.synthesize_button = QPushButton("合成") - self.synthesize_button.clicked.connect(self.synthesize) - - self.clear_output_button = QPushButton("清空输出") - self.clear_output_button.clicked.connect(self.clear_output) - - self.status_bar = QStatusBar() - - main_layout = QVBoxLayout() - - input_layout = QGridLayout(self) - input_layout.setSpacing(10) - - input_layout.addWidget(license_label, 0, 0, 1, 3) - - input_layout.addWidget(self.GPT_model_label, 1, 0) - input_layout.addWidget(self.GPT_model_input, 2, 0, 1, 2) - input_layout.addWidget(self.GPT_model_button, 2, 2) - - input_layout.addWidget(self.SoVITS_model_label, 3, 0) - input_layout.addWidget(self.SoVITS_model_input, 4, 0, 1, 2) - input_layout.addWidget(self.SoVITS_model_button, 4, 2) - - input_layout.addWidget(self.ref_audio_label, 5, 0) - input_layout.addWidget(self.ref_audio_input, 6, 0, 1, 2) - input_layout.addWidget(self.ref_audio_button, 6, 2) - - input_layout.addWidget(self.ref_language_label, 7, 0) - input_layout.addWidget(self.ref_language_combobox, 8, 0, 1, 1) - input_layout.addWidget(self.ref_text_label, 9, 0) - input_layout.addWidget(self.ref_text_input, 10, 0, 1, 2) - input_layout.addWidget(self.ref_text_button, 10, 2) - - input_layout.addWidget(self.target_language_label, 11, 0) - input_layout.addWidget(self.target_language_combobox, 12, 0, 1, 1) - input_layout.addWidget(self.target_text_label, 13, 0) - input_layout.addWidget(self.target_text_input, 14, 0, 1, 2) - input_layout.addWidget(self.target_text_button, 14, 2) - - input_layout.addWidget(self.output_label, 15, 0) - input_layout.addWidget(self.output_input, 16, 0, 1, 2) - input_layout.addWidget(self.output_button, 16, 2) - - main_layout.addLayout(input_layout) - - output_layout = QVBoxLayout() - output_layout.addWidget(self.output_text) - main_layout.addLayout(output_layout) - - main_layout.addWidget(self.synthesize_button) - - main_layout.addWidget(self.clear_output_button) - - main_layout.addWidget(self.status_bar) - - self.central_widget = QWidget() - self.central_widget.setLayout(main_layout) - self.setCentralWidget(self.central_widget) - - def dragEnterEvent(self, event): - if event.mimeData().hasUrls(): - event.acceptProposedAction() - - def dropEvent(self, event): - if event.mimeData().hasUrls(): - file_paths = [url.toLocalFile() for url in event.mimeData().urls()] - - if len(file_paths) == 1: - self.update_ref_audio(file_paths[0]) - else: - self.update_ref_audio(", ".join(file_paths)) - - def add_drag_drop_events(self, widgets): - for widget in widgets: - widget.setAcceptDrops(True) - widget.installEventFilter(self) - - def eventFilter(self, obj, event): - if event.type() == QEvent.DragEnter: - mime_data = event.mimeData() - if mime_data.hasUrls(): - event.acceptProposedAction() - - elif event.type() == QEvent.Drop: - mime_data = event.mimeData() - if mime_data.hasUrls(): - event.acceptProposedAction() - - return super().eventFilter(obj, event) - - def select_GPT_model(self): - file_path, _ = QFileDialog.getOpenFileName(self, "选择GPT模型文件", "", "GPT Files (*.ckpt)") - if file_path: - self.GPT_model_input.setText(file_path) - - def select_SoVITS_model(self): - file_path, _ = QFileDialog.getOpenFileName(self, "选择SoVITS模型文件", "", "SoVITS Files (*.pth)") - if file_path: - self.SoVITS_model_input.setText(file_path) - - def select_ref_audio(self): - file_path, _ = QFileDialog.getOpenFileName(self, "选择参考音频文件", "", "Audio Files (*.wav *.mp3)") - if file_path: - self.update_ref_audio(file_path) 
- - def upload_ref_text(self): - file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") - if file_path: - with open(file_path, 'r', encoding='utf-8') as file: - content = file.read() - self.ref_text_input.setText(content) - - def upload_target_text(self): - file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") - if file_path: - with open(file_path, 'r', encoding='utf-8') as file: - content = file.read() - self.target_text_input.setText(content) - - def select_output_path(self): - options = QFileDialog.Options() - options |= QFileDialog.DontUseNativeDialog - options |= QFileDialog.ShowDirsOnly - - folder_dialog = QFileDialog() - folder_dialog.setOptions(options) - folder_dialog.setFileMode(QFileDialog.Directory) - - if folder_dialog.exec_(): - folder_path = folder_dialog.selectedFiles()[0] - self.output_input.setText(folder_path) - - def update_ref_audio(self, file_path): - self.ref_audio_input.setText(file_path) - - def clear_output(self): - self.output_text.clear() - - def synthesize(self): - GPT_model_path = self.GPT_model_input.text() - SoVITS_model_path = self.SoVITS_model_input.text() - ref_audio_path = self.ref_audio_input.text() - language_combobox = self.ref_language_combobox.currentText() - language_combobox = i18n(language_combobox) - ref_text = self.ref_text_input.text() - target_language_combobox = self.target_language_combobox.currentText() - target_language_combobox = i18n(target_language_combobox) - target_text = self.target_text_input.text() - output_path = self.output_input.text() - - if GPT_model_path != self.gpt_path: - change_gpt_weights(gpt_path=GPT_model_path) - self.gpt_path = GPT_model_path - if SoVITS_model_path != self.sovits_path: - change_sovits_weights(sovits_path=SoVITS_model_path) - self.sovits_path = SoVITS_model_path - - synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path, - prompt_text=ref_text, - prompt_language=language_combobox, - text=target_text, - text_language=target_language_combobox) - - result_list = list(synthesis_result) - - if result_list: - last_sampling_rate, last_audio_data = result_list[-1] - output_wav_path = os.path.join(output_path, "output.wav") - sf.write(output_wav_path, last_audio_data, last_sampling_rate) - - result = "Audio saved to " + output_wav_path - - self.status_bar.showMessage("合成完成!输出路径:" + output_wav_path, 5000) - self.output_text.append("处理结果:\n" + result) - - -if use_webui: +if __name__ == '__main__': app.queue(concurrency_count=511, max_size=1022).launch( server_name="0.0.0.0", inbrowser=True, @@ -954,8 +648,3 @@ def synthesize(self): server_port=infer_ttswebui, quiet=True, ) -else: - app = QApplication(sys.argv) - mainWin = GPTSoVITSGUI() - mainWin.show() - sys.exit(app.exec_()) \ No newline at end of file diff --git a/EVT_Core/TTS/VITS/Convert.py b/EVT_Core/TTS/VITS/Convert.py index 6641c44..d9c5f02 100644 --- a/EVT_Core/TTS/VITS/Convert.py +++ b/EVT_Core/TTS/VITS/Convert.py @@ -1,137 +1,48 @@ import os -import re -import langdetect -#import IPython.display as ipd -import torch -#from torch.utils.data import DataLoader +import sys from typing import Optional +from subprocess import Popen from pathlib import Path -from scipy.io.wavfile import write -from datetime import datetime -from .vits.Commons import intersperse -from .vits.Utils import get_hparams_from_file, load_checkpoint -#from .vits.Data_Utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate -from .vits.Models import SynthesizerTrn -from .vits.text import text_to_sequence -from 
.vits.text.symbols import symbols +current_dir = Path(__file__).absolute().parent.as_posix() +os.chdir(current_dir) +sys.path.insert(0, f"{current_dir}/VITS2_finetuning") -if torch.cuda.is_available() is True: - device = 'cuda:0' -else: - device = 'cpu' +python_exec = sys.executable or "python" -def Get_Config_Path(ConfigPath): - if Path(ConfigPath).is_dir(): - ConfigPaths = [File for File in os.listdir(ConfigPath) if Path(File).suffix == '.json'] - ConfigPath = sorted(ConfigPaths, key = lambda ConfigPath: re.sub(r'[A-Za-z]+', '', Path(ConfigPath).name))[-1] - return ConfigPath +p_infer = None -def Get_Model_Path(ModelPath): - if Path(ModelPath).is_dir(): - ModelPaths = [File for File in os.listdir(ModelPath) if Path(File).suffix == '.pth' and 'G_' in File] - ModelPath = sorted(ModelPaths, key = lambda ModelPath: re.sub(r'G_[A-Za-z]+', '', Path(ModelPath).name))[-1] - return ModelPath - - -class Voice_Converting: +def Convert( + Config_Path_Load: str = ..., + Model_Path_Load: str = ..., + Text: str = '请输入语句', + Language: Optional[str] = None, + Speaker: str = ..., + EmotionStrength: float = .667, + PhonemeDuration: float = 0.8, + SpeechRate: float = 1., + Audio_Path_Save: str = "audio.wav" +): ''' Convert text to speech and save as audio files ''' - def __init__(self, - Config_Path_Load: str = ..., - Model_Path_Load: str = ..., - Text: str = '请输入语句', - Language: Optional[str] = None, - Speaker: str = ..., - EmotionStrength: float = .667, - PhonemeDuration: float = 0.8, - SpeechRate: float = 1., - Audio_Path_Save: str = ... - ): - self.Config_Path_Load = Get_Config_Path(Config_Path_Load) - self.Model_Path_Load = Get_Model_Path(Model_Path_Load) - self.Text = Text - self.Language = Language - self.Speaker = Speaker - self.EmotionStrength = EmotionStrength - self.PhonemeDuration = PhonemeDuration - self.SpeechRate = SpeechRate - self.Audio_Path_Save = Audio_Path_Save - - os.remove(Audio_Path_Save) if Path(Audio_Path_Save).exists() else os.makedirs(Path(Audio_Path_Save).parent.__str__(), exist_ok = True) - - def Converting(self): - hps = get_hparams_from_file(self.Config_Path_Load) - - net_g = SynthesizerTrn( - len(symbols), - 80 if 'use_mel_posterior_encoder' in hps.model.keys() and hps.model.use_mel_posterior_encoder == True else hps.data.filter_length // 2 + 1, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - **hps.model).to(device) - _ = net_g.eval() - - _ = load_checkpoint(self.Model_Path_Load, net_g, None) - - def get_text(text, hps): - text_norm = text_to_sequence(text, hps.data.text_cleaners) - if hps.data.add_blank: - text_norm = intersperse(text_norm, 0) - text_norm = torch.LongTensor(text_norm) - return text_norm - - def langdetector(text): # from PolyLangVITS - try: - LangDict = { - 'zh-cn': 'ZH', - 'en': 'EN', - 'ja': 'JA' - } - Lang = LangDict.get(langdetect.detect(text).lower()) - return f'[{Lang}]{text}[{Lang}]' - except Exception as e: - raise Exception("Failed to detect language!") - - stn_tst = get_text( - langdetector(re.sub(r"[\[\]\(\)\{\}]", "", self.Text)) if self.Language is not None else f"[{self.Language}]{self.Text}[{self.Language}]", - hps - ) - - with torch.no_grad(): - x_tst = stn_tst.to(device).unsqueeze(0) - x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device) - speakers = list(hps.speakers.keys()) if hasattr(hps.speakers, 'keys') else hps.speakers - sid = torch.LongTensor([speakers.index(self.Speaker)]).to(device) if self.Speaker is not None else 0 - audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, 
noise_scale=self.EmotionStrength, noise_scale_w=self.PhonemeDuration, length_scale=self.SpeechRate)[0][0,0].data.cpu().float().numpy() - write(os.path.normpath(self.Audio_Path_Save), hps.data.sampling_rate, audio) #ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False)) - - -''' # Voice Conversion -dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data) -collate_fn = TextAudioSpeakerCollate() -loader = DataLoader(dataset, num_workers=8, shuffle=False, - batch_size=1, pin_memory=True, - drop_last=True, collate_fn=collate_fn) -data_list = list(loader) - -with torch.no_grad(): - x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.to(device) for x in data_list[0]] - sid_tgt1 = torch.LongTensor([1]).to(device) - sid_tgt2 = torch.LongTensor([2]).to(device) - sid_tgt3 = torch.LongTensor([4]).to(device) - audio1 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data.cpu().float().numpy() - audio2 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt2)[0][0,0].data.cpu().float().numpy() - audio3 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt3)[0][0,0].data.cpu().float().numpy() -print("Original SID: %d" % sid_src.item()) -ipd.display(ipd.Audio(y[0].cpu().numpy(), rate=hps.data.sampling_rate, normalize=False)) -print("Converted SID: %d" % sid_tgt1.item()) -ipd.display(ipd.Audio(audio1, rate=hps.data.sampling_rate, normalize=False)) -print("Converted SID: %d" % sid_tgt2.item()) -ipd.display(ipd.Audio(audio2, rate=hps.data.sampling_rate, normalize=False)) -print("Converted SID: %d" % sid_tgt3.item()) -ipd.display(ipd.Audio(audio3, rate=hps.data.sampling_rate, normalize=False)) -''' \ No newline at end of file + global p_infer + if p_infer is None: + os.environ['Config_Path_Load'] = str(Config_Path_Load) + os.environ['Model_Path_Load'] = str(Model_Path_Load) + os.environ['Text'] = str(Text) + os.environ['Language'] = str(Language) + os.environ['Speaker'] = str(Speaker) + os.environ['EmotionStrength'] = str(EmotionStrength) + os.environ['PhonemeDuration'] = str(PhonemeDuration) + os.environ['SpeechRate'] = str(SpeechRate) + os.environ['Audio_Path_Save'] = str(Audio_Path_Save) + print("Start converting...") + p_infer = Popen(f'"{python_exec}" "VITS2_finetuning/inference.py"', shell = True) + p_infer.wait() + p_infer = None + else: + print("已有正在进行的推理任务,需先终止才能开启下一次任务") \ No newline at end of file diff --git a/EVT_Core/Train/VITS/vits/Attentions.py b/EVT_Core/TTS/VITS/VITS2_finetuning/Attentions.py similarity index 99% rename from EVT_Core/Train/VITS/vits/Attentions.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/Attentions.py index 77314d8..4fc2a33 100644 --- a/EVT_Core/Train/VITS/vits/Attentions.py +++ b/EVT_Core/TTS/VITS/VITS2_finetuning/Attentions.py @@ -4,8 +4,8 @@ from torch.nn import functional as F from torch.nn.utils import remove_weight_norm, weight_norm -from .Modules import LayerNorm -from .Commons import subsequent_mask, convert_pad_shape, fused_add_tanh_sigmoid_multiply +from modules import LayerNorm +from commons import subsequent_mask, convert_pad_shape, fused_add_tanh_sigmoid_multiply class MultiHeadAttention(nn.Module): diff --git a/EVT_Core/Train/VITS/vits/Commons.py b/EVT_Core/TTS/VITS/VITS2_finetuning/Commons.py similarity index 100% rename from EVT_Core/Train/VITS/vits/Commons.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/Commons.py diff --git a/EVT_Core/Train/VITS/vits/Models.py b/EVT_Core/TTS/VITS/VITS2_finetuning/Models.py similarity index 93% 
rename from EVT_Core/Train/VITS/vits/Models.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/Models.py index 521e7c3..c207b3c 100644 --- a/EVT_Core/Train/VITS/vits/Models.py +++ b/EVT_Core/TTS/VITS/VITS2_finetuning/Models.py @@ -5,10 +5,10 @@ from torch.nn import Conv1d, ConvTranspose1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from . import Modules -from . import Attentions -from . import Commons -from . import monotonic_align +import modules +import attentions +import commons +import monotonic_align AVAILABLE_FLOW_TYPES = [ @@ -43,25 +43,25 @@ def __init__(self, self.n_flows = n_flows self.gin_channels = gin_channels - self.log_flow = Modules.Log() + self.log_flow = modules.Log() self.flows = nn.ModuleList() - self.flows.append(Modules.ElementwiseAffine(2)) + self.flows.append(modules.ElementwiseAffine(2)) for i in range(n_flows): - self.flows.append(Modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) - self.flows.append(Modules.Flip()) + self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) + self.flows.append(modules.Flip()) self.post_pre = nn.Conv1d(1, filter_channels, 1) self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.post_convs = Modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) + self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) self.post_flows = nn.ModuleList() - self.post_flows.append(Modules.ElementwiseAffine(2)) + self.post_flows.append(modules.ElementwiseAffine(2)) for i in range(4): - self.post_flows.append(Modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) - self.post_flows.append(Modules.Flip()) + self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) + self.post_flows.append(modules.Flip()) self.pre = nn.Conv1d(in_channels, filter_channels, 1) self.proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.convs = Modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) + self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, filter_channels, 1) @@ -131,9 +131,9 @@ def __init__(self, self.drop = nn.Dropout(p_dropout) self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2) - self.norm_1 = Modules.LayerNorm(filter_channels) + self.norm_1 = modules.LayerNorm(filter_channels) self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2) - self.norm_2 = Modules.LayerNorm(filter_channels) + self.norm_2 = modules.LayerNorm(filter_channels) self.proj = nn.Conv1d(filter_channels, 1, 1) if gin_channels != 0: @@ -176,15 +176,15 @@ def __init__(self, self.drop = nn.Dropout(p_dropout) self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2) - # self.norm_1 = Modules.LayerNorm(filter_channels) + # self.norm_1 = modules.LayerNorm(filter_channels) self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) - # self.norm_2 = Modules.LayerNorm(filter_channels) + # self.norm_2 = modules.LayerNorm(filter_channels) self.dur_proj = nn.Conv1d(1, filter_channels, 1) self.pre_out_conv_1 = nn.Conv1d(2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) - self.pre_out_norm_1 = Modules.LayerNorm(filter_channels) + self.pre_out_norm_1 = modules.LayerNorm(filter_channels) self.pre_out_conv_2 = 
nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) - self.pre_out_norm_2 = Modules.LayerNorm(filter_channels) + self.pre_out_norm_2 = modules.LayerNorm(filter_channels) # if gin_channels != 0: # self.cond = nn.Conv1d(gin_channels, in_channels, 1) @@ -246,21 +246,21 @@ def __init__( self.conv_1 = nn.Conv1d( in_channels, filter_channels, kernel_size, padding=kernel_size // 2 ) - self.norm_1 = Modules.LayerNorm(filter_channels) + self.norm_1 = modules.LayerNorm(filter_channels) self.conv_2 = nn.Conv1d( filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 ) - self.norm_2 = Modules.LayerNorm(filter_channels) + self.norm_2 = modules.LayerNorm(filter_channels) self.dur_proj = nn.Conv1d(1, filter_channels, 1) self.pre_out_conv_1 = nn.Conv1d( 2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 ) - self.pre_out_norm_1 = Modules.LayerNorm(filter_channels) + self.pre_out_norm_1 = modules.LayerNorm(filter_channels) self.pre_out_conv_2 = nn.Conv1d( filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 ) - self.pre_out_norm_2 = Modules.LayerNorm(filter_channels) + self.pre_out_norm_2 = modules.LayerNorm(filter_channels) # if gin_channels != 0: # self.cond = nn.Conv1d(gin_channels, in_channels, 1) @@ -329,7 +329,7 @@ def __init__(self, nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) # Transformer Encoder - self.encoder = Attentions.Encoder( + self.encoder = attentions.Encoder( hidden_channels, filter_channels, n_heads, @@ -343,7 +343,7 @@ def __init__(self, def forward(self, x, x_lengths, g=None): x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(Commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.encoder(x * x_mask, x_mask, g=g) stats = self.proj(x) * x_mask @@ -374,7 +374,7 @@ def __init__(self, self.mean_only = mean_only self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.pre_transformer = Attentions.Encoder( + self.pre_transformer = attentions.Encoder( hidden_channels, hidden_channels, n_heads=2, @@ -383,7 +383,7 @@ def __init__(self, p_dropout=p_dropout, # window_size=None, ) - self.enc = Modules.WN( + self.enc = modules.WN( hidden_channels, kernel_size, dilation_rate, @@ -439,7 +439,7 @@ def __init__(self, self.half_channels = channels // 2 self.mean_only = mean_only # vits2 - self.pre_transformer = Attentions.Encoder( + self.pre_transformer = attentions.Encoder( self.half_channels, self.half_channels, n_heads=2, @@ -450,7 +450,7 @@ def __init__(self, ) self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = Modules.WN( + self.enc = modules.WN( hidden_channels, kernel_size, dilation_rate, @@ -459,7 +459,7 @@ def __init__(self, gin_channels=gin_channels, ) # vits2 - self.post_transformer = Attentions.Encoder( + self.post_transformer = attentions.Encoder( self.hidden_channels, self.hidden_channels, n_heads=2, @@ -523,7 +523,7 @@ def __init__(self, self.mean_only = mean_only self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = Attentions.FFT( + self.enc = attentions.FFT( hidden_channels, filter_channels, n_heads, @@ -576,7 +576,7 @@ def __init__(self, self.mean_only = mean_only self.residual_connection = residual_connection # vits2 - self.pre_transformer = Attentions.Encoder( + self.pre_transformer = attentions.Encoder( self.half_channels, self.half_channels, 
n_heads=2, @@ -679,7 +679,7 @@ def __init__(self, mean_only=True ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) elif transformer_flow_type == "pre_conv2": for i in range(n_flows): self.flows.append( @@ -693,7 +693,7 @@ def __init__(self, mean_only=True, ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) elif transformer_flow_type == "fft": for i in range(n_flows): self.flows.append( @@ -707,11 +707,11 @@ def __init__(self, mean_only=True ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) elif transformer_flow_type == "mono_layer_inter_residual": for i in range(n_flows): self.flows.append( - Modules.ResidualCouplingLayer( + modules.ResidualCouplingLayer( channels, hidden_channels, kernel_size, @@ -721,7 +721,7 @@ def __init__(self, mean_only=True ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) self.flows.append( MonoTransformerFlowLayer( channels, hidden_channels, mean_only=True @@ -730,7 +730,7 @@ def __init__(self, elif transformer_flow_type == "mono_layer_post_residual": for i in range(n_flows): self.flows.append( - Modules.ResidualCouplingLayer( + modules.ResidualCouplingLayer( channels, hidden_channels, kernel_size, @@ -740,7 +740,7 @@ def __init__(self, mean_only=True, ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) self.flows.append( MonoTransformerFlowLayer( channels, @@ -752,7 +752,7 @@ def __init__(self, else: for i in range(n_flows): self.flows.append( - Modules.ResidualCouplingLayer( + modules.ResidualCouplingLayer( channels, hidden_channels, kernel_size, @@ -762,7 +762,7 @@ def __init__(self, mean_only=True ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) def forward(self, x, x_mask, g=None, reverse=False): if not reverse: @@ -794,11 +794,11 @@ def __init__(self, self.gin_channels = gin_channels self.pre = nn.Conv1d(in_channels, hidden_channels, 1) - self.enc = Modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) + self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) def forward(self, x, x_lengths, g=None): # x: LinearSpectrum; g: GlobalCondition - x_mask = torch.unsqueeze(Commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) stats = self.proj(x) * x_mask @@ -822,7 +822,7 @@ def __init__(self, self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) - resblock = Modules.ResBlock1 if resblock == '1' else Modules.ResBlock2 + resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 self.ups = nn.ModuleList() for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): @@ -839,7 +839,7 @@ def __init__(self, self.resblocks.append(resblock(ch, k, d)) self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) - self.ups.apply(Commons.init_weights) + self.ups.apply(commons.init_weights) if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) @@ -850,7 +850,7 @@ def forward(self, x, g=None): x = x + self.cond(g) for i in range(self.num_upsamples): - x = F.leaky_relu(x, Modules.LRELU_SLOPE) + x = F.leaky_relu(x, modules.LRELU_SLOPE) x = self.ups[i](x) xs = None for j in 
range(self.num_kernels): @@ -892,7 +892,7 @@ def __init__(self, 32, (kernel_size, 1), (stride, 1), - padding=(Commons.get_padding(kernel_size, 1), 0), + padding=(commons.get_padding(kernel_size, 1), 0), ) ), norm_f( @@ -901,7 +901,7 @@ def __init__(self, 128, (kernel_size, 1), (stride, 1), - padding=(Commons.get_padding(kernel_size, 1), 0), + padding=(commons.get_padding(kernel_size, 1), 0), ) ), norm_f( @@ -910,7 +910,7 @@ def __init__(self, 512, (kernel_size, 1), (stride, 1), - padding=(Commons.get_padding(kernel_size, 1), 0), + padding=(commons.get_padding(kernel_size, 1), 0), ) ), norm_f( @@ -919,7 +919,7 @@ def __init__(self, 1024, (kernel_size, 1), (stride, 1), - padding=(Commons.get_padding(kernel_size, 1), 0), + padding=(commons.get_padding(kernel_size, 1), 0), ) ), norm_f( @@ -928,7 +928,7 @@ def __init__(self, 1024, (kernel_size, 1), 1, - padding=(Commons.get_padding(kernel_size, 1), 0), + padding=(commons.get_padding(kernel_size, 1), 0), ) ), ] @@ -948,7 +948,7 @@ def forward(self, x): for l in self.convs: x = l(x) - x = F.leaky_relu(x, Modules.LRELU_SLOPE) + x = F.leaky_relu(x, modules.LRELU_SLOPE) fmap.append(x) x = self.conv_post(x) fmap.append(x) @@ -978,7 +978,7 @@ def forward(self, x): for l in self.convs: x = l(x) - x = F.leaky_relu(x, Modules.LRELU_SLOPE) + x = F.leaky_relu(x, modules.LRELU_SLOPE) fmap.append(x) x = self.conv_post(x) fmap.append(x) @@ -1163,7 +1163,7 @@ def forward(self, x, x_lengths, y, y_lengths, sid=None): m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) - z_slice, ids_slice = Commons.rand_slice_segments(z, y_lengths, self.segment_size) + z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size) o = self.dec(z_slice, g=g) return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q), (x, logw, logw_) @@ -1183,9 +1183,9 @@ def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_sca w = torch.exp(logw) * x_mask * length_scale w_ceil = torch.ceil(w) y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() - y_mask = torch.unsqueeze(Commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) - attn = Commons.generate_path(w_ceil, attn_mask) + attn = commons.generate_path(w_ceil, attn_mask) m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] diff --git a/EVT_Core/TTS/VITS/vits/Modules.py b/EVT_Core/TTS/VITS/VITS2_finetuning/Modules.py similarity index 99% rename from EVT_Core/TTS/VITS/vits/Modules.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/Modules.py index 62da8a3..7a845a1 100644 --- a/EVT_Core/TTS/VITS/vits/Modules.py +++ b/EVT_Core/TTS/VITS/VITS2_finetuning/Modules.py @@ -5,8 +5,8 @@ from torch.nn import Conv1d from torch.nn.utils import weight_norm, remove_weight_norm -from .Commons import fused_add_tanh_sigmoid_multiply, init_weights, get_padding -from .Transforms import piecewise_rational_quadratic_transform +from commons import fused_add_tanh_sigmoid_multiply, init_weights, get_padding +from transforms import piecewise_rational_quadratic_transform LRELU_SLOPE = 0.1 diff --git a/EVT_Core/TTS/VITS/vits/Transforms.py 
b/EVT_Core/TTS/VITS/VITS2_finetuning/Transforms.py similarity index 100% rename from EVT_Core/TTS/VITS/vits/Transforms.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/Transforms.py diff --git a/EVT_Core/Train/VITS/vits/Utils.py b/EVT_Core/TTS/VITS/VITS2_finetuning/Utils.py similarity index 90% rename from EVT_Core/Train/VITS/vits/Utils.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/Utils.py index a9e9f1c..d9d6485 100644 --- a/EVT_Core/Train/VITS/vits/Utils.py +++ b/EVT_Core/TTS/VITS/VITS2_finetuning/Utils.py @@ -1,4 +1,5 @@ import os +import re import glob import sys import shutil @@ -11,6 +12,7 @@ import matplotlib.pylab as plt import numpy as np import torch +from typing import Optional from pathlib import Path @@ -151,22 +153,6 @@ def load_audiopaths_sid_text(filename, split = "|"): return audiopaths_sid_text -def get_hparams( - Config_Path: str, - Model_Dir: str -): - if not os.path.exists(Model_Dir): - os.makedirs(Model_Dir) - - with open(Config_Path, 'r', encoding = 'utf-8') as f: - data = f.read() - config = json.loads(data) - - hparams = HParams(**config) - hparams.model_dir = Model_Dir - return hparams - - def add_elements( Iterable1, Iterable2 @@ -242,4 +228,34 @@ def __contains__(self, key): return key in self.__dict__ def __repr__(self): - return self.__dict__.__repr__() \ No newline at end of file + return self.__dict__.__repr__() + + +def get_hparams( + Config_Path: str, + Model_Dir: Optional[str] = None +): + with open(Config_Path, 'r', encoding = 'utf-8') as f: + data = f.read() + config = json.loads(data) + hparams = HParams(**config) + + if Model_Dir is not None: + os.makedirs(Model_Dir) if not Path(Model_Dir).exists() else None + hparams.model_dir = Model_Dir + + return hparams + + +def Get_Config_Path(ConfigPath): + if Path(ConfigPath).is_dir(): + ConfigPaths = [File for File in os.listdir(ConfigPath) if Path(File).suffix == '.json'] + ConfigPath = sorted(ConfigPaths, key = lambda ConfigPath: re.sub(r'[A-Za-z]+', '', Path(ConfigPath).name))[-1] + return ConfigPath + + +def Get_Model_Path(ModelPath): + if Path(ModelPath).is_dir(): + ModelPaths = [File for File in os.listdir(ModelPath) if Path(File).suffix == '.pth' and 'G_' in File] + ModelPath = sorted(ModelPaths, key = lambda ModelPath: re.sub(r'G_[A-Za-z]+', '', Path(ModelPath).name))[-1] + return ModelPath \ No newline at end of file diff --git a/EVT_Core/TTS/VITS/vits/__init__.py b/EVT_Core/TTS/VITS/VITS2_finetuning/__init__.py similarity index 100% rename from EVT_Core/TTS/VITS/vits/__init__.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/__init__.py diff --git a/EVT_Core/TTS/VITS/VITS2_finetuning/inference.py b/EVT_Core/TTS/VITS/VITS2_finetuning/inference.py new file mode 100644 index 0000000..766e628 --- /dev/null +++ b/EVT_Core/TTS/VITS/VITS2_finetuning/inference.py @@ -0,0 +1,98 @@ +import os +import re +import argparse +import langdetect +#import IPython.display as ipd +import torch +from typing import Optional +from pathlib import Path +from scipy.io.wavfile import write +from datetime import datetime + +from commons import intersperse +from utils import get_hparams, load_checkpoint, Get_Config_Path, Get_Model_Path +from models import SynthesizerTrn +from text import text_to_sequence +from text.symbols import symbols + + +if torch.cuda.is_available() is True: + device = 'cuda:0' +else: + device = 'cpu' + + +parser = argparse.ArgumentParser() +parser.add_argument("--Config_Path_Load", type = str, default = "...") +parser.add_argument("--Model_Path_Load", type = str, default = "...") 
+parser.add_argument("--Text", type = str, default = "请输入语句") +parser.add_argument("--Language", type = Optional[str], default = None) +parser.add_argument("--Speaker", type = str, default = "...") +parser.add_argument("--EmotionStrength", type = float, default = .667) +parser.add_argument("--PhonemeDuration", type = float, default = 0.8) +parser.add_argument("--SpeechRate", type = float, default = 1.) +parser.add_argument("--Audio_Path_Save", type = str, default = "audio.wav") +args = parser.parse_args() +#logging.info(str(args)) + +Config_Path_Load = Get_Config_Path(os.environ.get('FileList_Path_Training', args.Config_Path_Load)) +Model_Path_Load = Get_Model_Path(os.environ.get('Model_Path_Load', args.Model_Path_Load)) +Text = str(os.environ.get('Text', args.Text)) +Language = str(os.environ.get('Language', args.Language)) +Speaker = str(os.environ.get('Speaker', args.Speaker)) +EmotionStrength = float(os.environ.get('EmotionStrength', args.EmotionStrength)) +PhonemeDuration = float(os.environ.get('PhonemeDuration', args.PhonemeDuration)) +SpeechRate = float(os.environ.get('SpeechRate', args.SpeechRate)) +Audio_Path_Save = str(os.environ.get('Audio_Path_Save', args.Audio_Path_Save)) + +os.remove(Audio_Path_Save) if Path(Audio_Path_Save).exists() else os.makedirs(Path(Audio_Path_Save).parent.__str__(), exist_ok = True) + + +def Convert(): + hps = get_hparams(Config_Path_Load) + + net_g = SynthesizerTrn( + len(symbols), + 80 if 'use_mel_posterior_encoder' in hps.model.keys() and hps.model.use_mel_posterior_encoder == True else hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model).to(device) + _ = net_g.eval() + + _ = load_checkpoint(Model_Path_Load, net_g, None) + + def get_text(text, hps): + text_norm = text_to_sequence(text, hps.data.text_cleaners) + if hps.data.add_blank: + text_norm = intersperse(text_norm, 0) + text_norm = torch.LongTensor(text_norm) + return text_norm + + def langdetector(text): # from PolyLangVITS + try: + LangDict = { + 'zh-cn': 'ZH', + 'en': 'EN', + 'ja': 'JA' + } + Lang = LangDict.get(langdetect.detect(text).lower()) + return f'[{Lang}]{text}[{Lang}]' + except Exception as e: + raise Exception("Failed to detect language!") + + stn_tst = get_text( + langdetector(re.sub(r"[\[\]\(\)\{\}]", "", Text)) if Language is not None else f"[{Language}]{Text}[{Language}]", + hps + ) + + with torch.no_grad(): + x_tst = stn_tst.to(device).unsqueeze(0) + x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device) + speakers = list(hps.speakers.keys()) if hasattr(hps.speakers, 'keys') else hps.speakers + sid = torch.LongTensor([speakers.index(Speaker)]).to(device) if Speaker is not None else 0 + audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=EmotionStrength, noise_scale_w=PhonemeDuration, length_scale=SpeechRate)[0][0,0].data.cpu().float().numpy() + write(os.path.normpath(Audio_Path_Save), hps.data.sampling_rate, audio) #ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False)) + + +if __name__ == "__main__": + Convert() \ No newline at end of file diff --git a/EVT_Core/Train/VITS/vits/monotonic_align/Core.py b/EVT_Core/TTS/VITS/VITS2_finetuning/monotonic_align/Core.py similarity index 100% rename from EVT_Core/Train/VITS/vits/monotonic_align/Core.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/monotonic_align/Core.py diff --git a/EVT_Core/Train/VITS/vits/monotonic_align/__init__.py b/EVT_Core/TTS/VITS/VITS2_finetuning/monotonic_align/__init__.py similarity index 94% 
rename from EVT_Core/Train/VITS/vits/monotonic_align/__init__.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/monotonic_align/__init__.py index 3d3d289..81b52ab 100644 --- a/EVT_Core/Train/VITS/vits/monotonic_align/__init__.py +++ b/EVT_Core/TTS/VITS/VITS2_finetuning/monotonic_align/__init__.py @@ -1,7 +1,7 @@ import numpy as np import torch -from .Core import maximum_path_nb +from .core import maximum_path_nb def maximum_path(neg_cent, mask): diff --git a/EVT_Core/Train/VITS/vits/text/LICENSE b/EVT_Core/TTS/VITS/VITS2_finetuning/text/LICENSE similarity index 100% rename from EVT_Core/Train/VITS/vits/text/LICENSE rename to EVT_Core/TTS/VITS/VITS2_finetuning/text/LICENSE diff --git a/EVT_Core/Train/VITS/vits/text/__init__.py b/EVT_Core/TTS/VITS/VITS2_finetuning/text/__init__.py similarity index 100% rename from EVT_Core/Train/VITS/vits/text/__init__.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/text/__init__.py diff --git a/EVT_Core/TTS/VITS/vits/text/cleaners.py b/EVT_Core/TTS/VITS/VITS2_finetuning/text/cleaners.py similarity index 100% rename from EVT_Core/TTS/VITS/vits/text/cleaners.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/text/cleaners.py diff --git a/EVT_Core/TTS/VITS/vits/text/english.py b/EVT_Core/TTS/VITS/VITS2_finetuning/text/english.py similarity index 100% rename from EVT_Core/TTS/VITS/vits/text/english.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/text/english.py diff --git a/EVT_Core/TTS/VITS/vits/text/japanese.py b/EVT_Core/TTS/VITS/VITS2_finetuning/text/japanese.py similarity index 100% rename from EVT_Core/TTS/VITS/vits/text/japanese.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/text/japanese.py diff --git a/EVT_Core/TTS/VITS/vits/text/mandarin.py b/EVT_Core/TTS/VITS/VITS2_finetuning/text/mandarin.py similarity index 100% rename from EVT_Core/TTS/VITS/vits/text/mandarin.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/text/mandarin.py diff --git a/EVT_Core/TTS/VITS/vits/text/symbols.py b/EVT_Core/TTS/VITS/VITS2_finetuning/text/symbols.py similarity index 100% rename from EVT_Core/TTS/VITS/vits/text/symbols.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/text/symbols.py diff --git a/EVT_Core/TTS/VITS/vits/Commons.py b/EVT_Core/TTS/VITS/vits/Commons.py deleted file mode 100644 index ce8d26f..0000000 --- a/EVT_Core/TTS/VITS/vits/Commons.py +++ /dev/null @@ -1,78 +0,0 @@ -import torch -from torch.nn import functional as F - - -def init_weights(m, mean=0.0, std=0.01): - classname = m.__class__.__name__ - if classname.find("Conv") != -1: - m.weight.data.normal_(mean, std) - - -def get_padding(kernel_size, dilation=1): - return int((kernel_size*dilation - dilation)/2) - - -def intersperse(lst, item): - result = [item] * (len(lst) * 2 + 1) - result[1::2] = lst - return result - - -def slice_segments(x, ids_str, segment_size=4): - ret = torch.zeros_like(x[:, :, :segment_size]) - for i in range(x.size(0)): - idx_str = ids_str[i] - idx_end = idx_str + segment_size - ret[i] = x[i, :, idx_str:idx_end] - return ret - - -def rand_slice_segments(x, x_lengths=None, segment_size=4): - b, d, t = x.size() - if x_lengths is None: - x_lengths = t - ids_str_max = x_lengths - segment_size + 1 - ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) - ret = slice_segments(x, ids_str, segment_size) - return ret, ids_str - - -@torch.jit.script -def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.sigmoid(in_act[:, 
n_channels_int:, :]) - acts = t_act * s_act - return acts - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape - - -def sequence_mask(length, max_length=None): - if max_length is None: - max_length = length.max() - x = torch.arange(max_length, dtype=length.dtype, device=length.device) - return x.unsqueeze(0) < length.unsqueeze(1) - - -def generate_path(duration, mask): - """ - duration: [b, 1, t_x] - mask: [b, 1, t_y, t_x] - """ - device = duration.device - - b, _, t_y, t_x = mask.shape - cum_duration = torch.cumsum(duration, -1) - - cum_duration_flat = cum_duration.view(b * t_x) - path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) - path = path.view(b, t_x, t_y) - path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] - path = path.unsqueeze(1).transpose(2,3) * mask - return path \ No newline at end of file diff --git a/EVT_Core/TTS/VITS/vits/Utils.py b/EVT_Core/TTS/VITS/vits/Utils.py deleted file mode 100644 index 4a6a17c..0000000 --- a/EVT_Core/TTS/VITS/vits/Utils.py +++ /dev/null @@ -1,78 +0,0 @@ -import os -import sys -import logging -logging.basicConfig(stream = sys.stdout, level = logging.DEBUG) -logger = logging -import json -import torch - - -MATPLOTLIB_FLAG = False - - -def load_checkpoint(checkpoint_path, model, optimizer=None): - assert os.path.isfile(checkpoint_path) - checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') - iteration = checkpoint_dict['iteration'] - learning_rate = checkpoint_dict['learning_rate'] - if optimizer is not None: - optimizer.load_state_dict(checkpoint_dict['optimizer']) - saved_state_dict = checkpoint_dict['model'] - if hasattr(model, 'module'): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - new_state_dict= {} - for k, v in state_dict.items(): - try: - new_state_dict[k] = saved_state_dict[k] - except: - logger.info("%s is not in the checkpoint" % k) - new_state_dict[k] = v - if hasattr(model, 'module'): - model.module.load_state_dict(new_state_dict) - else: - model.load_state_dict(new_state_dict) - logger.info(f"Loaded checkpoint '{checkpoint_path}' (iteration {iteration})") - return model, optimizer, learning_rate, iteration - - -def get_hparams_from_file(config_path): - with open(config_path, 'r', encoding = 'utf-8') as f: - data = f.read() - config = json.loads(data) - - hparams = HParams(**config) - return hparams - - -class HParams(): - def __init__(self, **kwargs): - for k, v in kwargs.items(): - if type(v) == dict: - v = HParams(**v) - self[k] = v - - def keys(self): - return self.__dict__.keys() - - def items(self): - return self.__dict__.items() - - def values(self): - return self.__dict__.values() - - def __len__(self): - return len(self.__dict__) - - def __getitem__(self, key): - return getattr(self, key) - - def __setitem__(self, key, value): - return setattr(self, key, value) - - def __contains__(self, key): - return key in self.__dict__ - - def __repr__(self): - return self.__dict__.__repr__() \ No newline at end of file diff --git a/EVT_Core/Train/VITS/Train.py b/EVT_Core/Train/VITS/Train.py index e13517f..843d8a2 100644 --- a/EVT_Core/Train/VITS/Train.py +++ b/EVT_Core/Train/VITS/Train.py @@ -1,718 +1,72 @@ -from typing import Optional -from pathlib import Path -from datetime import datetime import os import sys -import re -import json -import platform -import logging -logging.basicConfig(stream = sys.stdout, encoding = 'utf-8') 
-logging.getLogger('numba').setLevel(logging.WARNING) -import torchaudio -import torch -from torch.nn import functional as F -from torch.utils.data import DataLoader -from torch.utils.tensorboard import SummaryWriter -import torch.multiprocessing as mp -import torch.distributed as dist -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.cuda.amp import autocast, GradScaler -torch.backends.cudnn.benchmark = True -from concurrent.futures import ThreadPoolExecutor -from tqdm import tqdm - -from .vits.Data_Utils import ( - TextAudioSpeakerLoader, - TextAudioSpeakerCollate, - DistributedBucketSampler -) -from .vits.Models import ( - AVAILABLE_FLOW_TYPES, - AVAILABLE_DURATION_DISCRIMINATOR_TYPES, - SynthesizerTrn, - MultiPeriodDiscriminator, - DurationDiscriminatorV1, - DurationDiscriminatorV2 -) -from .vits.Mel_Processing import ( - mel_spectrogram_torch, - spec_to_mel_torch -) -from .vits.Commons import ( - slice_segments, - clip_grad_value_ -) -from .vits.Losses import ( - generator_loss, - discriminator_loss, - feature_loss, - kl_loss -) -from .vits.Utils import ( - load_audiopaths_sid_text, - plot_spectrogram_to_numpy, - summarize, - plot_alignment_to_numpy, - save_checkpoint, - get_logger, - add_elements, - #check_git_hash, - load_checkpoint, - remove_old_checkpoints, - latest_checkpoint_path, - get_hparams -) -from .vits.text import ( - _clean_text, - #symbols -) -from .vits.text.symbols import symbols - - -global_step = 0 - - -class Preprocessing: - ''' - Preprocess - ''' - def __init__(self, - FileList_Path_Training: str, - FileList_Path_Validation: str, - Config_Dir_Save: str = './', - Set_Eval_Interval: int = 1000, - Set_Epochs: int = 10000, - Set_Batch_Size: int = 16, - Set_FP16_Run: bool = True, - Keep_Original_Speakers: bool = False, - Config_Path_Load: Optional[str] = None - ): - self.FileList_Path_Training = FileList_Path_Training - self.FileList_Path_Validation = FileList_Path_Validation - self.Config_Dir_Save = Config_Dir_Save - self.Set_Eval_Interval = Set_Eval_Interval - self.Set_Epochs = Set_Epochs - self.Set_Batch_Size = Set_Batch_Size - self.Set_FP16_Run = Set_FP16_Run - self.Keep_Original_Speakers = Keep_Original_Speakers - self.Config_Path_Load = Config_Path_Load if Keep_Original_Speakers else None - - os.makedirs(self.Config_Dir_Save, exist_ok = True) - self.Config_Path_Edited = Path(Config_Dir_Save).joinpath(f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json").__str__() - self.FileList_Path_Training_Updated = Path(self.Config_Path_Edited).parent.joinpath(Path(self.FileList_Path_Training).name).__str__() - self.FileList_Path_Validation_Updated = Path(self.Config_Path_Edited).parent.joinpath(Path(self.FileList_Path_Validation).name).__str__() - self.Out_Extension = "cleaned" - - def Configurator(self): - ''' - Edit JSON file - ''' - def Get_Languages(Text_Path_Training, Text_Path_Validation): - Languages = [] - for Text_Path in [Text_Path_Training, Text_Path_Validation]: - with open(file = Text_Path, mode = 'r', encoding = 'utf-8') as File: - Lines = File.readlines() - for _, Line in enumerate(Lines): - Line_Text = Line.split('|', maxsplit = 2)[2] - Language = re.split(r'[\[\]]', Line_Text)[1] - Languages.append(Language) if Language not in Languages else None - if set(Languages).issubset({'ZH', 'EN', 'JA'}): - if set(Languages) == {'ZH'}: - return "mandarin" - else: - return "mandarin_english_japanese" - else: - raise Exception('Unsupported language!') - - def Get_NewSpeakers(Text_Path_Training, Text_Path_Validation): - Speakers = [] - for 
Text_Path in [Text_Path_Training, Text_Path_Validation]: - with open(file = Text_Path, mode = 'r', encoding = 'utf-8') as File: - Lines = File.readlines() - for _, Line in enumerate(Lines): - Speaker = Line.split('|', maxsplit = 2)[1] - Speakers.append(Speaker) if Speaker not in Speakers else None - return Speakers - - def Get_OldSpeakers(Config_Path_Load): - if Config_Path_Load is not None and Path(Config_Path_Load).exists(): - with open(file = Config_Path_Load, mode = 'rb') as ConfigFile_Extra: - OldSpeakers = json.load(ConfigFile_Extra)["speakers"] - else: - OldSpeakers = [] - return OldSpeakers - - Language = Get_Languages(self.FileList_Path_Training, self.FileList_Path_Validation) - NewSpeakers = Get_NewSpeakers(self.FileList_Path_Training, self.FileList_Path_Validation) - OldSpeakers = Get_OldSpeakers(self.Config_Path_Load) if self.Keep_Original_Speakers else [] - - with open(file = Path(__file__).parent.joinpath('./configs', f'{Language}_base.json').__str__(), mode = 'rb') as ConfigFile_Default: - Params = json.load(ConfigFile_Default) - try: - Params_Old = Params - Params_Old["train"]["eval_interval"] = self.Set_Eval_Interval - Params_Old["train"]["epochs"] = self.Set_Epochs - Params_Old["train"]["batch_size"] = self.Set_Batch_Size - Params_Old["train"]["fp16_run"] = self.Set_FP16_Run - Params_Old["data"]["training_files"] = f'{self.FileList_Path_Training_Updated}.{self.Out_Extension}' - Params_Old["data"]["validation_files"] = f'{self.FileList_Path_Validation_Updated}.{self.Out_Extension}' - Params_Old["data"]["text_cleaners"] = [(Language + "_cleaners").lower()] - Params_Old["data"]["n_speakers"] = add_elements(OldSpeakers, NewSpeakers).__len__() - Params_Old["speakers"] = add_elements(OldSpeakers, NewSpeakers) - Params_New = Params_Old - except: - raise Exception("Please check if params exist") - with open(self.Config_Path_Edited, 'w', encoding = 'utf-8') as File_New: - json.dump(Params_New, File_New, indent = 4) - print(f"Config created in {self.Config_Dir_Save}") - - def Cleaner(self): - ''' - Convert natural language text to symbols - ''' - def Update_SID(Config_Path, Text_Path, Save_Path): - with open(file = Config_Path, mode = 'rb') as ConfigFile: - NewSpeakers = json.load(ConfigFile)["speakers"] - with open(file = Text_Path, mode = 'r', encoding = 'utf-8') as TextFile: - Lines = TextFile.readlines() - for Index, Line in enumerate(Lines): - Line_Path = Line.split('|', maxsplit = 1)[0] - Line_Path = Path(Text_Path).parent.joinpath(Line_Path).as_posix() if not Path(Line_Path).is_absolute() else Line_Path - Speaker = Line.split('|', maxsplit = 2)[1] - SpeakerID = NewSpeakers.index(Speaker) - Line_Text = Line.split('|', maxsplit = 2)[2] - Line = f"{Line_Path}|{SpeakerID}|{Line_Text}" - Lines[Index] = Line - with open(file = Save_Path, mode = 'w', encoding = 'utf-8') as TextFile: - TextFile.writelines(Lines) - - def Get_Cleaners(Config_Path): - with open(file = Config_Path, mode = 'rb') as ConfigFile: - NewCleaners = json.load(ConfigFile)["data"]["text_cleaners"] - return NewCleaners - - for Index, FileList in enumerate([self.FileList_Path_Training, self.FileList_Path_Validation]): - print("START:", FileList) - FileList_Updated = [self.FileList_Path_Training_Updated, self.FileList_Path_Validation_Updated][Index] - Update_SID(self.Config_Path_Edited, FileList, FileList_Updated) - Path_SID_Text = load_audiopaths_sid_text(FileList_Updated) - for i in range(len(Path_SID_Text)): - Path_SID_Text[i][2] = _clean_text(Path_SID_Text[i][2], Get_Cleaners(self.Config_Path_Edited)) - 
Filelist_Cleaned = FileList_Updated + "." + self.Out_Extension - with open(Filelist_Cleaned, 'w', encoding = 'utf-8') as f: - f.writelines(["|".join(x) + "\n" for x in Path_SID_Text]) - - def Resampler(self): - ''' - Resample dataset audio to fit the sampling rate setting in config - ''' - def Get_Resample_List(Config_Path, Text_Path): - ResampleList = [] - with open(file = Config_Path, mode = 'rb') as ConfigFile: - SampleRate_New = json.load(ConfigFile)['data']['sampling_rate'] - with open(file = Text_Path, mode = 'r', encoding = 'utf-8') as TextFile: - Lines = TextFile.readlines() - for Line in Lines: - Line_Path = Line.split('|', maxsplit = 1)[0] - ResampleList.append((Line_Path, SampleRate_New)) - return ResampleList - - def Resample(Audio_Path, SampleRate_New): - AudioData_Old, SampleRate_Old = torchaudio.load(Audio_Path) - AudioData_New = torchaudio.transforms.Resample(orig_freq = SampleRate_Old, new_freq = SampleRate_New)(AudioData_Old) - torchaudio.save(Audio_Path, src = AudioData_New, sample_rate = SampleRate_New) - - for FileList in (self.FileList_Path_Validation, self.FileList_Path_Training): - print("Resampling audio according to", FileList) - with ThreadPoolExecutor(max_workers = os.cpu_count()) as Executor: - Executor.map( - Resample, - *zip(*Get_Resample_List(self.Config_Path_Edited, FileList)) - ) - - -class Training: - ''' - Train - ''' - def __init__(self, - Num_Workers: int = 4, - Model_Path_Pretrained_G: Optional[str] = None, - Model_Path_Pretrained_D: Optional[str] = None, - Keep_Original_Speakers: bool = False, - Log_Dir: str = "./" - ): - self.Num_Workers = Num_Workers - self.Model_Path_Pretrained_G = Model_Path_Pretrained_G - self.Model_Path_Pretrained_D = Model_Path_Pretrained_D - self.Keep_Original_Speakers = Keep_Original_Speakers - self.Log_Dir = Log_Dir - - self.UsePretrainedModel = False if None in (self.Model_Path_Pretrained_G, self.Model_Path_Pretrained_D) else True - - def evaluate(self, - hps, generator, eval_loader, writer_eval - ): - generator.eval() - with torch.no_grad(): - for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(eval_loader): - x, x_lengths = x.cuda(0), x_lengths.cuda(0) - spec, spec_lengths = spec.cuda(0), spec_lengths.cuda(0) - y, y_lengths = y.cuda(0), y_lengths.cuda(0) - speakers = speakers.cuda(0) - - # remove else - x = x[:1] - x_lengths = x_lengths[:1] - spec = spec[:1] - spec_lengths = spec_lengths[:1] - y = y[:1] - y_lengths = y_lengths[:1] - speakers = speakers[:1] - break - y_hat, attn, mask, *_ = generator.module.infer(x, x_lengths, speakers, max_len=1000) - y_hat_lengths = mask.sum([1, 2]).long() * hps.data.hop_length - - mel = spec_to_mel_torch( - spec, - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.mel_fmin, - hps.data.mel_fmax - ) if not (hps.model.use_mel_posterior_encoder or hps.data.use_mel_posterior_encoder) else spec - y_hat_mel = mel_spectrogram_torch( - y_hat.squeeze(1).float(), - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.hop_length, - hps.data.win_length, - hps.data.mel_fmin, - hps.data.mel_fmax - ) - image_dict = {"gen/mel": plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy())} - audio_dict = {"gen/audio": y_hat[0, :, :y_hat_lengths[0]]} - if global_step == 0: - image_dict.update({"gt/mel": plot_spectrogram_to_numpy(mel[0].cpu().numpy())}) - audio_dict.update({"gt/audio": y[0, :, :y_lengths[0]]}) - - summarize( - writer=writer_eval, - global_step=global_step, - images=image_dict, - 
audios=audio_dict, - audio_sampling_rate=hps.data.sampling_rate - ) - generator.train() - - def train_and_evaluate(self, - rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers - ): - net_g, net_d, net_dur_disc = nets - optim_g, optim_d, optim_dur_disc = optims - scheduler_g, scheduler_d, scheduler_dur_disc = schedulers - train_loader, eval_loader = loaders - if writers is not None: - writer, writer_eval = writers - - train_loader.batch_sampler.set_epoch(epoch) - global global_step - - net_g.train() - net_d.train() - net_dur_disc.train() if net_dur_disc is not None else None - - if rank == 0: - loader = tqdm(train_loader, desc='Loading train data') - else: - loader = train_loader - - for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(loader): - if net_g.module.use_noise_scaled_mas: - current_mas_noise_scale = net_g.module.mas_noise_scale_initial - net_g.module.noise_scale_delta * global_step - net_g.module.current_mas_noise_scale = max(current_mas_noise_scale, 0.0) - x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True) - spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True) - y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True) - speakers = speakers.cuda(rank, non_blocking=True) - - with autocast(enabled=hps.train.fp16_run): - y_hat, l_length, attn, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q), (hidden_x, logw, logw_) = net_g(x, x_lengths, spec, spec_lengths, speakers) - - mel = spec_to_mel_torch( - spec.float(), - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.mel_fmin, - hps.data.mel_fmax - ) if not (hps.model.use_mel_posterior_encoder or hps.data.use_mel_posterior_encoder) else spec - y_mel = slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) - y_hat_mel = mel_spectrogram_torch( - y_hat.squeeze(1), - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.hop_length, - hps.data.win_length, - hps.data.mel_fmin, - hps.data.mel_fmax - ) - y = slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice - - # Discriminator - y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) - with autocast(enabled=False): - loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) - loss_disc_all = loss_disc - - # Duration Discriminator - if net_dur_disc is not None: - y_dur_hat_r, y_dur_hat_g = net_dur_disc(hidden_x.detach(), x_mask.detach(), logw_.detach(), logw.detach()) - with autocast(enabled=False): - # TODO: I think need to mean using the mask, but for now, just mean all - loss_dur_disc, losses_dur_disc_r, losses_dur_disc_g = discriminator_loss(y_dur_hat_r, y_dur_hat_g) - loss_dur_disc_all = loss_dur_disc - optim_dur_disc.zero_grad() - scaler.scale(loss_dur_disc_all).backward() - scaler.unscale_(optim_dur_disc) - grad_norm_dur_disc = clip_grad_value_(net_dur_disc.parameters(), None) - scaler.step(optim_dur_disc) - - optim_d.zero_grad() - scaler.scale(loss_disc_all).backward() - scaler.unscale_(optim_d) - grad_norm_d = clip_grad_value_(net_d.parameters(), None) - scaler.step(optim_d) - - with autocast(enabled=hps.train.fp16_run): - # Generator - y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) - if net_dur_disc is not None: - y_dur_hat_r, y_dur_hat_g = net_dur_disc(hidden_x, x_mask, logw_, logw) - with autocast(enabled=False): - loss_dur = 
torch.sum(l_length.float()) - loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel - loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl - - loss_fm = feature_loss(fmap_r, fmap_g) - loss_gen, losses_gen = generator_loss(y_d_hat_g) - loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl - if net_dur_disc is not None: - loss_dur_gen, losses_dur_gen = generator_loss(y_dur_hat_g) - loss_gen_all += loss_dur_gen - - optim_g.zero_grad() - scaler.scale(loss_gen_all).backward() - scaler.unscale_(optim_g) - grad_norm_g = clip_grad_value_(net_g.parameters(), None) - scaler.step(optim_g) - scaler.update() - - if rank == 0: - if global_step % hps.train.log_interval == 0: - lr = optim_g.param_groups[0]['lr'] - losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl] - logger.info('Train Epoch: {} [{:.0f}%]'.format(epoch, 100. * batch_idx / len(train_loader))) - logger.info([x.item() for x in losses] + [global_step, lr]) - - scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g} - scalar_dict.update({"loss/dur_disc/total": loss_dur_disc_all, "grad_norm_dur_disc": grad_norm_dur_disc}) if net_dur_disc is not None else None - scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl}) - - scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) - scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) - scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) - - # if net_dur_disc is not None: - # scalar_dict.update({"loss/dur_disc_r" : f"{losses_dur_disc_r}"}) - # scalar_dict.update({"loss/dur_disc_g" : f"{losses_dur_disc_g}"}) - # scalar_dict.update({"loss/dur_gen" : f"{loss_dur_gen}"}) - - image_dict = { - "slice/mel_org": plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), - "slice/mel_gen": plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), - "all/mel": plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), - "all/attn": plot_alignment_to_numpy(attn[0,0].data.cpu().numpy()) - } - summarize( - writer=writer, - global_step=global_step, - images=image_dict, - scalars=scalar_dict) - - if global_step % hps.train.eval_interval == 0: - self.evaluate(hps, net_g, eval_loader, writer_eval) - save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, Path(hps.model_dir).joinpath("G_{}.pth".format(global_step)).__str__()) - save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, Path(hps.model_dir).joinpath("D_{}.pth".format(global_step)).__str__()) - save_checkpoint(net_dur_disc, optim_dur_disc, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "DUR_{}.pth".format(global_step))) if net_dur_disc is not None else None - - remove_old_checkpoints(hps.model_dir, prefixes=["G_*.pth", "D_*.pth", "DUR_*.pth"]) - global_step += 1 - - if rank == 0: - logger.info('====> Epoch: {}'.format(epoch)) - - def run(self, rank, n_gpus, hps): - global global_step - net_dur_disc = None - if rank == 0: - logger = get_logger(hps.model_dir) - #logger.info(hps) - #check_git_hash(hps.model_dir) - writer = SummaryWriter(log_dir = self.Log_Dir) - writer_eval = SummaryWriter(log_dir = Path(self.Log_Dir).joinpath("eval").__str__()) - - dist.init_process_group( - backend = 'gloo' if platform.system() == 'Windows' else 'nccl', # Windows不支持NCCL backend,故使用GLOO - init_method = 'env://', - world_size = n_gpus, - rank = rank - ) - - 
torch.manual_seed(hps.train.seed) - torch.cuda.set_device(rank) - - if "use_mel_posterior_encoder" in hps.model.keys() and hps.model.use_mel_posterior_encoder == True: - print("Using mel posterior encoder for VITS2") - posterior_channels = 80 # vits2 - hps.data.use_mel_posterior_encoder = True - else: - print("Using lin posterior encoder for VITS1") - posterior_channels = hps.data.filter_length // 2 + 1 - hps.data.use_mel_posterior_encoder = False - - train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data) - train_sampler = DistributedBucketSampler( - train_dataset, - hps.train.batch_size, - [32,300,400,500,600,700,800,900,1000], - num_replicas=n_gpus, - rank=rank, - shuffle=True - ) - collate_fn = TextAudioSpeakerCollate() - train_loader = DataLoader( - train_dataset, - num_workers=self.Num_Workers, - shuffle=False, - pin_memory=True, - collate_fn=collate_fn, - batch_sampler=train_sampler - ) - if rank == 0: - eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data) - eval_loader = DataLoader( - eval_dataset, - num_workers=0, - shuffle=False, - batch_size=hps.train.batch_size, - pin_memory=True, - drop_last=False, - collate_fn=collate_fn - ) - - # some of these flags are not being used in the code and directly set in hps json file. - # they are kept here for reference and prototyping. - if "use_transformer_flows" in hps.model.keys() and hps.model.use_transformer_flows == True: - use_transformer_flows = True - transformer_flow_type = hps.model.transformer_flow_type - print(f"Using transformer flows {transformer_flow_type} for VITS2") - assert transformer_flow_type in AVAILABLE_FLOW_TYPES, f"transformer_flow_type must be one of {AVAILABLE_FLOW_TYPES}" - else: - print("Using normal flows for VITS1") - use_transformer_flows = False - - if "use_spk_conditioned_encoder" in hps.model.keys() and hps.model.use_spk_conditioned_encoder == True: - if hps.data.n_speakers == 0: - raise ValueError("n_speakers must be > 0 when using spk conditioned encoder to train multi-speaker model") - use_spk_conditioned_encoder = True - else: - print("Using normal encoder for VITS1") - use_spk_conditioned_encoder = False - - if "use_noise_scaled_mas" in hps.model.keys() and hps.model.use_noise_scaled_mas == True: - print("Using noise scaled MAS for VITS2") - use_noise_scaled_mas = True - mas_noise_scale_initial = 0.01 - noise_scale_delta = 2e-6 - else: - print("Using normal MAS for VITS1") - use_noise_scaled_mas = False - mas_noise_scale_initial = 0.0 - noise_scale_delta = 0.0 - - # Initialize VITS models and move to GPU - net_g = SynthesizerTrn( - len(symbols), - posterior_channels, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - mas_noise_scale_initial=mas_noise_scale_initial, - noise_scale_delta=noise_scale_delta, - **hps.model - ).cuda(rank) - net_d = MultiPeriodDiscriminator( - hps.model.use_spectral_norm - ).cuda(rank) - if "use_duration_discriminator" in hps.model.keys() and hps.model.use_duration_discriminator == True: - use_duration_discriminator = True - - # add duration discriminator type here - duration_discriminator_type = getattr(hps.model, "duration_discriminator_type", "dur_disc_1") - print(f"Using duration_discriminator {duration_discriminator_type} for VITS2") - assert duration_discriminator_type in AVAILABLE_DURATION_DISCRIMINATOR_TYPES, f"duration_discriminator_type must be one of {AVAILABLE_DURATION_DISCRIMINATOR_TYPES}" - duration_discriminator_type = AVAILABLE_DURATION_DISCRIMINATOR_TYPES - if duration_discriminator_type 
== "dur_disc_1": - net_dur_disc = DurationDiscriminatorV1( - hps.model.hidden_channels, - hps.model.hidden_channels, - 3, - 0.1, - gin_channels=hps.model.gin_channels if hps.data.n_speakers != 0 else 0, - ).cuda(rank) - elif duration_discriminator_type == "dur_disc_2": - net_dur_disc = DurationDiscriminatorV2( - hps.model.hidden_channels, - hps.model.hidden_channels, - 3, - 0.1, - gin_channels=hps.model.gin_channels if hps.data.n_speakers != 0 else 0, - ).cuda(rank) - else: - print("NOT using any duration discriminator like VITS1") - net_dur_disc = None - use_duration_discriminator = False - - # Build optimizers for the initialized VITS models - optim_g = torch.optim.AdamW( - filter(lambda net_g_params: net_g_params.requires_grad, net_g.parameters()), # Filter out params which don't require gradient - hps.train.learning_rate, - betas=hps.train.betas, - eps=hps.train.eps - ) - optim_d = torch.optim.AdamW( - net_d.parameters(), - hps.train.learning_rate, - betas=hps.train.betas, - eps=hps.train.eps - ) - optim_dur_disc = torch.optim.AdamW( - net_dur_disc.parameters(), - hps.train.learning_rate, - betas=hps.train.betas, - eps=hps.train.eps - ) if net_dur_disc is not None else None - - # Build DDP models for the initialized VITS models - net_g = DDP(net_g, device_ids = [rank], find_unused_parameters = True) - net_d = DDP(net_d, device_ids = [rank], find_unused_parameters = False) - net_dur_disc = DDP(net_dur_disc, device_ids=[rank]) if net_dur_disc is not None else None - - # Load state dict from checkpoint for the initialized VITS models and get the optimizer, learning rate and iteration - try: - _, optim_g, lr_g, epoch_str = load_checkpoint( - self.Model_Path_Pretrained_G if self.UsePretrainedModel else latest_checkpoint_path(hps.model_dir, "G_*.pth"), - net_g, - optim_g, - self.Keep_Original_Speakers if self.UsePretrainedModel else True - ) - _, optim_d, lr_d, epoch_str = load_checkpoint( - self.Model_Path_Pretrained_D if self.UsePretrainedModel else latest_checkpoint_path(hps.model_dir, "D_*.pth"), - net_d, - optim_d, - self.Keep_Original_Speakers if self.UsePretrainedModel else True - ) - _, _, _, epoch_str = load_checkpoint( - latest_checkpoint_path(hps.model_dir, "DUR_*.pth"), - net_dur_disc, - optim_dur_disc - ) if net_dur_disc is not None else (_, _, _, epoch_str) - - # To prevent KeyError: "param 'initial_lr' is not specified in param_groups[0] when resuming an optimizer" - if optim_g.param_groups[0].get('initial_lr') is None: - optim_g.param_groups[0]['initial_lr'] = lr_g - if optim_d.param_groups[0].get('initial_lr') is None: - optim_d.param_groups[0]['initial_lr'] = lr_d +from typing import Optional +from subprocess import Popen +from pathlib import Path - global_step = (epoch_str - 1) * len(train_loader) # > 0 - print(f"Continue from step {global_step}") - except Exception as e: - epoch_str = 1 - global_step = 0 - print(f"Got Exception: {e}. 
Start from step 0") +current_dir = Path(__file__).absolute().parent.as_posix() +os.chdir(current_dir) +sys.path.insert(0, f"{current_dir}/VITS2_finetuning") - # Build learning rate schedulers for optimizers - scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma = hps.train.lr_decay, last_epoch = epoch_str - 2) - scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma = hps.train.lr_decay, last_epoch = epoch_str - 2) - scheduler_dur_disc = torch.optim.lr_scheduler.ExponentialLR(optim_dur_disc, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) if net_dur_disc is not None else None - # Build gradient scaler - scaler = GradScaler(enabled = hps.train.fp16_run) +python_exec = sys.executable or "python" - # Start training (and evaluating) - for epoch in range(epoch_str, hps.train.epochs + 1): - if rank == 0: - self.train_and_evaluate( - rank, epoch, hps, [net_g, net_d, net_dur_disc], [optim_g, optim_d, optim_dur_disc], [scheduler_g, scheduler_d, scheduler_dur_disc], scaler, - [train_loader, eval_loader], logger, [writer, writer_eval] - ) - else: - self.train_and_evaluate( - rank, epoch, hps, [net_g, net_d, net_dur_disc], [optim_g, optim_d, optim_dur_disc], [scheduler_g, scheduler_d, scheduler_dur_disc], scaler, - [train_loader, None], None, None - ) - scheduler_g.step() - scheduler_d.step() - scheduler_dur_disc.step() if net_dur_disc is not None else None +p_preprocess = None +p_train = None -class Voice_Training(Preprocessing, Training): +def Train( + FileList_Path_Training: str = 'train.txt', + FileList_Path_Validation: str = 'val.txt', + Set_Epochs: int = 10000, + Set_Eval_Interval: int = 1000, + Set_Batch_Size: int = 16, + Set_FP16_Run: bool = True, + Keep_Original_Speakers: bool = False, + Config_Path_Load: Optional[str] = None, + Num_Workers: int = 4, + Use_PretrainedModels: bool = False, + Model_Path_Pretrained_G: str = 'pretrained_G.pth', + Model_Path_Pretrained_D: str = 'pretrained_D.pth', + Output_Root: str = './', + Output_Dir_Name: str = 'Output', + Output_Config_Name: str = 'Config.json', + Output_LogDir: str = './' +): ''' - 1. Preprocess - 2. Train & Evaluate + Train speech models ''' - def __init__(self, - FileList_Path_Training: str, - FileList_Path_Validation: str, - Set_Epochs: int = 10000, - Set_Eval_Interval: int = 1000, - Set_Batch_Size: int = 16, - Set_FP16_Run: bool = True, - Keep_Original_Speakers: bool = False, - Config_Path_Load: Optional[str] = None, - Num_Workers: int = 4, - Use_PretrainedModels: bool = True, - Model_Path_Pretrained_G: Optional[str] = None, - Model_Path_Pretrained_D: Optional[str] = None, - Output_Root: str = "./", - Output_Dir_Name: str = "", - Output_LogDir: str = "" - ): - Dir_Output = Path(Output_Root).joinpath(Output_Dir_Name).as_posix() - Preprocessing.__init__(self, FileList_Path_Training, FileList_Path_Validation, Dir_Output, Set_Eval_Interval, Set_Epochs, Set_Batch_Size, Set_FP16_Run, Keep_Original_Speakers, Config_Path_Load) - Training.__init__(self, Num_Workers, Model_Path_Pretrained_G if Use_PretrainedModels else None, Model_Path_Pretrained_D if Use_PretrainedModels else None, Keep_Original_Speakers, Output_LogDir) - self.Model_Dir_Save = Dir_Output - - def Preprocessing_and_Training(self): - # Preprocess - self.Configurator() - self.Cleaner() - self.Resampler() - - # Train & Evaluate - """Assume Single Node Multi GPUs Training Only""" - assert torch.cuda.is_available(), "CPU training is not allowed." 
- - n_gpus = torch.cuda.device_count() - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '8000' - - hps = get_hparams( - Config_Path = self.Config_Path_Edited, - Model_Dir = self.Model_Dir_Save - ) - mp.spawn(super().run, args = (n_gpus, hps,), nprocs = n_gpus) \ No newline at end of file + global p_preprocess + if p_preprocess is None: + os.environ['FileList_Path_Training'] = str(FileList_Path_Training) + os.environ['FileList_Path_Validation'] = str(FileList_Path_Validation) + os.environ['Set_Epochs'] = str(Set_Epochs) + os.environ['Set_Eval_Interval'] = str(Set_Eval_Interval) + os.environ['Set_Batch_Size'] = str(Set_Batch_Size) + os.environ['Set_FP16_Run'] = str(Set_FP16_Run) + os.environ['Keep_Original_Speakers'] = str(Keep_Original_Speakers) + os.environ['Config_Path_Load'] = str(Config_Path_Load) + os.environ['Output_Root'] = str(Output_Root) + os.environ['Output_Dir_Name'] = str(Output_Dir_Name) + os.environ['Output_Config_Name'] = str(Output_Config_Name) + print("Start preprocessing...") + p_preprocess = Popen(f'"{python_exec}" "VITS2_finetuning/preprocess.py"', shell = True) + p_preprocess.wait() + p_preprocess = None + else: + print("已有正在进行的预处理任务,需先终止才能开启下一次任务") + + global p_train + if p_train is None: + os.environ['Num_Workers'] = str(Num_Workers) + os.environ['Use_PretrainedModels'] = str(Use_PretrainedModels) + os.environ['Model_Path_Pretrained_G'] = str(Model_Path_Pretrained_G) + os.environ['Model_Path_Pretrained_D'] = str(Model_Path_Pretrained_D) + os.environ['Output_LogDir'] = str(Output_LogDir) + print("Start training...") + p_train = Popen(f'"{python_exec}" "VITS2_finetuning/train.py"', shell = True) + p_train.wait() + p_train = None + else: + print("已有正在进行的训练任务,需先终止才能开启下一次任务") \ No newline at end of file diff --git a/EVT_Core/TTS/VITS/vits/Attentions.py b/EVT_Core/Train/VITS/VITS2_finetuning/Attentions.py similarity index 58% rename from EVT_Core/TTS/VITS/vits/Attentions.py rename to EVT_Core/Train/VITS/VITS2_finetuning/Attentions.py index a0b43a3..4fc2a33 100644 --- a/EVT_Core/TTS/VITS/vits/Attentions.py +++ b/EVT_Core/Train/VITS/VITS2_finetuning/Attentions.py @@ -2,9 +2,10 @@ import torch from torch import nn from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, weight_norm -from .Modules import LayerNorm -from .Commons import convert_pad_shape +from modules import LayerNorm +from commons import subsequent_mask, convert_pad_shape, fused_add_tanh_sigmoid_multiply class MultiHeadAttention(nn.Module): @@ -222,6 +223,150 @@ def _same_padding(self, x): return x +class Depthwise_Separable_Conv1D(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride = 1, + padding = 0, + dilation = 1, + bias = True, + padding_mode = 'zeros', # TODO: refine this type + device=None, + dtype=None + ): + super().__init__() + self.depth_conv = nn.Conv1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, groups=in_channels,stride = stride,padding=padding,dilation=dilation,bias=bias,padding_mode=padding_mode,device=device,dtype=dtype) + self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias, device=device,dtype=dtype) + + def forward(self, input): + return self.point_conv(self.depth_conv(input)) + + def weight_norm(self): + self.depth_conv = weight_norm(self.depth_conv, name = 'weight') + self.point_conv = weight_norm(self.point_conv, name = 'weight') + + def remove_weight_norm(self): + self.depth_conv = remove_weight_norm(self.depth_conv, 
name = 'weight') + self.point_conv = remove_weight_norm(self.point_conv, name = 'weight') + + +class Depthwise_Separable_TransposeConv1D(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride = 1, + padding = 0, + output_padding = 0, + bias = True, + dilation = 1, + padding_mode = 'zeros', # TODO: refine this type + device=None, + dtype=None + ): + super().__init__() + self.depth_conv = nn.ConvTranspose1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, groups=in_channels,stride = stride,output_padding=output_padding,padding=padding,dilation=dilation,bias=bias,padding_mode=padding_mode,device=device,dtype=dtype) + self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias, device=device,dtype=dtype) + + def forward(self, input): + return self.point_conv(self.depth_conv(input)) + + def weight_norm(self): + self.depth_conv = weight_norm(self.depth_conv, name = 'weight') + self.point_conv = weight_norm(self.point_conv, name = 'weight') + + def remove_weight_norm(self): + remove_weight_norm(self.depth_conv, name = 'weight') + remove_weight_norm(self.point_conv, name = 'weight') + + +def weight_norm_modules(module, name = 'weight', dim = 0): + if isinstance(module,Depthwise_Separable_Conv1D) or isinstance(module,Depthwise_Separable_TransposeConv1D): + module.weight_norm() + return module + else: + return weight_norm(module,name,dim) + +def remove_weight_norm_modules(module, name = 'weight'): + if isinstance(module,Depthwise_Separable_Conv1D) or isinstance(module,Depthwise_Separable_TransposeConv1D): + module.remove_weight_norm() + else: + remove_weight_norm(module,name) + + +class FFT(nn.Module): + def __init__(self, + hidden_channels, + filter_channels, + n_heads, + n_layers=1, + kernel_size=1, + p_dropout=0., + proximal_bias=False, + proximal_init=True, + isflow = False, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + if isflow and 'gin_channels' in kwargs and kwargs["gin_channels"] > 0: + cond_layer = torch.nn.Conv1d(kwargs["gin_channels"], 2*hidden_channels*n_layers, 1) + self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1) + self.cond_layer = weight_norm_modules(cond_layer, name='weight') + self.gin_channels = kwargs["gin_channels"] + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append( + MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, + proximal_init=proximal_init)) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, g = None): + """ + x: decoder input + h: encoder output + """ + if g is not None: + g = self.cond_layer(g) + + self_attn_mask = subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) + x = x * x_mask + for i in range(self.n_layers): + if g is not None: + x = self.cond_pre(x) + cond_offset = i * 2 * 
self.hidden_channels + g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] + x = fused_add_tanh_sigmoid_multiply( + x, + g_l, + torch.IntTensor([self.hidden_channels])) + y = self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + x = x * x_mask + return x + + class Encoder(nn.Module): #backward compatible vits2 encoder def __init__(self, hidden_channels, @@ -282,4 +427,64 @@ def forward(self, x, x_mask, g = None): y = self.drop(y) x = self.norm_layers_2[i](x + y) x = x * x_mask + return x + +class Decoder(nn.Module): + def __init__(self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0., + proximal_bias=False, + proximal_init=True, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + self.encdec_attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, h, h_mask): + """ + x: decoder input + h: encoder output + """ + self_attn_mask = subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) + encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) + + y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask return x \ No newline at end of file diff --git a/EVT_Core/Train/VITS/VITS2_finetuning/Commons.py b/EVT_Core/Train/VITS/VITS2_finetuning/Commons.py new file mode 100644 index 0000000..6990c50 --- /dev/null +++ b/EVT_Core/Train/VITS/VITS2_finetuning/Commons.py @@ -0,0 +1,153 @@ +import math +import torch +from torch.nn import functional as F + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +def intersperse(lst, item): + result = [item] * (len(lst) * 2 + 1) + result[1::2] = lst + return result + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. 
* logs_q) + return kl + + +def rand_gumbel(shape): + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) + + +def rand_gumbel_like(x): + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret + + +def rand_slice_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def get_timing_signal_1d( + length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = ( + math.log(float(max_timescale) / float(min_timescale)) / + (num_timescales - 1)) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + signal.to(dtype=x.dtype, device=x.device) + + +def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + device = duration.device + + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2,3) * mask + return path + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): 
+ parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1. / norm_type) + return total_norm \ No newline at end of file diff --git a/EVT_Core/Train/VITS/vits/Data_Utils.py b/EVT_Core/Train/VITS/VITS2_finetuning/Data_Utils.py similarity index 98% rename from EVT_Core/Train/VITS/vits/Data_Utils.py rename to EVT_Core/Train/VITS/VITS2_finetuning/Data_Utils.py index 86bb0b7..b9f4d27 100644 --- a/EVT_Core/Train/VITS/vits/Data_Utils.py +++ b/EVT_Core/Train/VITS/VITS2_finetuning/Data_Utils.py @@ -3,10 +3,10 @@ import torch import torchaudio -from .Commons import intersperse -from .Mel_Processing import spectrogram_torch, mel_spectrogram_torch -from .Utils import load_audiopaths_sid_text -from .text import text_to_sequence, cleaned_text_to_sequence +from commons import intersperse +from mel_processing import spectrogram_torch, mel_spectrogram_torch +from utils import load_audiopaths_sid_text +from text import text_to_sequence, cleaned_text_to_sequence class TextAudioSpeakerLoader(torch.utils.data.Dataset): diff --git a/EVT_Core/Train/VITS/vits/Losses.py b/EVT_Core/Train/VITS/VITS2_finetuning/Losses.py similarity index 100% rename from EVT_Core/Train/VITS/vits/Losses.py rename to EVT_Core/Train/VITS/VITS2_finetuning/Losses.py diff --git a/EVT_Core/Train/VITS/vits/Mel_Processing.py b/EVT_Core/Train/VITS/VITS2_finetuning/Mel_Processing.py similarity index 100% rename from EVT_Core/Train/VITS/vits/Mel_Processing.py rename to EVT_Core/Train/VITS/VITS2_finetuning/Mel_Processing.py diff --git a/EVT_Core/TTS/VITS/vits/Models.py b/EVT_Core/Train/VITS/VITS2_finetuning/Models.py similarity index 71% rename from EVT_Core/TTS/VITS/vits/Models.py rename to EVT_Core/Train/VITS/VITS2_finetuning/Models.py index eda0914..c207b3c 100644 --- a/EVT_Core/TTS/VITS/vits/Models.py +++ b/EVT_Core/Train/VITS/VITS2_finetuning/Models.py @@ -2,13 +2,13 @@ import torch from torch import nn from torch.nn import functional as F -from torch.nn import Conv1d, ConvTranspose1d -from torch.nn.utils import weight_norm, remove_weight_norm +from torch.nn import Conv1d, ConvTranspose1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from . import Modules -from . import Attentions -from . import Commons -from . 
import monotonic_align +import modules +import attentions +import commons +import monotonic_align AVAILABLE_FLOW_TYPES = [ @@ -43,25 +43,25 @@ def __init__(self, self.n_flows = n_flows self.gin_channels = gin_channels - self.log_flow = Modules.Log() + self.log_flow = modules.Log() self.flows = nn.ModuleList() - self.flows.append(Modules.ElementwiseAffine(2)) + self.flows.append(modules.ElementwiseAffine(2)) for i in range(n_flows): - self.flows.append(Modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) - self.flows.append(Modules.Flip()) + self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) + self.flows.append(modules.Flip()) self.post_pre = nn.Conv1d(1, filter_channels, 1) self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.post_convs = Modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) + self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) self.post_flows = nn.ModuleList() - self.post_flows.append(Modules.ElementwiseAffine(2)) + self.post_flows.append(modules.ElementwiseAffine(2)) for i in range(4): - self.post_flows.append(Modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) - self.post_flows.append(Modules.Flip()) + self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) + self.post_flows.append(modules.Flip()) self.pre = nn.Conv1d(in_channels, filter_channels, 1) self.proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.convs = Modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) + self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, filter_channels, 1) @@ -131,9 +131,9 @@ def __init__(self, self.drop = nn.Dropout(p_dropout) self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2) - self.norm_1 = Modules.LayerNorm(filter_channels) + self.norm_1 = modules.LayerNorm(filter_channels) self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2) - self.norm_2 = Modules.LayerNorm(filter_channels) + self.norm_2 = modules.LayerNorm(filter_channels) self.proj = nn.Conv1d(filter_channels, 1, 1) if gin_channels != 0: @@ -156,6 +156,151 @@ def forward(self, x, x_mask, g=None): return x * x_mask +class DurationDiscriminatorV1(nn.Module): # vits2 + # TODO : not using "spk conditioning" for now according to the paper. + # Can be a better discriminator if we use it. 
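# ---------------------------------------------------------------------------
# Illustrative, simplified stand-in (not the patch's code) for the
# DurationDiscriminatorV1/V2 interface added below for VITS2.  The real classes
# use extra convolutions and LayerNorms; this sketch only shows the calling
# convention used by the training loop removed from Train.py earlier in this
# diff: the same detached text encoding is scored once against the real
# (log-)durations and once against the predicted ones.
import torch
from torch import nn

class ToyDurationDiscriminator(nn.Module):
    def __init__(self, in_channels, filter_channels, kernel_size=3):
        super().__init__()
        self.pre = nn.Conv1d(in_channels, filter_channels, kernel_size,
                             padding=kernel_size // 2)
        self.dur_proj = nn.Conv1d(1, filter_channels, 1)
        self.out = nn.Sequential(nn.Linear(2 * filter_channels, 1), nn.Sigmoid())

    def forward(self, x, x_mask, dur_r, dur_hat):
        x = self.pre(torch.detach(x) * x_mask)          # encoder output is detached
        probs = []
        for dur in (dur_r, dur_hat):                     # real first, generated second
            h = torch.cat([x, self.dur_proj(dur)], dim=1) * x_mask
            probs.append(self.out(h.transpose(1, 2)))    # [b, t, 1] per-frame prob
        return probs

b, h, t = 2, 192, 37
disc = ToyDurationDiscriminator(h, 256)
x, mask = torch.randn(b, h, t), torch.ones(b, 1, t)
dur_r, dur_hat = torch.rand(b, 1, t), torch.rand(b, 1, t)
p_real, p_fake = disc(x, mask, dur_r, dur_hat)           # each has shape [2, 37, 1]
# ---------------------------------------------------------------------------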
+ def __init__(self, + in_channels, + filter_channels, + kernel_size, + p_dropout, + gin_channels=0 + ): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.gin_channels = gin_channels + + self.drop = nn.Dropout(p_dropout) + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2) + # self.norm_1 = modules.LayerNorm(filter_channels) + self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) + # self.norm_2 = modules.LayerNorm(filter_channels) + self.dur_proj = nn.Conv1d(1, filter_channels, 1) + + self.pre_out_conv_1 = nn.Conv1d(2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) + self.pre_out_norm_1 = modules.LayerNorm(filter_channels) + self.pre_out_conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) + self.pre_out_norm_2 = modules.LayerNorm(filter_channels) + + # if gin_channels != 0: + # self.cond = nn.Conv1d(gin_channels, in_channels, 1) + + self.output_layer = nn.Sequential(nn.Linear(filter_channels, 1), nn.Sigmoid()) + + def forward_probability(self, x, x_mask, dur, g=None): + dur = self.dur_proj(dur) + x = torch.cat([x, dur], dim=1) + x = self.pre_out_conv_1(x * x_mask) + # x = torch.relu(x) + # x = self.pre_out_norm_1(x) + # x = self.drop(x) + x = self.pre_out_conv_2(x * x_mask) + # x = torch.relu(x) + # x = self.pre_out_norm_2(x) + # x = self.drop(x) + x = x * x_mask + x = x.transpose(1, 2) + output_prob = self.output_layer(x) + return output_prob + + def forward(self, x, x_mask, dur_r, dur_hat, g=None): + x = torch.detach(x) + # if g is not None: + # g = torch.detach(g) + # x = x + self.cond(g) + x = self.conv_1(x * x_mask) + # x = torch.relu(x) + # x = self.norm_1(x) + # x = self.drop(x) + x = self.conv_2(x * x_mask) + # x = torch.relu(x) + # x = self.norm_2(x) + # x = self.drop(x) + + output_probs = [] + for dur in [dur_r, dur_hat]: + output_prob = self.forward_probability(x, x_mask, dur, g) + output_probs.append(output_prob) + + return output_probs + + +class DurationDiscriminatorV2(nn.Module): # vits2 + # TODO : not using "spk conditioning" for now according to the paper. + # Can be a better discriminator if we use it. 
+ def __init__( + self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 + ): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.gin_channels = gin_channels + + self.conv_1 = nn.Conv1d( + in_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_1 = modules.LayerNorm(filter_channels) + self.conv_2 = nn.Conv1d( + filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_2 = modules.LayerNorm(filter_channels) + self.dur_proj = nn.Conv1d(1, filter_channels, 1) + + self.pre_out_conv_1 = nn.Conv1d( + 2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.pre_out_norm_1 = modules.LayerNorm(filter_channels) + self.pre_out_conv_2 = nn.Conv1d( + filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.pre_out_norm_2 = modules.LayerNorm(filter_channels) + + # if gin_channels != 0: + # self.cond = nn.Conv1d(gin_channels, in_channels, 1) + + self.output_layer = nn.Sequential(nn.Linear(filter_channels, 1), nn.Sigmoid()) + + def forward_probability(self, x, x_mask, dur, g=None): + dur = self.dur_proj(dur) + x = torch.cat([x, dur], dim=1) + x = self.pre_out_conv_1(x * x_mask) + x = torch.relu(x) + x = self.pre_out_norm_1(x) + x = self.pre_out_conv_2(x * x_mask) + x = torch.relu(x) + x = self.pre_out_norm_2(x) + x = x * x_mask + x = x.transpose(1, 2) + output_prob = self.output_layer(x) + return output_prob + + def forward(self, x, x_mask, dur_r, dur_hat, g=None): + x = torch.detach(x) + # if g is not None: + # g = torch.detach(g) + # x = x + self.cond(g) + x = self.conv_1(x * x_mask) + x = torch.relu(x) + x = self.norm_1(x) + x = self.conv_2(x * x_mask) + x = torch.relu(x) + x = self.norm_2(x) + + output_probs = [] + for dur in [dur_r, dur_hat]: + output_prob = self.forward_probability(x, x_mask, dur, g) + output_probs.append([output_prob]) + + return output_probs + + class TextEncoder(nn.Module): def __init__(self, n_vocab, @@ -184,7 +329,7 @@ def __init__(self, nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) # Transformer Encoder - self.encoder = Attentions.Encoder( + self.encoder = attentions.Encoder( hidden_channels, filter_channels, n_heads, @@ -198,7 +343,7 @@ def __init__(self, def forward(self, x, x_lengths, g=None): x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(Commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.encoder(x * x_mask, x_mask, g=g) stats = self.proj(x) * x_mask @@ -229,7 +374,7 @@ def __init__(self, self.mean_only = mean_only self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.pre_transformer = Attentions.Encoder( + self.pre_transformer = attentions.Encoder( hidden_channels, hidden_channels, n_heads=2, @@ -238,7 +383,7 @@ def __init__(self, p_dropout=p_dropout, # window_size=None, ) - self.enc = Modules.WN( + self.enc = modules.WN( hidden_channels, kernel_size, dilation_rate, @@ -294,7 +439,7 @@ def __init__(self, self.half_channels = channels // 2 self.mean_only = mean_only # vits2 - self.pre_transformer = Attentions.Encoder( + self.pre_transformer = attentions.Encoder( self.half_channels, self.half_channels, n_heads=2, @@ -305,7 +450,7 @@ def __init__(self, ) self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - 
self.enc = Modules.WN( + self.enc = modules.WN( hidden_channels, kernel_size, dilation_rate, @@ -314,7 +459,7 @@ def __init__(self, gin_channels=gin_channels, ) # vits2 - self.post_transformer = Attentions.Encoder( + self.post_transformer = attentions.Encoder( self.hidden_channels, self.hidden_channels, n_heads=2, @@ -378,7 +523,7 @@ def __init__(self, self.mean_only = mean_only self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = Attentions.FFT( + self.enc = attentions.FFT( hidden_channels, filter_channels, n_heads, @@ -431,7 +576,7 @@ def __init__(self, self.mean_only = mean_only self.residual_connection = residual_connection # vits2 - self.pre_transformer = Attentions.Encoder( + self.pre_transformer = attentions.Encoder( self.half_channels, self.half_channels, n_heads=2, @@ -534,7 +679,7 @@ def __init__(self, mean_only=True ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) elif transformer_flow_type == "pre_conv2": for i in range(n_flows): self.flows.append( @@ -548,7 +693,7 @@ def __init__(self, mean_only=True, ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) elif transformer_flow_type == "fft": for i in range(n_flows): self.flows.append( @@ -562,11 +707,11 @@ def __init__(self, mean_only=True ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) elif transformer_flow_type == "mono_layer_inter_residual": for i in range(n_flows): self.flows.append( - Modules.ResidualCouplingLayer( + modules.ResidualCouplingLayer( channels, hidden_channels, kernel_size, @@ -576,7 +721,7 @@ def __init__(self, mean_only=True ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) self.flows.append( MonoTransformerFlowLayer( channels, hidden_channels, mean_only=True @@ -585,7 +730,7 @@ def __init__(self, elif transformer_flow_type == "mono_layer_post_residual": for i in range(n_flows): self.flows.append( - Modules.ResidualCouplingLayer( + modules.ResidualCouplingLayer( channels, hidden_channels, kernel_size, @@ -595,7 +740,7 @@ def __init__(self, mean_only=True, ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) self.flows.append( MonoTransformerFlowLayer( channels, @@ -607,7 +752,7 @@ def __init__(self, else: for i in range(n_flows): self.flows.append( - Modules.ResidualCouplingLayer( + modules.ResidualCouplingLayer( channels, hidden_channels, kernel_size, @@ -617,7 +762,7 @@ def __init__(self, mean_only=True ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) def forward(self, x, x_mask, g=None, reverse=False): if not reverse: @@ -649,11 +794,11 @@ def __init__(self, self.gin_channels = gin_channels self.pre = nn.Conv1d(in_channels, hidden_channels, 1) - self.enc = Modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) + self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) def forward(self, x, x_lengths, g=None): # x: LinearSpectrum; g: GlobalCondition - x_mask = torch.unsqueeze(Commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) stats = self.proj(x) * x_mask @@ -677,7 +822,7 @@ def __init__(self, self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 
1, padding=3) - resblock = Modules.ResBlock1 if resblock == '1' else Modules.ResBlock2 + resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 self.ups = nn.ModuleList() for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): @@ -694,7 +839,7 @@ def __init__(self, self.resblocks.append(resblock(ch, k, d)) self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) - self.ups.apply(Commons.init_weights) + self.ups.apply(commons.init_weights) if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) @@ -705,7 +850,7 @@ def forward(self, x, g=None): x = x + self.cond(g) for i in range(self.num_upsamples): - x = F.leaky_relu(x, Modules.LRELU_SLOPE) + x = F.leaky_relu(x, modules.LRELU_SLOPE) x = self.ups[i](x) xs = None for j in range(self.num_kernels): @@ -728,6 +873,145 @@ def remove_weight_norm(self): l.remove_weight_norm() +class DiscriminatorP(torch.nn.Module): + def __init__(self, + period, + kernel_size=5, + stride=3, + use_spectral_norm=False + ): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(commons.get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(commons.get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(commons.get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(commons.get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(commons.get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [DiscriminatorP(i, 
use_spectral_norm=use_spectral_norm) for i in periods] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + class SynthesizerTrn(nn.Module): """ Synthesizer for Training @@ -879,7 +1163,7 @@ def forward(self, x, x_lengths, y, y_lengths, sid=None): m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) - z_slice, ids_slice = Commons.rand_slice_segments(z, y_lengths, self.segment_size) + z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size) o = self.dec(z_slice, g=g) return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q), (x, logw, logw_) @@ -899,9 +1183,9 @@ def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_sca w = torch.exp(logw) * x_mask * length_scale w_ceil = torch.ceil(w) y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() - y_mask = torch.unsqueeze(Commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) - attn = Commons.generate_path(w_ceil, attn_mask) + attn = commons.generate_path(w_ceil, attn_mask) m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] @@ -909,4 +1193,17 @@ def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_sca z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale z = self.flow(z_p, y_mask, g=g, reverse=True) o = self.dec((z * y_mask)[:, :, :max_len], g=g) - return o, attn, y_mask, (z, z_p, m_p, logs_p) \ No newline at end of file + return o, attn, y_mask, (z, z_p, m_p, logs_p) + + ''' + ## (obsolete) currently vits-2 is not capable of voice conversion + def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): + assert self.n_speakers > 0, "n_speakers have to be larger than 0." 
+ g_src = self.emb_g(sid_src).unsqueeze(-1) + g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) + z_p = self.flow(z, y_mask, g=g_src) + z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) + o_hat, o_hat_mb = self.dec(z_hat * y_mask, g=g_tgt) + return o_hat, o_hat_mb, y_mask, (z, z_p, z_hat) + ''' \ No newline at end of file diff --git a/EVT_Core/Train/VITS/vits/Modules.py b/EVT_Core/Train/VITS/VITS2_finetuning/Modules.py similarity index 99% rename from EVT_Core/Train/VITS/vits/Modules.py rename to EVT_Core/Train/VITS/VITS2_finetuning/Modules.py index 62da8a3..7a845a1 100644 --- a/EVT_Core/Train/VITS/vits/Modules.py +++ b/EVT_Core/Train/VITS/VITS2_finetuning/Modules.py @@ -5,8 +5,8 @@ from torch.nn import Conv1d from torch.nn.utils import weight_norm, remove_weight_norm -from .Commons import fused_add_tanh_sigmoid_multiply, init_weights, get_padding -from .Transforms import piecewise_rational_quadratic_transform +from commons import fused_add_tanh_sigmoid_multiply, init_weights, get_padding +from transforms import piecewise_rational_quadratic_transform LRELU_SLOPE = 0.1 diff --git a/EVT_Core/Train/VITS/vits/Transforms.py b/EVT_Core/Train/VITS/VITS2_finetuning/Transforms.py similarity index 100% rename from EVT_Core/Train/VITS/vits/Transforms.py rename to EVT_Core/Train/VITS/VITS2_finetuning/Transforms.py diff --git a/EVT_Core/Train/VITS/VITS2_finetuning/Utils.py b/EVT_Core/Train/VITS/VITS2_finetuning/Utils.py new file mode 100644 index 0000000..d9d6485 --- /dev/null +++ b/EVT_Core/Train/VITS/VITS2_finetuning/Utils.py @@ -0,0 +1,261 @@ +import os +import re +import glob +import sys +import shutil +import logging +logging.basicConfig(stream = sys.stdout, encoding = 'utf-8') +logger = logging +import json +import subprocess +import matplotlib +import matplotlib.pylab as plt +import numpy as np +import torch +from typing import Optional +from pathlib import Path + + +MATPLOTLIB_FLAG = False + + +def load_checkpoint(checkpoint_path, model, optimizer, keep_speaker_emb: bool = False): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') + iteration = checkpoint_dict['iteration'] + learning_rate = checkpoint_dict['learning_rate'] + optimizer.load_state_dict(checkpoint_dict['optimizer']) if optimizer is not None else None + def get_new_state_dict(state_dict, saved_state_dict, keep_speaker_emb): + new_state_dict = {} + for layer_param, tensor in state_dict.items(): + try: # Assign tensor of layer param from saved state dict to new state dict while layer param is not embedding's weight, otherwise use the current tensor + if layer_param == 'emb_g.weight': + if keep_speaker_emb: # Keep the original speaker embedding, otherwise drop it + tensor[:saved_state_dict[layer_param].shape[0], :] = saved_state_dict[layer_param] + new_state_dict[layer_param] = tensor + else: + new_state_dict[layer_param] = saved_state_dict[layer_param] + except: + logger.info("%s is not in the checkpoint" % layer_param) + new_state_dict[layer_param] = tensor + return new_state_dict + if hasattr(model, 'module'): + model.module.load_state_dict(get_new_state_dict(model.module.state_dict(), checkpoint_dict['model'], keep_speaker_emb)) + else: + model.load_state_dict(get_new_state_dict(model.state_dict(), checkpoint_dict['model'], keep_speaker_emb)) + logger.info(f"Loaded checkpoint '{checkpoint_path}' (iteration {iteration})") + return model, optimizer, learning_rate, iteration + + +def save_checkpoint(model, optimizer, 
learning_rate, iteration, checkpoint_path): # fix issue: torch.save doesn't support chinese path + logger.info(f"Saving model and optimizer state at iteration {iteration} to {checkpoint_path}") + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + checkpoint_path_tmp = Path(Path(checkpoint_path).root).joinpath("checkpoint_tmp").as_posix() + torch.save( + { + 'model': state_dict, + 'iteration': iteration, + 'optimizer': optimizer.state_dict(), + 'learning_rate': learning_rate + }, + checkpoint_path_tmp + ) + shutil.move(checkpoint_path_tmp, Path(checkpoint_path).parent.as_posix()) + + +def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats='HWC') + for k, v in audios.items(): + writer.add_audio(k, v, global_step, audio_sampling_rate) + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + f_list = glob.glob(os.path.normpath(os.path.join(dir_path, regex))) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + print(x) + return x + + +def remove_old_checkpoints(cp_dir, prefixes=['G_*.pth', 'D_*.pth', 'DUR_*.pth']): + def scan_checkpoint(dir_path, regex): + f_list = glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + if len(f_list) == 0: + return None + return f_list + for prefix in prefixes: + sorted_ckpts = scan_checkpoint(cp_dir, prefix) + if sorted_ckpts and len(sorted_ckpts) > 3: + for ckpt_path in sorted_ckpts[:-3]: + os.remove(ckpt_path) + print("removed {}".format(ckpt_path)) + + +def plot_spectrogram_to_numpy(spectrogram): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + + fig, ax = plt.subplots(figsize=(10,2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation='none') + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower', interpolation='none') + fig.colorbar(im, ax=ax) + xlabel = 'Decoder timestep' + if info is not None: + xlabel += '\n\n' + info + plt.xlabel(xlabel) + plt.ylabel('Encoder timestep') + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_audiopaths_sid_text(filename, split = "|"): + with open(filename, 'r', encoding = 'utf-8') as f: + audiopaths_sid_text = [line.strip().split(split) for line in f] + return audiopaths_sid_text + + +def add_elements( + Iterable1, + Iterable2 +): + ''' + Add unique elements form Iterable2 to Iterable1 + 
''' + def GetDictKeys(Iterable): + return sorted(Iterable.keys(), key = lambda Key: Iterable[Key]) if isinstance(Iterable, dict) else Iterable + Iterable1, Iterable2 = GetDictKeys(Iterable1), GetDictKeys(Iterable2) + for Element in Iterable2: + Iterable1.append(Element) if Element not in Iterable1 else None + return Iterable1 + + +def check_git_hash(model_dir): + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.normpath(os.path.join(source_dir, ".git"))): + logger.warn(f"{source_dir} is not a git repository, therefore hash value comparison will be ignored.") + return + + cur_hash = subprocess.getoutput("git rev-parse HEAD") + + path = os.path.normpath(os.path.join(model_dir, "githash")) + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warn("git hash values are different. {}(saved) != {}(current)".format(saved_hash[:8], cur_hash[:8])) + else: + open(path, "w").write(cur_hash) + + +def get_logger(model_dir, filename="train.log"): + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.DEBUG) + + if not os.path.exists(model_dir): + os.makedirs(model_dir) + handler = logging.FileHandler(os.path.normpath(os.path.join(model_dir, filename))) + handler.setLevel(logging.DEBUG) + handler.setFormatter(logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")) + logger.addHandler(handler) + return logger + + +class HParams(): + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() + + +def get_hparams( + Config_Path: str, + Model_Dir: Optional[str] = None +): + with open(Config_Path, 'r', encoding = 'utf-8') as f: + data = f.read() + config = json.loads(data) + hparams = HParams(**config) + + if Model_Dir is not None: + os.makedirs(Model_Dir) if not Path(Model_Dir).exists() else None + hparams.model_dir = Model_Dir + + return hparams + + +def Get_Config_Path(ConfigPath): + if Path(ConfigPath).is_dir(): + ConfigPaths = [File for File in os.listdir(ConfigPath) if Path(File).suffix == '.json'] + ConfigPath = sorted(ConfigPaths, key = lambda ConfigPath: re.sub(r'[A-Za-z]+', '', Path(ConfigPath).name))[-1] + return ConfigPath + + +def Get_Model_Path(ModelPath): + if Path(ModelPath).is_dir(): + ModelPaths = [File for File in os.listdir(ModelPath) if Path(File).suffix == '.pth' and 'G_' in File] + ModelPath = sorted(ModelPaths, key = lambda ModelPath: re.sub(r'G_[A-Za-z]+', '', Path(ModelPath).name))[-1] + return ModelPath \ No newline at end of file diff --git a/EVT_Core/Train/VITS/vits/__init__.py b/EVT_Core/Train/VITS/VITS2_finetuning/__init__.py similarity index 100% rename from EVT_Core/Train/VITS/vits/__init__.py rename to EVT_Core/Train/VITS/VITS2_finetuning/__init__.py diff --git a/EVT_Core/Train/VITS/configs/mandarin_base.json b/EVT_Core/Train/VITS/VITS2_finetuning/configs/mandarin_base.json similarity index 100% rename from EVT_Core/Train/VITS/configs/mandarin_base.json rename to EVT_Core/Train/VITS/VITS2_finetuning/configs/mandarin_base.json 
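The new `VITS2_finetuning/Utils.py` above bundles the config and checkpoint helpers (`HParams`, `get_hparams`, `latest_checkpoint_path`, `Get_Config_Path`, `Get_Model_Path`) that `preprocess.py` and `train.py` rely on. A minimal usage sketch, not part of this patch: it assumes a hypothetical `Output` directory holding `Config.json` and `G_*.pth` checkpoints, and imports from `utils` the way `train.py` in this patch does (the file itself is added as `Utils.py`).

```python
# Sketch only: how the helpers added in Utils.py fit together.
from pathlib import Path

from utils import get_hparams, latest_checkpoint_path, Get_Config_Path, Get_Model_Path

output_dir = "Output"  # hypothetical training output directory

# Pick the newest *.json in the directory, then load it into an attribute-style HParams tree
config_name = Get_Config_Path(output_dir)
hps = get_hparams(Config_Path=Path(output_dir).joinpath(config_name).as_posix(), Model_Dir=output_dir)
print(hps.train.batch_size, hps.data.sampling_rate)   # nested dicts become nested HParams

# Locate the newest generator checkpoint for resuming or inference
print(Get_Model_Path(output_dir))                      # newest G_*.pth, returned as a file name
print(latest_checkpoint_path(output_dir, "G_*.pth"))   # full-path variant used by train.py
```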
diff --git a/EVT_Core/Train/VITS/configs/mandarin_english_japanese_base.json b/EVT_Core/Train/VITS/VITS2_finetuning/configs/mandarin_english_japanese_base.json similarity index 100% rename from EVT_Core/Train/VITS/configs/mandarin_english_japanese_base.json rename to EVT_Core/Train/VITS/VITS2_finetuning/configs/mandarin_english_japanese_base.json diff --git a/EVT_Core/TTS/VITS/vits/monotonic_align/Core.py b/EVT_Core/Train/VITS/VITS2_finetuning/monotonic_align/Core.py similarity index 77% rename from EVT_Core/TTS/VITS/vits/monotonic_align/Core.py rename to EVT_Core/Train/VITS/VITS2_finetuning/monotonic_align/Core.py index 11aba18..1c44515 100644 --- a/EVT_Core/TTS/VITS/vits/monotonic_align/Core.py +++ b/EVT_Core/Train/VITS/VITS2_finetuning/monotonic_align/Core.py @@ -3,8 +3,8 @@ # Compile a Python function into native code @nb.jit( - # func( paths, values, t_ys, t_xs ) - nb.void(nb.int32[:,:,::1], nb.float32[:,:,::1], nb.int32[::1], nb.int32[::1]), + # func( paths, values, t_ys, t_xs ) + nb.void(nb.int32[:, :, ::1], nb.float32[:, :, ::1], nb.int32[::1], nb.int32[::1]), nogil = True, nopython = True ) @@ -39,17 +39,17 @@ def maximum_path_nb(paths, values, t_ys, t_xs): if x == y: v_cur = max_neg_val else: - v_cur = value[y-1, x] + v_cur = value[y - 1, x] if x == 0: if y == 0: v_prev = 0. else: v_prev = max_neg_val else: - v_prev = value[y-1, x-1] + v_prev = value[y - 1, x - 1] value[y, x] += max(v_prev, v_cur) for y in range(t_y - 1, -1, -1): path[y, index] = 1 - if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]): + if index != 0 and (index == y or value[y - 1, index] < value[y - 1, index - 1]): index = index - 1 \ No newline at end of file diff --git a/EVT_Core/TTS/VITS/vits/monotonic_align/__init__.py b/EVT_Core/Train/VITS/VITS2_finetuning/monotonic_align/__init__.py similarity index 89% rename from EVT_Core/TTS/VITS/vits/monotonic_align/__init__.py rename to EVT_Core/Train/VITS/VITS2_finetuning/monotonic_align/__init__.py index 19b25b7..81b52ab 100644 --- a/EVT_Core/TTS/VITS/vits/monotonic_align/__init__.py +++ b/EVT_Core/Train/VITS/VITS2_finetuning/monotonic_align/__init__.py @@ -1,11 +1,11 @@ import numpy as np import torch -from .Core import maximum_path_nb +from .core import maximum_path_nb def maximum_path(neg_cent, mask): - """ Cython optimized version. + """ Numba optimized version. 
neg_cent: [b, t_t, t_s] mask: [b, t_t, t_s] """ diff --git a/EVT_Core/Train/VITS/VITS2_finetuning/preprocess.py b/EVT_Core/Train/VITS/VITS2_finetuning/preprocess.py new file mode 100644 index 0000000..7b585bb --- /dev/null +++ b/EVT_Core/Train/VITS/VITS2_finetuning/preprocess.py @@ -0,0 +1,186 @@ +import os +import re +import argparse +import json +import torchaudio +from typing import Optional +from pathlib import Path +from concurrent.futures import ThreadPoolExecutor + +from utils import ( + load_audiopaths_sid_text, + add_elements +) +from text import ( + _clean_text +) + + +parser = argparse.ArgumentParser() +parser.add_argument("--FileList_Path_Training", type = str, default = "train.txt") +parser.add_argument("--FileList_Path_Validation", type = str, default = "val.txt") +parser.add_argument("--Set_Epochs", type = int, default = 10000) +parser.add_argument("--Set_Eval_Interval", type = int, default = 1000) +parser.add_argument("--Set_Batch_Size", type = int, default = 16) +parser.add_argument("--Set_FP16_Run", type = bool, default = True) +parser.add_argument("--Keep_Original_Speakers", type = bool, default = False) +parser.add_argument("--Config_Path_Load", type = Optional[str], default = None) +parser.add_argument("--Output_Root", type = str, default = "./") +parser.add_argument("--Output_Dir_Name", type = str, default = "Output") +parser.add_argument("--Output_Config_Name", type = str, default = "Config.json") +args = parser.parse_args() + +FileList_Path_Training = str(os.environ.get('FileList_Path_Training', str(args.FileList_Path_Training))) +FileList_Path_Validation = str(os.environ.get('FileList_Path_Validation', str(args.FileList_Path_Validation))) +Set_Epochs = int(os.environ.get('Set_Epochs', str(args.Set_Epochs))) +Set_Eval_Interval = int(os.environ.get('Set_Eval_Interval', str(args.Set_Eval_Interval))) +Set_Batch_Size = int(os.environ.get('Set_Batch_Size', str(args.Set_Batch_Size))) +Set_FP16_Run = eval(os.environ.get('Set_FP16_Run', str(args.Set_FP16_Run))) +Keep_Original_Speakers = eval(os.environ.get('Keep_Original_Speakers', str(args.Keep_Original_Speakers))) +Config_Path_Load = str(os.environ.get('Config_Path_Load', str(args.Config_Path_Load))) if Keep_Original_Speakers else None +Output_Root = str(os.environ.get('Output_Root', str(args.Output_Root))) +Output_Dir_Name = str(os.environ.get('Output_Dir_Name', str(args.Output_Dir_Name))) +Output_Config_Name = str(os.environ.get('Output_Config_Name', str(args.Output_Config_Name))) + +Dir_Output = Path(Output_Root).joinpath(Output_Dir_Name).as_posix() +os.makedirs(Dir_Output, exist_ok = True) +Config_Path_Edited = Path(Dir_Output).joinpath(Output_Config_Name).__str__() +FileList_Path_Training_Updated = Path(Config_Path_Edited).parent.joinpath(Path(FileList_Path_Training).name).__str__() +FileList_Path_Validation_Updated = Path(Config_Path_Edited).parent.joinpath(Path(FileList_Path_Validation).name).__str__() +Out_Extension = "cleaned" + + +def Configurator(): + ''' + Edit JSON file + ''' + def Get_Languages(Text_Path_Training, Text_Path_Validation): + Languages = [] + for Text_Path in [Text_Path_Training, Text_Path_Validation]: + with open(file = Text_Path, mode = 'r', encoding = 'utf-8') as File: + Lines = File.readlines() + for _, Line in enumerate(Lines): + Line_Text = Line.split('|', maxsplit = 2)[2] + Language = re.split(r'[\[\]]', Line_Text)[1] + Languages.append(Language) if Language not in Languages else None + if set(Languages).issubset({'ZH', 'EN', 'JA'}): + if set(Languages) == {'ZH'}: + return "mandarin" + 
else: + return "mandarin_english_japanese" + else: + raise Exception('Unsupported language!') + + def Get_NewSpeakers(Text_Path_Training, Text_Path_Validation): + Speakers = [] + for Text_Path in [Text_Path_Training, Text_Path_Validation]: + with open(file = Text_Path, mode = 'r', encoding = 'utf-8') as File: + Lines = File.readlines() + for _, Line in enumerate(Lines): + Speaker = Line.split('|', maxsplit = 2)[1] + Speakers.append(Speaker) if Speaker not in Speakers else None + return Speakers + + def Get_OldSpeakers(Config_Path_Load): + if Config_Path_Load is not None and Path(Config_Path_Load).exists(): + with open(file = Config_Path_Load, mode = 'rb') as ConfigFile_Extra: + OldSpeakers = json.load(ConfigFile_Extra)["speakers"] + else: + OldSpeakers = [] + return OldSpeakers + + Language = Get_Languages(FileList_Path_Training, FileList_Path_Validation) + NewSpeakers = Get_NewSpeakers(FileList_Path_Training, FileList_Path_Validation) + OldSpeakers = Get_OldSpeakers(Config_Path_Load) if Keep_Original_Speakers else [] + + with open(file = Path(__file__).parent.joinpath('./configs', f'{Language}_base.json').__str__(), mode = 'rb') as ConfigFile_Default: + Params = json.load(ConfigFile_Default) + try: + Params_Old = Params + Params_Old["train"]["eval_interval"] = Set_Eval_Interval + Params_Old["train"]["epochs"] = Set_Epochs + Params_Old["train"]["batch_size"] = Set_Batch_Size + Params_Old["train"]["fp16_run"] = Set_FP16_Run + Params_Old["data"]["training_files"] = f'{FileList_Path_Training_Updated}.{Out_Extension}' + Params_Old["data"]["validation_files"] = f'{FileList_Path_Validation_Updated}.{Out_Extension}' + Params_Old["data"]["text_cleaners"] = [(Language + "_cleaners").lower()] + Params_Old["data"]["n_speakers"] = add_elements(OldSpeakers, NewSpeakers).__len__() + Params_Old["speakers"] = add_elements(OldSpeakers, NewSpeakers) + Params_New = Params_Old + except: + raise Exception("Please check if params exist") + with open(Config_Path_Edited, 'w', encoding = 'utf-8') as File_New: + json.dump(Params_New, File_New, indent = 4) + print(f"Config created in {Dir_Output}") + + +def Cleaner(): + ''' + Convert natural language text to symbols + ''' + def Update_SID(Config_Path, Text_Path, Save_Path): + with open(file = Config_Path, mode = 'rb') as ConfigFile: + NewSpeakers = json.load(ConfigFile)["speakers"] + with open(file = Text_Path, mode = 'r', encoding = 'utf-8') as TextFile: + Lines = TextFile.readlines() + for Index, Line in enumerate(Lines): + Line_Path = Line.split('|', maxsplit = 1)[0] + Line_Path = Path(Text_Path).parent.joinpath(Line_Path).as_posix() if not Path(Line_Path).is_absolute() else Line_Path + Speaker = Line.split('|', maxsplit = 2)[1] + SpeakerID = NewSpeakers.index(Speaker) + Line_Text = Line.split('|', maxsplit = 2)[2] + Line = f"{Line_Path}|{SpeakerID}|{Line_Text}" + Lines[Index] = Line + with open(file = Save_Path, mode = 'w', encoding = 'utf-8') as TextFile: + TextFile.writelines(Lines) + + def Get_Cleaners(Config_Path): + with open(file = Config_Path, mode = 'rb') as ConfigFile: + NewCleaners = json.load(ConfigFile)["data"]["text_cleaners"] + return NewCleaners + + for Index, FileList in enumerate([FileList_Path_Training, FileList_Path_Validation]): + print("START:", FileList) + FileList_Updated = [FileList_Path_Training_Updated, FileList_Path_Validation_Updated][Index] + Update_SID(Config_Path_Edited, FileList, FileList_Updated) + Path_SID_Text = load_audiopaths_sid_text(FileList_Updated) + for i in range(len(Path_SID_Text)): + Path_SID_Text[i][2] = 
_clean_text(Path_SID_Text[i][2], Get_Cleaners(Config_Path_Edited)) + Filelist_Cleaned = FileList_Updated + "." + Out_Extension + with open(Filelist_Cleaned, 'w', encoding = 'utf-8') as f: + f.writelines(["|".join(x) + "\n" for x in Path_SID_Text]) + + +def Resampler(): + ''' + Resample dataset audio to fit the sampling rate setting in config + ''' + def Get_Resample_List(Config_Path, Text_Path): + ResampleList = [] + with open(file = Config_Path, mode = 'rb') as ConfigFile: + SampleRate_New = json.load(ConfigFile)['data']['sampling_rate'] + with open(file = Text_Path, mode = 'r', encoding = 'utf-8') as TextFile: + Lines = TextFile.readlines() + for Line in Lines: + Line_Path = Line.split('|', maxsplit = 1)[0] + ResampleList.append((Line_Path, SampleRate_New)) + return ResampleList + + def Resample(Audio_Path, SampleRate_New): + AudioData_Old, SampleRate_Old = torchaudio.load(Audio_Path) + AudioData_New = torchaudio.transforms.Resample(orig_freq = SampleRate_Old, new_freq = SampleRate_New)(AudioData_Old) + torchaudio.save(Audio_Path, src = AudioData_New, sample_rate = SampleRate_New) + + for FileList in (FileList_Path_Validation, FileList_Path_Training): + print("Resampling audio according to", FileList) + with ThreadPoolExecutor(max_workers = os.cpu_count()) as Executor: + Executor.map( + Resample, + *zip(*Get_Resample_List(Config_Path_Edited, FileList)) + ) + + +if __name__ == "__main__": + Configurator() + Cleaner() + Resampler() \ No newline at end of file diff --git a/EVT_Core/Train/VITS/VITS2_finetuning/text/LICENSE b/EVT_Core/Train/VITS/VITS2_finetuning/text/LICENSE new file mode 100644 index 0000000..4ad4ed1 --- /dev/null +++ b/EVT_Core/Train/VITS/VITS2_finetuning/text/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2017 Keith Ito + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
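To make the `Configurator`/`Cleaner` flow in `preprocess.py` above concrete, here is an illustrative sketch (hypothetical paths, speakers and text, not part of this patch) of the filelist format it expects and of the speaker-to-ID substitution that `Update_SID` performs before the text cleaners run and `<filelist>.cleaned` is written.

```python
# Each filelist line is "<audio path>|<speaker name>|[<LANG>]<text>[<LANG>]":
# Get_Languages() reads the [<LANG>] tags, Get_NewSpeakers() collects the speaker names.
raw_lines = [
    "wavs/a_001.wav|Speaker_A|[ZH]你好[ZH]\n",          # hypothetical entries
    "wavs/b_001.wav|Speaker_B|[EN]Hello there.[EN]\n",
]
speakers = ["Speaker_A", "Speaker_B"]  # what Get_NewSpeakers() would return here

# Update_SID() swaps the speaker name for its index in the config's "speakers" list
# (it also resolves relative audio paths against the filelist location).
updated = []
for line in raw_lines:
    path, speaker, text = line.split("|", maxsplit=2)
    updated.append(f"{path}|{speakers.index(speaker)}|{text}")

print(updated[1], end="")  # wavs/b_001.wav|1|[EN]Hello there.[EN]
```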
diff --git a/EVT_Core/TTS/VITS/vits/text/__init__.py b/EVT_Core/Train/VITS/VITS2_finetuning/text/__init__.py similarity index 60% rename from EVT_Core/TTS/VITS/vits/text/__init__.py rename to EVT_Core/Train/VITS/VITS2_finetuning/text/__init__.py index 17057e4..e56b1e5 100644 --- a/EVT_Core/TTS/VITS/vits/text/__init__.py +++ b/EVT_Core/Train/VITS/VITS2_finetuning/text/__init__.py @@ -5,6 +5,7 @@ # Mappings from symbol to numeric ID and vice versa: _symbol_to_id = {s: i for i, s in enumerate(symbols)} +_id_to_symbol = {i: s for i, s in enumerate(symbols)} def _clean_text(text, cleaner_names): @@ -32,4 +33,24 @@ def text_to_sequence(text, cleaner_names): continue symbol_id = _symbol_to_id[symbol] sequence += [symbol_id] - return sequence \ No newline at end of file + return sequence + + +def cleaned_text_to_sequence(cleaned_text): + '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. + Args: + text: string to convert to a sequence + Returns: + List of integers corresponding to the symbols in the text + ''' + sequence = [_symbol_to_id[symbol] for symbol in cleaned_text if symbol in _symbol_to_id.keys()] + return sequence + + +def sequence_to_text(sequence): + '''Converts a sequence of IDs back to a string''' + result = '' + for symbol_id in sequence: + s = _id_to_symbol[symbol_id] + result += s + return result \ No newline at end of file diff --git a/EVT_Core/Train/VITS/vits/text/chinesedialect.py b/EVT_Core/Train/VITS/VITS2_finetuning/text/chinesedialect.py similarity index 100% rename from EVT_Core/Train/VITS/vits/text/chinesedialect.py rename to EVT_Core/Train/VITS/VITS2_finetuning/text/chinesedialect.py diff --git a/EVT_Core/Train/VITS/vits/text/cleaners.py b/EVT_Core/Train/VITS/VITS2_finetuning/text/cleaners.py similarity index 100% rename from EVT_Core/Train/VITS/vits/text/cleaners.py rename to EVT_Core/Train/VITS/VITS2_finetuning/text/cleaners.py diff --git a/EVT_Core/Train/VITS/vits/text/english.py b/EVT_Core/Train/VITS/VITS2_finetuning/text/english.py similarity index 100% rename from EVT_Core/Train/VITS/vits/text/english.py rename to EVT_Core/Train/VITS/VITS2_finetuning/text/english.py diff --git a/EVT_Core/Train/VITS/vits/text/japanese.py b/EVT_Core/Train/VITS/VITS2_finetuning/text/japanese.py similarity index 100% rename from EVT_Core/Train/VITS/vits/text/japanese.py rename to EVT_Core/Train/VITS/VITS2_finetuning/text/japanese.py diff --git a/EVT_Core/Train/VITS/vits/text/mandarin.py b/EVT_Core/Train/VITS/VITS2_finetuning/text/mandarin.py similarity index 100% rename from EVT_Core/Train/VITS/vits/text/mandarin.py rename to EVT_Core/Train/VITS/VITS2_finetuning/text/mandarin.py diff --git a/EVT_Core/Train/VITS/vits/text/symbols.py b/EVT_Core/Train/VITS/VITS2_finetuning/text/symbols.py similarity index 100% rename from EVT_Core/Train/VITS/vits/text/symbols.py rename to EVT_Core/Train/VITS/VITS2_finetuning/text/symbols.py diff --git a/EVT_Core/Train/VITS/VITS2_finetuning/train.py b/EVT_Core/Train/VITS/VITS2_finetuning/train.py new file mode 100644 index 0000000..11ba3a0 --- /dev/null +++ b/EVT_Core/Train/VITS/VITS2_finetuning/train.py @@ -0,0 +1,520 @@ +import os +import sys +import platform +import argparse +import logging +logging.basicConfig(stream = sys.stdout, encoding = 'utf-8') +logging.getLogger('numba').setLevel(logging.WARNING) +import torch +from torch.nn import functional as F +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +import torch.multiprocessing as mp +import torch.distributed as 
dist +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.cuda.amp import autocast, GradScaler +torch.backends.cudnn.benchmark = True +from typing import Optional +from pathlib import Path +from tqdm import tqdm + +from data_utils import ( + TextAudioSpeakerLoader, + TextAudioSpeakerCollate, + DistributedBucketSampler +) +from models import ( + AVAILABLE_FLOW_TYPES, + AVAILABLE_DURATION_DISCRIMINATOR_TYPES, + SynthesizerTrn, + MultiPeriodDiscriminator, + DurationDiscriminatorV1, + DurationDiscriminatorV2 +) +from mel_processing import ( + mel_spectrogram_torch, + spec_to_mel_torch +) +from commons import ( + slice_segments, + clip_grad_value_ +) +from losses import ( + generator_loss, + discriminator_loss, + feature_loss, + kl_loss +) +from utils import ( + plot_spectrogram_to_numpy, + summarize, + plot_alignment_to_numpy, + save_checkpoint, + get_logger, + #check_git_hash, + load_checkpoint, + remove_old_checkpoints, + latest_checkpoint_path, + get_hparams +) +#from text import symbols +from text.symbols import symbols +from preprocess import args as preprocess_args + + +parser = argparse.ArgumentParser() +parser.add_argument("--Num_Workers", type = int, default = 4) +parser.add_argument("--Use_PretrainedModels", type = bool, default = True) +parser.add_argument("--Model_Path_Pretrained_G", type = Optional[str], default = None) +parser.add_argument("--Model_Path_Pretrained_D", type = Optional[str], default = None) +parser.add_argument("--Keep_Original_Speakers", type = bool, default = preprocess_args.Keep_Original_Speakers) +parser.add_argument("--Output_Root", type = str, default = preprocess_args.Output_Root) +parser.add_argument("--Output_Dir_Name", type = str, default = preprocess_args.Output_Dir_Name) +parser.add_argument("--Output_Config_Name", type = str, default = preprocess_args.Output_Config_Name) +parser.add_argument("--Output_LogDir", type = str, default = "./") +args = parser.parse_args() + +Num_Workers = int(os.environ.get('Num_Workers', str(args.Num_Workers))) +Use_PretrainedModels = eval(os.environ.get('Use_PretrainedModels', str(args.Use_PretrainedModels))) +Model_Path_Pretrained_G = str(os.environ.get('Model_Path_Pretrained_G', str(args.Model_Path_Pretrained_G))) if Use_PretrainedModels else None +Model_Path_Pretrained_D = str(os.environ.get('Model_Path_Pretrained_D', str(args.Model_Path_Pretrained_D))) if Use_PretrainedModels else None +Keep_Original_Speakers = eval(os.environ.get('Keep_Original_Speakers', str(args.Keep_Original_Speakers))) +Output_Root = str(os.environ.get('Output_Root', str(args.Output_Root))) +Output_Dir_Name = str(os.environ.get('Output_Dir_Name', str(args.Output_Dir_Name))) +Output_Config_Name = str(os.environ.get('Output_Config_Name', str(args.Output_Config_Name))) +Log_Dir = str(os.environ.get('Output_LogDir', str(args.Output_LogDir))) + +Dir_Output = Path(Output_Root).joinpath(Output_Dir_Name).as_posix() +Config_Path = Path(Dir_Output).joinpath(Output_Config_Name).__str__() + +global_step = 0 + + +def evaluate(hps, generator, eval_loader, writer_eval): + generator.eval() + with torch.no_grad(): + for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(eval_loader): + x, x_lengths = x.cuda(0), x_lengths.cuda(0) + spec, spec_lengths = spec.cuda(0), spec_lengths.cuda(0) + y, y_lengths = y.cuda(0), y_lengths.cuda(0) + speakers = speakers.cuda(0) + + # remove else + x = x[:1] + x_lengths = x_lengths[:1] + spec = spec[:1] + spec_lengths = spec_lengths[:1] + y = y[:1] + y_lengths = y_lengths[:1] + 
speakers = speakers[:1] + break + y_hat, attn, mask, *_ = generator.module.infer(x, x_lengths, speakers, max_len=1000) + y_hat_lengths = mask.sum([1, 2]).long() * hps.data.hop_length + + mel = spec_to_mel_torch( + spec, + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.mel_fmin, + hps.data.mel_fmax + ) if not (hps.model.use_mel_posterior_encoder or hps.data.use_mel_posterior_encoder) else spec + y_hat_mel = mel_spectrogram_torch( + y_hat.squeeze(1).float(), + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + hps.data.mel_fmin, + hps.data.mel_fmax + ) + image_dict = {"gen/mel": plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy())} + audio_dict = {"gen/audio": y_hat[0, :, :y_hat_lengths[0]]} + if global_step == 0: + image_dict.update({"gt/mel": plot_spectrogram_to_numpy(mel[0].cpu().numpy())}) + audio_dict.update({"gt/audio": y[0, :, :y_lengths[0]]}) + + summarize( + writer=writer_eval, + global_step=global_step, + images=image_dict, + audios=audio_dict, + audio_sampling_rate=hps.data.sampling_rate + ) + generator.train() + + +def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers): + net_g, net_d, net_dur_disc = nets + optim_g, optim_d, optim_dur_disc = optims + scheduler_g, scheduler_d, scheduler_dur_disc = schedulers + train_loader, eval_loader = loaders + if writers is not None: + writer, writer_eval = writers + + train_loader.batch_sampler.set_epoch(epoch) + global global_step + + net_g.train() + net_d.train() + net_dur_disc.train() if net_dur_disc is not None else None + + if rank == 0: + loader = tqdm(train_loader, desc='Loading train data') + else: + loader = train_loader + + for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(loader): + if net_g.module.use_noise_scaled_mas: + current_mas_noise_scale = net_g.module.mas_noise_scale_initial - net_g.module.noise_scale_delta * global_step + net_g.module.current_mas_noise_scale = max(current_mas_noise_scale, 0.0) + x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True) + spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True) + y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True) + speakers = speakers.cuda(rank, non_blocking=True) + + with autocast(enabled=hps.train.fp16_run): + y_hat, l_length, attn, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q), (hidden_x, logw, logw_) = net_g(x, x_lengths, spec, spec_lengths, speakers) + + mel = spec_to_mel_torch( + spec.float(), + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.mel_fmin, + hps.data.mel_fmax + ) if not (hps.model.use_mel_posterior_encoder or hps.data.use_mel_posterior_encoder) else spec + y_mel = slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) + y_hat_mel = mel_spectrogram_torch( + y_hat.squeeze(1), + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + hps.data.mel_fmin, + hps.data.mel_fmax + ) + y = slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice + + # Discriminator + y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) + with autocast(enabled=False): + loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) + loss_disc_all = loss_disc + + # Duration Discriminator 
+ if net_dur_disc is not None: + y_dur_hat_r, y_dur_hat_g = net_dur_disc(hidden_x.detach(), x_mask.detach(), logw_.detach(), logw.detach()) + with autocast(enabled=False): + # TODO: I think need to mean using the mask, but for now, just mean all + loss_dur_disc, losses_dur_disc_r, losses_dur_disc_g = discriminator_loss(y_dur_hat_r, y_dur_hat_g) + loss_dur_disc_all = loss_dur_disc + optim_dur_disc.zero_grad() + scaler.scale(loss_dur_disc_all).backward() + scaler.unscale_(optim_dur_disc) + grad_norm_dur_disc = clip_grad_value_(net_dur_disc.parameters(), None) + scaler.step(optim_dur_disc) + + optim_d.zero_grad() + scaler.scale(loss_disc_all).backward() + scaler.unscale_(optim_d) + grad_norm_d = clip_grad_value_(net_d.parameters(), None) + scaler.step(optim_d) + + with autocast(enabled=hps.train.fp16_run): + # Generator + y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) + if net_dur_disc is not None: + y_dur_hat_r, y_dur_hat_g = net_dur_disc(hidden_x, x_mask, logw_, logw) + with autocast(enabled=False): + loss_dur = torch.sum(l_length.float()) + loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel + loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl + + loss_fm = feature_loss(fmap_r, fmap_g) + loss_gen, losses_gen = generator_loss(y_d_hat_g) + loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl + if net_dur_disc is not None: + loss_dur_gen, losses_dur_gen = generator_loss(y_dur_hat_g) + loss_gen_all += loss_dur_gen + + optim_g.zero_grad() + scaler.scale(loss_gen_all).backward() + scaler.unscale_(optim_g) + grad_norm_g = clip_grad_value_(net_g.parameters(), None) + scaler.step(optim_g) + scaler.update() + + if rank == 0: + if global_step % hps.train.log_interval == 0: + lr = optim_g.param_groups[0]['lr'] + losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl] + logger.info('Train Epoch: {} [{:.0f}%]'.format(epoch, 100. 
* batch_idx / len(train_loader))) + logger.info([x.item() for x in losses] + [global_step, lr]) + + scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g} + scalar_dict.update({"loss/dur_disc/total": loss_dur_disc_all, "grad_norm_dur_disc": grad_norm_dur_disc}) if net_dur_disc is not None else None + scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl}) + + scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) + scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) + scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) + + # if net_dur_disc is not None: + # scalar_dict.update({"loss/dur_disc_r" : f"{losses_dur_disc_r}"}) + # scalar_dict.update({"loss/dur_disc_g" : f"{losses_dur_disc_g}"}) + # scalar_dict.update({"loss/dur_gen" : f"{loss_dur_gen}"}) + + image_dict = { + "slice/mel_org": plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), + "slice/mel_gen": plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), + "all/mel": plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), + "all/attn": plot_alignment_to_numpy(attn[0,0].data.cpu().numpy()) + } + summarize( + writer=writer, + global_step=global_step, + images=image_dict, + scalars=scalar_dict) + + if global_step % hps.train.eval_interval == 0: + evaluate(hps, net_g, eval_loader, writer_eval) + save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, Path(hps.model_dir).joinpath("G_{}.pth".format(global_step)).__str__()) + save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, Path(hps.model_dir).joinpath("D_{}.pth".format(global_step)).__str__()) + save_checkpoint(net_dur_disc, optim_dur_disc, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "DUR_{}.pth".format(global_step))) if net_dur_disc is not None else None + + remove_old_checkpoints(hps.model_dir, prefixes=["G_*.pth", "D_*.pth", "DUR_*.pth"]) + global_step += 1 + + if rank == 0: + logger.info('====> Epoch: {}'.format(epoch)) + + +def run(rank, n_gpus, hps): + global global_step + net_dur_disc = None + if rank == 0: + logger = get_logger(hps.model_dir) + #logger.info(hps) + #check_git_hash(hps.model_dir) + writer = SummaryWriter(log_dir = Log_Dir) + writer_eval = SummaryWriter(log_dir = Path(Log_Dir).joinpath("eval").__str__()) + + dist.init_process_group( + backend = 'gloo' if platform.system() == 'Windows' else 'nccl', # Windows does not support the NCCL backend, so GLOO is used instead + init_method = 'env://', + world_size = n_gpus, + rank = rank + ) + + torch.manual_seed(hps.train.seed) + torch.cuda.set_device(rank) + + if "use_mel_posterior_encoder" in hps.model.keys() and hps.model.use_mel_posterior_encoder == True: + print("Using mel posterior encoder for VITS2") + posterior_channels = 80 # vits2 + hps.data.use_mel_posterior_encoder = True + else: + print("Using lin posterior encoder for VITS1") + posterior_channels = hps.data.filter_length // 2 + 1 + hps.data.use_mel_posterior_encoder = False + + train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data) + train_sampler = DistributedBucketSampler( + train_dataset, + hps.train.batch_size, + [32,300,400,500,600,700,800,900,1000], + num_replicas=n_gpus, + rank=rank, + shuffle=True + ) + collate_fn = TextAudioSpeakerCollate() + train_loader = DataLoader( + train_dataset, + num_workers=Num_Workers, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + 
batch_sampler=train_sampler + ) + if rank == 0: + eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data) + eval_loader = DataLoader( + eval_dataset, + num_workers=0, + shuffle=False, + batch_size=hps.train.batch_size, + pin_memory=True, + drop_last=False, + collate_fn=collate_fn + ) + + # some of these flags are not being used in the code and directly set in hps json file. + # they are kept here for reference and prototyping. + if "use_transformer_flows" in hps.model.keys() and hps.model.use_transformer_flows == True: + use_transformer_flows = True + transformer_flow_type = hps.model.transformer_flow_type + print(f"Using transformer flows {transformer_flow_type} for VITS2") + assert transformer_flow_type in AVAILABLE_FLOW_TYPES, f"transformer_flow_type must be one of {AVAILABLE_FLOW_TYPES}" + else: + print("Using normal flows for VITS1") + use_transformer_flows = False + + if "use_spk_conditioned_encoder" in hps.model.keys() and hps.model.use_spk_conditioned_encoder == True: + if hps.data.n_speakers == 0: + raise ValueError("n_speakers must be > 0 when using spk conditioned encoder to train multi-speaker model") + use_spk_conditioned_encoder = True + else: + print("Using normal encoder for VITS1") + use_spk_conditioned_encoder = False + + if "use_noise_scaled_mas" in hps.model.keys() and hps.model.use_noise_scaled_mas == True: + print("Using noise scaled MAS for VITS2") + use_noise_scaled_mas = True + mas_noise_scale_initial = 0.01 + noise_scale_delta = 2e-6 + else: + print("Using normal MAS for VITS1") + use_noise_scaled_mas = False + mas_noise_scale_initial = 0.0 + noise_scale_delta = 0.0 + + # Initialize VITS models and move to GPU + net_g = SynthesizerTrn( + len(symbols), + posterior_channels, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + mas_noise_scale_initial=mas_noise_scale_initial, + noise_scale_delta=noise_scale_delta, + **hps.model + ).cuda(rank) + net_d = MultiPeriodDiscriminator( + hps.model.use_spectral_norm + ).cuda(rank) + if "use_duration_discriminator" in hps.model.keys() and hps.model.use_duration_discriminator == True: + use_duration_discriminator = True + # add duration discriminator type here + duration_discriminator_type = getattr(hps.model, "duration_discriminator_type", "dur_disc_1") + print(f"Using duration_discriminator {duration_discriminator_type} for VITS2") + assert duration_discriminator_type in AVAILABLE_DURATION_DISCRIMINATOR_TYPES, f"duration_discriminator_type must be one of {AVAILABLE_DURATION_DISCRIMINATOR_TYPES}" + # duration_discriminator_type = AVAILABLE_DURATION_DISCRIMINATOR_TYPES  # disabled: this assignment would overwrite the selected type with the whole list, so no duration discriminator would ever be built + if duration_discriminator_type == "dur_disc_1": + net_dur_disc = DurationDiscriminatorV1( + hps.model.hidden_channels, + hps.model.hidden_channels, + 3, + 0.1, + gin_channels=hps.model.gin_channels if hps.data.n_speakers != 0 else 0, + ).cuda(rank) + elif duration_discriminator_type == "dur_disc_2": + net_dur_disc = DurationDiscriminatorV2( + hps.model.hidden_channels, + hps.model.hidden_channels, + 3, + 0.1, + gin_channels=hps.model.gin_channels if hps.data.n_speakers != 0 else 0, + ).cuda(rank) + else: + print("NOT using any duration discriminator like VITS1") + use_duration_discriminator = False + net_dur_disc = None + + # Build optimizers for the initialized VITS models + optim_g = torch.optim.AdamW( + filter(lambda net_g_params: net_g_params.requires_grad, net_g.parameters()), # Filter out params which don't require gradient + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps + ) + 
optim_d = torch.optim.AdamW( + net_d.parameters(), + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps + ) + optim_dur_disc = torch.optim.AdamW( + net_dur_disc.parameters(), + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps + ) if net_dur_disc is not None else None + + # Build DDP models for the initialized VITS models + net_g = DDP(net_g, device_ids = [rank], find_unused_parameters = True) + net_d = DDP(net_d, device_ids = [rank], find_unused_parameters = False) + net_dur_disc = DDP(net_dur_disc, device_ids=[rank]) if net_dur_disc is not None else None + + # Load state dict from checkpoint for the initialized VITS models and get the optimizer, learning rate and iteration + try: + _, optim_g, lr_g, epoch_str = load_checkpoint( + Model_Path_Pretrained_G if Use_PretrainedModels else latest_checkpoint_path(hps.model_dir, "G_*.pth"), + net_g, + optim_g, + Keep_Original_Speakers if Use_PretrainedModels else True + ) + _, optim_d, lr_d, epoch_str = load_checkpoint( + Model_Path_Pretrained_D if Use_PretrainedModels else latest_checkpoint_path(hps.model_dir, "D_*.pth"), + net_d, + optim_d, + Keep_Original_Speakers if Use_PretrainedModels else True + ) + _, _, _, epoch_str = load_checkpoint( + latest_checkpoint_path(hps.model_dir, "DUR_*.pth"), + net_dur_disc, + optim_dur_disc + ) if net_dur_disc is not None else (_, _, _, epoch_str) + + # To prevent KeyError: "param 'initial_lr' is not specified in param_groups[0] when resuming an optimizer" + if optim_g.param_groups[0].get('initial_lr') is None: + optim_g.param_groups[0]['initial_lr'] = lr_g + if optim_d.param_groups[0].get('initial_lr') is None: + optim_d.param_groups[0]['initial_lr'] = lr_d + + global_step = (epoch_str - 1) * len(train_loader) # > 0 + print(f"Continue from step {global_step}") + + except Exception as e: + epoch_str = 1 + global_step = 0 + print(f"Got Exception: {e}. Start from step 0") + + # Build learning rate schedulers for optimizers + scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma = hps.train.lr_decay, last_epoch = epoch_str - 2) + scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma = hps.train.lr_decay, last_epoch = epoch_str - 2) + scheduler_dur_disc = torch.optim.lr_scheduler.ExponentialLR(optim_dur_disc, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) if net_dur_disc is not None else None + + # Build gradient scaler + scaler = GradScaler(enabled = hps.train.fp16_run) + + # Start training (and evaluating) + for epoch in range(epoch_str, hps.train.epochs + 1): + if rank == 0: + train_and_evaluate( + rank, epoch, hps, [net_g, net_d, net_dur_disc], [optim_g, optim_d, optim_dur_disc], [scheduler_g, scheduler_d, scheduler_dur_disc], scaler, + [train_loader, eval_loader], logger, [writer, writer_eval] + ) + else: + train_and_evaluate( + rank, epoch, hps, [net_g, net_d, net_dur_disc], [optim_g, optim_d, optim_dur_disc], [scheduler_g, scheduler_d, scheduler_dur_disc], scaler, + [train_loader, None], None, None + ) + scheduler_g.step() + scheduler_d.step() + scheduler_dur_disc.step() if net_dur_disc is not None else None + + +if __name__ == "__main__": + # Assume Single Node Multi GPUs Training Only + assert torch.cuda.is_available(), "CPU training is not allowed." 
+ n_gpus = torch.cuda.device_count() + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '8000' + + hps = get_hparams( + Config_Path = Config_Path, + Model_Dir = Dir_Output + ) + mp.spawn(run, args = (n_gpus, hps,), nprocs = n_gpus) \ No newline at end of file diff --git a/EVT_GUI/EnvConfigurator.py b/EVT_GUI/EnvConfigurator.py index f866d2d..b15f1af 100644 --- a/EVT_GUI/EnvConfigurator.py +++ b/EVT_GUI/EnvConfigurator.py @@ -501,7 +501,10 @@ def Check_Pytorch(self, Package: str): def Install_Pytorch(self, Package: str, Reinstall: bool): DisplayCommand = 'cmd /c start cmd /k ' if platform.system() == 'Windows' else 'x-terminal-emulator -e ' if Package in ('torch', 'torchvision', 'torchaudio'): - pynvml.nvmlInit() + try: + pynvml.nvmlInit() + except: + raise Exception("Failed to get NVIDIA GPUs' info.") CudaList = [117, 118, 121] CudaVersion = min(CudaList, key = lambda Cuda: abs(Cuda - pynvml.nvmlSystemGetCudaDriverVersion()//100)) MirrorList = [f'https://download.pytorch.org/whl/cu{CudaVersion}', ''] diff --git a/README.md b/README.md index 14ddc0c..bdce07e 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Functions that are currently included in the toolkit are as follows: - [Voice Recognition](/docs/EN/Voice-Recognizer.md) -- [Voice Transcribing](/docs/EN/Voice-Transcriber.md) +- [Voice Transcription](/docs/EN/Voice-Transcriber.md) - [Dataset Creating (SRT Converting & WAV Splitting)](/docs/EN/Dataset-Creator.md) @@ -152,7 +152,7 @@ Please make sure that you've installed [Python](https://www.python.org/downloads - Install pytorch (Command can be get from the [official site](https://pytorch.org/get-started/locally/)) ```shell - # e.g. (注意自己的cuda版本,这里以11.8为例) + # e.g. (Mind your cuda version,here we take 11.8 as an example) pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118 ``` diff --git a/Run.py b/Run.py index 1c25437..5a42c6d 100644 --- a/Run.py +++ b/Run.py @@ -20,7 +20,7 @@ ############################################################################################################################## # Set current version -CurrentVersion = "v1.1.3" +CurrentVersion = "v1.1.4" ############################################################################################################################## @@ -391,9 +391,8 @@ def Execute(self, Params: tuple): Args = [ f'cd "{ResourceDir}"', 'python -c "' - 'from EVT_Core.Train.VITS.Train import Voice_Training; ' - f"PreprocessandTrain = Voice_Training{str(Params)}; " - 'PreprocessandTrain.Preprocessing_and_Training()"' + 'from EVT_Core.Train.VITS.Train import Train; ' + f'Train{str(Params)}"' ] ) Output, Error = CMD.monitor( @@ -492,9 +491,8 @@ def Execute(self, Params: tuple): Args = [ f'cd "{ResourceDir}"', 'python -c "' - 'from EVT_Core.TTS.VITS.Convert import Voice_Converting; ' - f"TTS = Voice_Converting{str(ItemReplacer(LANGUAGES, Params))}; " - 'TTS.Converting()"' + 'from EVT_Core.TTS.VITS.Convert import Convert; ' + f'Convert{str(ItemReplacer(LANGUAGES, Params))}"' ] ) Output, Error = CMD.monitor( @@ -3514,6 +3512,7 @@ def SetText_LineEdit_DAT_GPTSoVITS_FileListPath(): QMessageBox.Yes: lambda: ( DATResult_Save( ChildWindow_DAT_GPTSoVITS.ui.Table.GetValue(), + LineEdit_DAT_GPTSoVITS_FileListPath.text() ), ChildWindow_DAT_GPTSoVITS.close() ) @@ -5426,10 +5425,11 @@ def SetText_LineEdit_Train_VITS_OutputDir(): self.ui.LineEdit_Train_VITS_ModelPathPretrainedD, self.ui.LineEdit_Train_VITS_OutputRoot, self.ui.LineEdit_Train_VITS_OutputDirName, + 'Config.json', 
self.ui.LineEdit_Train_VITS_LogDir ], EmptyAllowed = [ - DialogBox_KeepOriginalSpeakers.LineEdit, + self.ui.LineEdit_Train_VITS_ConfigPathLoad, self.ui.LineEdit_Train_VITS_ModelPathPretrainedG, self.ui.LineEdit_Train_VITS_ModelPathPretrainedD ],