diff --git a/.gitignore b/.gitignore index 4e6a2e2..02c2ac8 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ **/GPT_SoVITS/GPT_SoVITS/pretrained_models/ **/GPT_SoVITS/GPT_SoVITS/text/ **/GPT_SoVITS/GPT_SoVITS/tools/i18n/locale/ +**/GPT_SoVITS/GPT_SoVITS/inference_webui.py **/GPT_SoVITS/GPT_SoVITS/my_utils.py **/GPT_SoVITS/GPT_SoVITS/onnx_export.py **/GPT_SoVITS/GPT_SoVITS/process_ckpt.py diff --git a/EVT_Core/Dataset/GPT_SoVITS/utils/Creating_Directories.py b/EVT_Core/Dataset/GPT_SoVITS/utils/Creating_Directories.py index 32e0c3f..15716fc 100644 --- a/EVT_Core/Dataset/GPT_SoVITS/utils/Creating_Directories.py +++ b/EVT_Core/Dataset/GPT_SoVITS/utils/Creating_Directories.py @@ -2,26 +2,26 @@ def create_directories( - wav_dir_prepared, wav_dir_split, + csv_dir_prepared, csv_dir_merged, csv_dir_final ): ''' Create csv directory ''' - if not os.path.exists(wav_dir_prepared): - try: - os.makedirs(wav_dir_prepared, exist_ok = True) - except OSError: - print('Creation of directory %s failed' %wav_dir_prepared) - if not os.path.exists(wav_dir_split): try: os.makedirs(wav_dir_split, exist_ok = True) except OSError: print('Creation of directory %s failed' %wav_dir_split) + if not os.path.exists(csv_dir_prepared): + try: + os.makedirs(csv_dir_prepared, exist_ok = True) + except OSError: + print('Creation of directory %s failed' %csv_dir_prepared) + if not os.path.exists(csv_dir_merged): try: os.makedirs(csv_dir_merged, exist_ok = True) diff --git a/EVT_Core/Dataset/VITS/utils/Creating_Directories.py b/EVT_Core/Dataset/VITS/utils/Creating_Directories.py index 32e0c3f..927ebd7 100644 --- a/EVT_Core/Dataset/VITS/utils/Creating_Directories.py +++ b/EVT_Core/Dataset/VITS/utils/Creating_Directories.py @@ -4,6 +4,7 @@ def create_directories( wav_dir_prepared, wav_dir_split, + csv_dir_prepared, csv_dir_merged, csv_dir_final ): @@ -22,6 +23,12 @@ def create_directories( except OSError: print('Creation of directory %s failed' %wav_dir_split) + if not os.path.exists(csv_dir_prepared): + try: + os.makedirs(csv_dir_prepared, exist_ok = True) + except OSError: + print('Creation of directory %s failed' %csv_dir_prepared) + if not os.path.exists(csv_dir_merged): try: os.makedirs(csv_dir_merged, exist_ok = True) diff --git a/EVT_Core/TTS/GPT_SoVITS/Convert.py b/EVT_Core/TTS/GPT_SoVITS/Convert.py index 4da2374..1487763 100644 --- a/EVT_Core/TTS/GPT_SoVITS/Convert.py +++ b/EVT_Core/TTS/GPT_SoVITS/Convert.py @@ -124,7 +124,7 @@ def change_tts_inference( os.environ["infer_ttswebui"]=str(webui_port_infer_tts) os.environ["is_share"]=str(is_share) os.environ['USE_WEBUI']=str(use_webui) - cmd = f'"{python_exec}" "GPT_SoVITS/inference.py"' + cmd = f'"{python_exec}" "GPT_SoVITS/inference_gui.py"' print("TTS推理进程已开启") print(cmd) p_tts_inference = subprocess.Popen(cmd, shell=True) diff --git a/EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference_gui.py b/EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference_gui.py new file mode 100644 index 0000000..2059155 --- /dev/null +++ b/EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference_gui.py @@ -0,0 +1,310 @@ +import os +import sys +from PyQt5.QtCore import QEvent +from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushButton, QTextEdit +from PyQt5.QtWidgets import QGridLayout, QVBoxLayout, QWidget, QFileDialog, QStatusBar, QComboBox +import soundfile as sf + +from tools.i18n.i18n import I18nAuto +i18n = I18nAuto() + +from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav + + +class GPTSoVITSGUI(QMainWindow): + GPT_Path = 
gpt_path + SoVITS_Path = sovits_path + + def __init__(self): + super().__init__() + + self.setWindowTitle('GPT-SoVITS GUI') + self.setGeometry(800, 450, 950, 850) + + self.setStyleSheet(""" + QWidget { + background-color: #a3d3b1; + } + + QTabWidget::pane { + background-color: #a3d3b1; + } + + QTabWidget::tab-bar { + alignment: left; + } + + QTabBar::tab { + background: #8da4bf; + color: #ffffff; + padding: 8px; + } + + QTabBar::tab:selected { + background: #2a3f54; + } + + QLabel { + color: #000000; + } + + QPushButton { + background-color: #4CAF50; + color: white; + padding: 8px; + border: 1px solid #4CAF50; + border-radius: 4px; + } + + QPushButton:hover { + background-color: #45a049; + border: 1px solid #45a049; + box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.1); + } + """) + + license_text = ( + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. " + "如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") + license_label = QLabel(license_text) + license_label.setWordWrap(True) + + self.GPT_model_label = QLabel("选择GPT模型:") + self.GPT_model_input = QLineEdit() + self.GPT_model_input.setPlaceholderText("拖拽或选择文件") + self.GPT_model_input.setText(self.GPT_Path) + self.GPT_model_input.setReadOnly(True) + self.GPT_model_button = QPushButton("选择GPT模型文件") + self.GPT_model_button.clicked.connect(self.select_GPT_model) + + self.SoVITS_model_label = QLabel("选择SoVITS模型:") + self.SoVITS_model_input = QLineEdit() + self.SoVITS_model_input.setPlaceholderText("拖拽或选择文件") + self.SoVITS_model_input.setText(self.SoVITS_Path) + self.SoVITS_model_input.setReadOnly(True) + self.SoVITS_model_button = QPushButton("选择SoVITS模型文件") + self.SoVITS_model_button.clicked.connect(self.select_SoVITS_model) + + self.ref_audio_label = QLabel("上传参考音频:") + self.ref_audio_input = QLineEdit() + self.ref_audio_input.setPlaceholderText("拖拽或选择文件") + self.ref_audio_input.setReadOnly(True) + self.ref_audio_button = QPushButton("选择音频文件") + self.ref_audio_button.clicked.connect(self.select_ref_audio) + + self.ref_text_label = QLabel("参考音频文本:") + self.ref_text_input = QLineEdit() + self.ref_text_input.setPlaceholderText("直接输入文字或上传文本") + self.ref_text_button = QPushButton("上传文本") + self.ref_text_button.clicked.connect(self.upload_ref_text) + + self.ref_language_label = QLabel("参考音频语言:") + self.ref_language_combobox = QComboBox() + self.ref_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"]) + self.ref_language_combobox.setCurrentText("多语种混合") + + self.target_text_label = QLabel("合成目标文本:") + self.target_text_input = QLineEdit() + self.target_text_input.setPlaceholderText("直接输入文字或上传文本") + self.target_text_button = QPushButton("上传文本") + self.target_text_button.clicked.connect(self.upload_target_text) + + self.target_language_label = QLabel("合成音频语言:") + self.target_language_combobox = QComboBox() + self.target_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"]) + self.target_language_combobox.setCurrentText("多语种混合") + + self.output_label = QLabel("输出音频路径:") + self.output_input = QLineEdit() + self.output_input.setPlaceholderText("拖拽或选择文件") + self.output_input.setReadOnly(True) + self.output_button = QPushButton("选择文件夹") + self.output_button.clicked.connect(self.select_output_path) + + self.output_text = QTextEdit() + self.output_text.setReadOnly(True) + + self.add_drag_drop_events([ + self.GPT_model_input, + self.SoVITS_model_input, + self.ref_audio_input, + self.ref_text_input, + self.target_text_input, + self.output_input, + ]) + + self.synthesize_button = QPushButton("合成") + 
self.synthesize_button.clicked.connect(self.synthesize) + + self.clear_output_button = QPushButton("清空输出") + self.clear_output_button.clicked.connect(self.clear_output) + + self.status_bar = QStatusBar() + + main_layout = QVBoxLayout() + + input_layout = QGridLayout(self) + input_layout.setSpacing(10) + + input_layout.addWidget(license_label, 0, 0, 1, 3) + + input_layout.addWidget(self.GPT_model_label, 1, 0) + input_layout.addWidget(self.GPT_model_input, 2, 0, 1, 2) + input_layout.addWidget(self.GPT_model_button, 2, 2) + + input_layout.addWidget(self.SoVITS_model_label, 3, 0) + input_layout.addWidget(self.SoVITS_model_input, 4, 0, 1, 2) + input_layout.addWidget(self.SoVITS_model_button, 4, 2) + + input_layout.addWidget(self.ref_audio_label, 5, 0) + input_layout.addWidget(self.ref_audio_input, 6, 0, 1, 2) + input_layout.addWidget(self.ref_audio_button, 6, 2) + + input_layout.addWidget(self.ref_language_label, 7, 0) + input_layout.addWidget(self.ref_language_combobox, 8, 0, 1, 1) + input_layout.addWidget(self.ref_text_label, 9, 0) + input_layout.addWidget(self.ref_text_input, 10, 0, 1, 2) + input_layout.addWidget(self.ref_text_button, 10, 2) + + input_layout.addWidget(self.target_language_label, 11, 0) + input_layout.addWidget(self.target_language_combobox, 12, 0, 1, 1) + input_layout.addWidget(self.target_text_label, 13, 0) + input_layout.addWidget(self.target_text_input, 14, 0, 1, 2) + input_layout.addWidget(self.target_text_button, 14, 2) + + input_layout.addWidget(self.output_label, 15, 0) + input_layout.addWidget(self.output_input, 16, 0, 1, 2) + input_layout.addWidget(self.output_button, 16, 2) + + main_layout.addLayout(input_layout) + + output_layout = QVBoxLayout() + output_layout.addWidget(self.output_text) + main_layout.addLayout(output_layout) + + main_layout.addWidget(self.synthesize_button) + + main_layout.addWidget(self.clear_output_button) + + main_layout.addWidget(self.status_bar) + + self.central_widget = QWidget() + self.central_widget.setLayout(main_layout) + self.setCentralWidget(self.central_widget) + + def dragEnterEvent(self, event): + if event.mimeData().hasUrls(): + event.acceptProposedAction() + + def dropEvent(self, event): + if event.mimeData().hasUrls(): + file_paths = [url.toLocalFile() for url in event.mimeData().urls()] + if len(file_paths) == 1: + self.update_ref_audio(file_paths[0]) + else: + self.update_ref_audio(", ".join(file_paths)) + + def add_drag_drop_events(self, widgets): + for widget in widgets: + widget.setAcceptDrops(True) + widget.installEventFilter(self) + + def eventFilter(self, obj, event): + if event.type() in (QEvent.DragEnter, QEvent.Drop): + mime_data = event.mimeData() + if mime_data.hasUrls(): + event.acceptProposedAction() + + return super().eventFilter(obj, event) + + def select_GPT_model(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择GPT模型文件", "", "GPT Files (*.ckpt)") + if file_path: + self.GPT_model_input.setText(file_path) + + def select_SoVITS_model(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择SoVITS模型文件", "", "SoVITS Files (*.pth)") + if file_path: + self.SoVITS_model_input.setText(file_path) + + def select_ref_audio(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择参考音频文件", "", "Audio Files (*.wav *.mp3)") + if file_path: + self.update_ref_audio(file_path) + + def upload_ref_text(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") + if file_path: + with open(file_path, 'r', encoding='utf-8') as file: + content = file.read() + 
self.ref_text_input.setText(content) + + def upload_target_text(self): + file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") + if file_path: + with open(file_path, 'r', encoding='utf-8') as file: + content = file.read() + self.target_text_input.setText(content) + + def select_output_path(self): + options = QFileDialog.Options() + options |= QFileDialog.DontUseNativeDialog + options |= QFileDialog.ShowDirsOnly + + folder_dialog = QFileDialog() + folder_dialog.setOptions(options) + folder_dialog.setFileMode(QFileDialog.Directory) + + if folder_dialog.exec_(): + folder_path = folder_dialog.selectedFiles()[0] + self.output_input.setText(folder_path) + + def update_ref_audio(self, file_path): + self.ref_audio_input.setText(file_path) + + def clear_output(self): + self.output_text.clear() + + def synthesize(self): + GPT_model_path = self.GPT_model_input.text() + SoVITS_model_path = self.SoVITS_model_input.text() + ref_audio_path = self.ref_audio_input.text() + language_combobox = self.ref_language_combobox.currentText() + language_combobox = i18n(language_combobox) + ref_text = self.ref_text_input.text() + target_language_combobox = self.target_language_combobox.currentText() + target_language_combobox = i18n(target_language_combobox) + target_text = self.target_text_input.text() + output_path = self.output_input.text() + + if GPT_model_path != self.GPT_Path: + change_gpt_weights(gpt_path=GPT_model_path) + self.GPT_Path = GPT_model_path + if SoVITS_model_path != self.SoVITS_Path: + change_sovits_weights(sovits_path=SoVITS_model_path) + self.SoVITS_Path = SoVITS_model_path + + synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path, + prompt_text=ref_text, + prompt_language=language_combobox, + text=target_text, + text_language=target_language_combobox) + + result_list = list(synthesis_result) + + if result_list: + last_sampling_rate, last_audio_data = result_list[-1] + output_wav_path = os.path.join(output_path, "output.wav") + sf.write(output_wav_path, last_audio_data, last_sampling_rate) + + result = "Audio saved to " + output_wav_path + + self.status_bar.showMessage("合成完成!输出路径:" + output_wav_path, 5000) + self.output_text.append("处理结果:\n" + result) + + +if __name__ == '__main__': + app = QApplication(sys.argv) + mainWin = GPTSoVITSGUI() + mainWin.show() + sys.exit(app.exec_()) \ No newline at end of file diff --git a/EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference.py b/EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference_webui.py similarity index 67% rename from EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference.py rename to EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference_webui.py index 0958a6e..b21b954 100644 --- a/EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference.py +++ b/EVT_Core/TTS/GPT_SoVITS/GPT_SoVITS/inference_webui.py @@ -70,14 +70,6 @@ i18n = I18nAuto() -import sys -from PyQt5.QtCore import QEvent -from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushButton, QTextEdit -from PyQt5.QtWidgets import QGridLayout, QVBoxLayout, QWidget, QFileDialog, QStatusBar, QComboBox -import soundfile as sf - -use_webui = eval(os.environ.get('USE_WEBUI', "True")) - # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。 if torch.cuda.is_available(): @@ -516,7 +508,7 @@ def cut4(inp): return "\n".join(opts) -# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference.py +# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py def cut5(inp): # if not re.search(r'[^\w\s]', inp[-1]): 
# inp += '。' @@ -648,305 +640,7 @@ def get_weights_names(): gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。")) -class GPTSoVITSGUI(QMainWindow): - gpt_path = gpt_path - sovits_path = sovits_path - - def __init__(self): - super().__init__() - - self.setWindowTitle('GPT-SoVITS GUI') - self.setGeometry(800, 450, 950, 850) - - self.setStyleSheet(""" - QWidget { - background-color: #a3d3b1; - } - - QTabWidget::pane { - background-color: #a3d3b1; - } - - QTabWidget::tab-bar { - alignment: left; - } - - QTabBar::tab { - background: #8da4bf; - color: #ffffff; - padding: 8px; - } - - QTabBar::tab:selected { - background: #2a3f54; - } - - QLabel { - color: #000000; - } - - QPushButton { - background-color: #4CAF50; - color: white; - padding: 8px; - border: 1px solid #4CAF50; - border-radius: 4px; - } - - QPushButton:hover { - background-color: #45a049; - border: 1px solid #45a049; - box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.1); - } - """) - - license_text = ( - "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. " - "如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") - license_label = QLabel(license_text) - license_label.setWordWrap(True) - - self.GPT_model_label = QLabel("选择GPT模型:") - self.GPT_model_input = QLineEdit() - self.GPT_model_input.setPlaceholderText("拖拽或选择文件") - self.GPT_model_input.setText(self.gpt_path) - self.GPT_model_input.setReadOnly(True) - self.GPT_model_button = QPushButton("选择GPT模型文件") - self.GPT_model_button.clicked.connect(self.select_GPT_model) - - self.SoVITS_model_label = QLabel("选择SoVITS模型:") - self.SoVITS_model_input = QLineEdit() - self.SoVITS_model_input.setPlaceholderText("拖拽或选择文件") - self.SoVITS_model_input.setText(self.sovits_path) - self.SoVITS_model_input.setReadOnly(True) - self.SoVITS_model_button = QPushButton("选择SoVITS模型文件") - self.SoVITS_model_button.clicked.connect(self.select_SoVITS_model) - - self.ref_audio_label = QLabel("上传参考音频:") - self.ref_audio_input = QLineEdit() - self.ref_audio_input.setPlaceholderText("拖拽或选择文件") - self.ref_audio_input.setReadOnly(True) - self.ref_audio_button = QPushButton("选择音频文件") - self.ref_audio_button.clicked.connect(self.select_ref_audio) - - self.ref_text_label = QLabel("参考音频文本:") - self.ref_text_input = QLineEdit() - self.ref_text_input.setPlaceholderText("直接输入文字或上传文本") - self.ref_text_button = QPushButton("上传文本") - self.ref_text_button.clicked.connect(self.upload_ref_text) - - self.ref_language_label = QLabel("参考音频语言:") - self.ref_language_combobox = QComboBox() - self.ref_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"]) - self.ref_language_combobox.setCurrentText("多语种混合") - - self.target_text_label = QLabel("合成目标文本:") - self.target_text_input = QLineEdit() - self.target_text_input.setPlaceholderText("直接输入文字或上传文本") - self.target_text_button = QPushButton("上传文本") - self.target_text_button.clicked.connect(self.upload_target_text) - - self.target_language_label = QLabel("合成音频语言:") - self.target_language_combobox = QComboBox() - self.target_language_combobox.addItems(["中文", "英文", "日文", "中英混合", "日英混合", "多语种混合"]) - self.target_language_combobox.setCurrentText("多语种混合") - - self.output_label = QLabel("输出音频路径:") - self.output_input = QLineEdit() - self.output_input.setPlaceholderText("拖拽或选择文件") - self.output_input.setReadOnly(True) - self.output_button = QPushButton("选择文件夹") - self.output_button.clicked.connect(self.select_output_path) - - self.output_text = QTextEdit() - self.output_text.setReadOnly(True) - - self.add_drag_drop_events([ - self.GPT_model_input, - self.SoVITS_model_input, - self.ref_audio_input, - 
self.ref_text_input, - self.target_text_input, - self.output_input, - ]) - - self.synthesize_button = QPushButton("合成") - self.synthesize_button.clicked.connect(self.synthesize) - - self.clear_output_button = QPushButton("清空输出") - self.clear_output_button.clicked.connect(self.clear_output) - - self.status_bar = QStatusBar() - - main_layout = QVBoxLayout() - - input_layout = QGridLayout(self) - input_layout.setSpacing(10) - - input_layout.addWidget(license_label, 0, 0, 1, 3) - - input_layout.addWidget(self.GPT_model_label, 1, 0) - input_layout.addWidget(self.GPT_model_input, 2, 0, 1, 2) - input_layout.addWidget(self.GPT_model_button, 2, 2) - - input_layout.addWidget(self.SoVITS_model_label, 3, 0) - input_layout.addWidget(self.SoVITS_model_input, 4, 0, 1, 2) - input_layout.addWidget(self.SoVITS_model_button, 4, 2) - - input_layout.addWidget(self.ref_audio_label, 5, 0) - input_layout.addWidget(self.ref_audio_input, 6, 0, 1, 2) - input_layout.addWidget(self.ref_audio_button, 6, 2) - - input_layout.addWidget(self.ref_language_label, 7, 0) - input_layout.addWidget(self.ref_language_combobox, 8, 0, 1, 1) - input_layout.addWidget(self.ref_text_label, 9, 0) - input_layout.addWidget(self.ref_text_input, 10, 0, 1, 2) - input_layout.addWidget(self.ref_text_button, 10, 2) - - input_layout.addWidget(self.target_language_label, 11, 0) - input_layout.addWidget(self.target_language_combobox, 12, 0, 1, 1) - input_layout.addWidget(self.target_text_label, 13, 0) - input_layout.addWidget(self.target_text_input, 14, 0, 1, 2) - input_layout.addWidget(self.target_text_button, 14, 2) - - input_layout.addWidget(self.output_label, 15, 0) - input_layout.addWidget(self.output_input, 16, 0, 1, 2) - input_layout.addWidget(self.output_button, 16, 2) - - main_layout.addLayout(input_layout) - - output_layout = QVBoxLayout() - output_layout.addWidget(self.output_text) - main_layout.addLayout(output_layout) - - main_layout.addWidget(self.synthesize_button) - - main_layout.addWidget(self.clear_output_button) - - main_layout.addWidget(self.status_bar) - - self.central_widget = QWidget() - self.central_widget.setLayout(main_layout) - self.setCentralWidget(self.central_widget) - - def dragEnterEvent(self, event): - if event.mimeData().hasUrls(): - event.acceptProposedAction() - - def dropEvent(self, event): - if event.mimeData().hasUrls(): - file_paths = [url.toLocalFile() for url in event.mimeData().urls()] - - if len(file_paths) == 1: - self.update_ref_audio(file_paths[0]) - else: - self.update_ref_audio(", ".join(file_paths)) - - def add_drag_drop_events(self, widgets): - for widget in widgets: - widget.setAcceptDrops(True) - widget.installEventFilter(self) - - def eventFilter(self, obj, event): - if event.type() == QEvent.DragEnter: - mime_data = event.mimeData() - if mime_data.hasUrls(): - event.acceptProposedAction() - - elif event.type() == QEvent.Drop: - mime_data = event.mimeData() - if mime_data.hasUrls(): - event.acceptProposedAction() - - return super().eventFilter(obj, event) - - def select_GPT_model(self): - file_path, _ = QFileDialog.getOpenFileName(self, "选择GPT模型文件", "", "GPT Files (*.ckpt)") - if file_path: - self.GPT_model_input.setText(file_path) - - def select_SoVITS_model(self): - file_path, _ = QFileDialog.getOpenFileName(self, "选择SoVITS模型文件", "", "SoVITS Files (*.pth)") - if file_path: - self.SoVITS_model_input.setText(file_path) - - def select_ref_audio(self): - file_path, _ = QFileDialog.getOpenFileName(self, "选择参考音频文件", "", "Audio Files (*.wav *.mp3)") - if file_path: - self.update_ref_audio(file_path) 
- - def upload_ref_text(self): - file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") - if file_path: - with open(file_path, 'r', encoding='utf-8') as file: - content = file.read() - self.ref_text_input.setText(content) - - def upload_target_text(self): - file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)") - if file_path: - with open(file_path, 'r', encoding='utf-8') as file: - content = file.read() - self.target_text_input.setText(content) - - def select_output_path(self): - options = QFileDialog.Options() - options |= QFileDialog.DontUseNativeDialog - options |= QFileDialog.ShowDirsOnly - - folder_dialog = QFileDialog() - folder_dialog.setOptions(options) - folder_dialog.setFileMode(QFileDialog.Directory) - - if folder_dialog.exec_(): - folder_path = folder_dialog.selectedFiles()[0] - self.output_input.setText(folder_path) - - def update_ref_audio(self, file_path): - self.ref_audio_input.setText(file_path) - - def clear_output(self): - self.output_text.clear() - - def synthesize(self): - GPT_model_path = self.GPT_model_input.text() - SoVITS_model_path = self.SoVITS_model_input.text() - ref_audio_path = self.ref_audio_input.text() - language_combobox = self.ref_language_combobox.currentText() - language_combobox = i18n(language_combobox) - ref_text = self.ref_text_input.text() - target_language_combobox = self.target_language_combobox.currentText() - target_language_combobox = i18n(target_language_combobox) - target_text = self.target_text_input.text() - output_path = self.output_input.text() - - if GPT_model_path != self.gpt_path: - change_gpt_weights(gpt_path=GPT_model_path) - self.gpt_path = GPT_model_path - if SoVITS_model_path != self.sovits_path: - change_sovits_weights(sovits_path=SoVITS_model_path) - self.sovits_path = SoVITS_model_path - - synthesis_result = get_tts_wav(ref_wav_path=ref_audio_path, - prompt_text=ref_text, - prompt_language=language_combobox, - text=target_text, - text_language=target_language_combobox) - - result_list = list(synthesis_result) - - if result_list: - last_sampling_rate, last_audio_data = result_list[-1] - output_wav_path = os.path.join(output_path, "output.wav") - sf.write(output_wav_path, last_audio_data, last_sampling_rate) - - result = "Audio saved to " + output_wav_path - - self.status_bar.showMessage("合成完成!输出路径:" + output_wav_path, 5000) - self.output_text.append("处理结果:\n" + result) - - -if use_webui: +if __name__ == '__main__': app.queue(concurrency_count=511, max_size=1022).launch( server_name="0.0.0.0", inbrowser=True, @@ -954,8 +648,3 @@ def synthesize(self): server_port=infer_ttswebui, quiet=True, ) -else: - app = QApplication(sys.argv) - mainWin = GPTSoVITSGUI() - mainWin.show() - sys.exit(app.exec_()) \ No newline at end of file diff --git a/EVT_Core/TTS/VITS/Convert.py b/EVT_Core/TTS/VITS/Convert.py index 6641c44..d9c5f02 100644 --- a/EVT_Core/TTS/VITS/Convert.py +++ b/EVT_Core/TTS/VITS/Convert.py @@ -1,137 +1,48 @@ import os -import re -import langdetect -#import IPython.display as ipd -import torch -#from torch.utils.data import DataLoader +import sys from typing import Optional +from subprocess import Popen from pathlib import Path -from scipy.io.wavfile import write -from datetime import datetime -from .vits.Commons import intersperse -from .vits.Utils import get_hparams_from_file, load_checkpoint -#from .vits.Data_Utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate -from .vits.Models import SynthesizerTrn -from .vits.text import text_to_sequence -from 
.vits.text.symbols import symbols +current_dir = Path(__file__).absolute().parent.as_posix() +os.chdir(current_dir) +sys.path.insert(0, f"{current_dir}/VITS2_finetuning") -if torch.cuda.is_available() is True: - device = 'cuda:0' -else: - device = 'cpu' +python_exec = sys.executable or "python" -def Get_Config_Path(ConfigPath): - if Path(ConfigPath).is_dir(): - ConfigPaths = [File for File in os.listdir(ConfigPath) if Path(File).suffix == '.json'] - ConfigPath = sorted(ConfigPaths, key = lambda ConfigPath: re.sub(r'[A-Za-z]+', '', Path(ConfigPath).name))[-1] - return ConfigPath +p_infer = None -def Get_Model_Path(ModelPath): - if Path(ModelPath).is_dir(): - ModelPaths = [File for File in os.listdir(ModelPath) if Path(File).suffix == '.pth' and 'G_' in File] - ModelPath = sorted(ModelPaths, key = lambda ModelPath: re.sub(r'G_[A-Za-z]+', '', Path(ModelPath).name))[-1] - return ModelPath - - -class Voice_Converting: +def Convert( + Config_Path_Load: str = ..., + Model_Path_Load: str = ..., + Text: str = '请输入语句', + Language: Optional[str] = None, + Speaker: str = ..., + EmotionStrength: float = .667, + PhonemeDuration: float = 0.8, + SpeechRate: float = 1., + Audio_Path_Save: str = "audio.wav" +): ''' Convert text to speech and save as audio files ''' - def __init__(self, - Config_Path_Load: str = ..., - Model_Path_Load: str = ..., - Text: str = '请输入语句', - Language: Optional[str] = None, - Speaker: str = ..., - EmotionStrength: float = .667, - PhonemeDuration: float = 0.8, - SpeechRate: float = 1., - Audio_Path_Save: str = ... - ): - self.Config_Path_Load = Get_Config_Path(Config_Path_Load) - self.Model_Path_Load = Get_Model_Path(Model_Path_Load) - self.Text = Text - self.Language = Language - self.Speaker = Speaker - self.EmotionStrength = EmotionStrength - self.PhonemeDuration = PhonemeDuration - self.SpeechRate = SpeechRate - self.Audio_Path_Save = Audio_Path_Save - - os.remove(Audio_Path_Save) if Path(Audio_Path_Save).exists() else os.makedirs(Path(Audio_Path_Save).parent.__str__(), exist_ok = True) - - def Converting(self): - hps = get_hparams_from_file(self.Config_Path_Load) - - net_g = SynthesizerTrn( - len(symbols), - 80 if 'use_mel_posterior_encoder' in hps.model.keys() and hps.model.use_mel_posterior_encoder == True else hps.data.filter_length // 2 + 1, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - **hps.model).to(device) - _ = net_g.eval() - - _ = load_checkpoint(self.Model_Path_Load, net_g, None) - - def get_text(text, hps): - text_norm = text_to_sequence(text, hps.data.text_cleaners) - if hps.data.add_blank: - text_norm = intersperse(text_norm, 0) - text_norm = torch.LongTensor(text_norm) - return text_norm - - def langdetector(text): # from PolyLangVITS - try: - LangDict = { - 'zh-cn': 'ZH', - 'en': 'EN', - 'ja': 'JA' - } - Lang = LangDict.get(langdetect.detect(text).lower()) - return f'[{Lang}]{text}[{Lang}]' - except Exception as e: - raise Exception("Failed to detect language!") - - stn_tst = get_text( - langdetector(re.sub(r"[\[\]\(\)\{\}]", "", self.Text)) if self.Language is not None else f"[{self.Language}]{self.Text}[{self.Language}]", - hps - ) - - with torch.no_grad(): - x_tst = stn_tst.to(device).unsqueeze(0) - x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device) - speakers = list(hps.speakers.keys()) if hasattr(hps.speakers, 'keys') else hps.speakers - sid = torch.LongTensor([speakers.index(self.Speaker)]).to(device) if self.Speaker is not None else 0 - audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, 
noise_scale=self.EmotionStrength, noise_scale_w=self.PhonemeDuration, length_scale=self.SpeechRate)[0][0,0].data.cpu().float().numpy() - write(os.path.normpath(self.Audio_Path_Save), hps.data.sampling_rate, audio) #ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False)) - - -''' # Voice Conversion -dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data) -collate_fn = TextAudioSpeakerCollate() -loader = DataLoader(dataset, num_workers=8, shuffle=False, - batch_size=1, pin_memory=True, - drop_last=True, collate_fn=collate_fn) -data_list = list(loader) - -with torch.no_grad(): - x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.to(device) for x in data_list[0]] - sid_tgt1 = torch.LongTensor([1]).to(device) - sid_tgt2 = torch.LongTensor([2]).to(device) - sid_tgt3 = torch.LongTensor([4]).to(device) - audio1 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data.cpu().float().numpy() - audio2 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt2)[0][0,0].data.cpu().float().numpy() - audio3 = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt3)[0][0,0].data.cpu().float().numpy() -print("Original SID: %d" % sid_src.item()) -ipd.display(ipd.Audio(y[0].cpu().numpy(), rate=hps.data.sampling_rate, normalize=False)) -print("Converted SID: %d" % sid_tgt1.item()) -ipd.display(ipd.Audio(audio1, rate=hps.data.sampling_rate, normalize=False)) -print("Converted SID: %d" % sid_tgt2.item()) -ipd.display(ipd.Audio(audio2, rate=hps.data.sampling_rate, normalize=False)) -print("Converted SID: %d" % sid_tgt3.item()) -ipd.display(ipd.Audio(audio3, rate=hps.data.sampling_rate, normalize=False)) -''' \ No newline at end of file + global p_infer + if p_infer is None: + os.environ['Config_Path_Load'] = str(Config_Path_Load) + os.environ['Model_Path_Load'] = str(Model_Path_Load) + os.environ['Text'] = str(Text) + os.environ['Language'] = str(Language) + os.environ['Speaker'] = str(Speaker) + os.environ['EmotionStrength'] = str(EmotionStrength) + os.environ['PhonemeDuration'] = str(PhonemeDuration) + os.environ['SpeechRate'] = str(SpeechRate) + os.environ['Audio_Path_Save'] = str(Audio_Path_Save) + print("Start converting...") + p_infer = Popen(f'"{python_exec}" "VITS2_finetuning/inference.py"', shell = True) + p_infer.wait() + p_infer = None + else: + print("已有正在进行的推理任务,需先终止才能开启下一次任务") \ No newline at end of file diff --git a/EVT_Core/Train/VITS/vits/Attentions.py b/EVT_Core/TTS/VITS/VITS2_finetuning/Attentions.py similarity index 99% rename from EVT_Core/Train/VITS/vits/Attentions.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/Attentions.py index 77314d8..4fc2a33 100644 --- a/EVT_Core/Train/VITS/vits/Attentions.py +++ b/EVT_Core/TTS/VITS/VITS2_finetuning/Attentions.py @@ -4,8 +4,8 @@ from torch.nn import functional as F from torch.nn.utils import remove_weight_norm, weight_norm -from .Modules import LayerNorm -from .Commons import subsequent_mask, convert_pad_shape, fused_add_tanh_sigmoid_multiply +from modules import LayerNorm +from commons import subsequent_mask, convert_pad_shape, fused_add_tanh_sigmoid_multiply class MultiHeadAttention(nn.Module): diff --git a/EVT_Core/Train/VITS/vits/Commons.py b/EVT_Core/TTS/VITS/VITS2_finetuning/Commons.py similarity index 100% rename from EVT_Core/Train/VITS/vits/Commons.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/Commons.py diff --git a/EVT_Core/Train/VITS/vits/Models.py b/EVT_Core/TTS/VITS/VITS2_finetuning/Models.py similarity index 93% 
rename from EVT_Core/Train/VITS/vits/Models.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/Models.py index 521e7c3..c207b3c 100644 --- a/EVT_Core/Train/VITS/vits/Models.py +++ b/EVT_Core/TTS/VITS/VITS2_finetuning/Models.py @@ -5,10 +5,10 @@ from torch.nn import Conv1d, ConvTranspose1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from . import Modules -from . import Attentions -from . import Commons -from . import monotonic_align +import modules +import attentions +import commons +import monotonic_align AVAILABLE_FLOW_TYPES = [ @@ -43,25 +43,25 @@ def __init__(self, self.n_flows = n_flows self.gin_channels = gin_channels - self.log_flow = Modules.Log() + self.log_flow = modules.Log() self.flows = nn.ModuleList() - self.flows.append(Modules.ElementwiseAffine(2)) + self.flows.append(modules.ElementwiseAffine(2)) for i in range(n_flows): - self.flows.append(Modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) - self.flows.append(Modules.Flip()) + self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) + self.flows.append(modules.Flip()) self.post_pre = nn.Conv1d(1, filter_channels, 1) self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.post_convs = Modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) + self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) self.post_flows = nn.ModuleList() - self.post_flows.append(Modules.ElementwiseAffine(2)) + self.post_flows.append(modules.ElementwiseAffine(2)) for i in range(4): - self.post_flows.append(Modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) - self.post_flows.append(Modules.Flip()) + self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) + self.post_flows.append(modules.Flip()) self.pre = nn.Conv1d(in_channels, filter_channels, 1) self.proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.convs = Modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) + self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, filter_channels, 1) @@ -131,9 +131,9 @@ def __init__(self, self.drop = nn.Dropout(p_dropout) self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2) - self.norm_1 = Modules.LayerNorm(filter_channels) + self.norm_1 = modules.LayerNorm(filter_channels) self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2) - self.norm_2 = Modules.LayerNorm(filter_channels) + self.norm_2 = modules.LayerNorm(filter_channels) self.proj = nn.Conv1d(filter_channels, 1, 1) if gin_channels != 0: @@ -176,15 +176,15 @@ def __init__(self, self.drop = nn.Dropout(p_dropout) self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2) - # self.norm_1 = Modules.LayerNorm(filter_channels) + # self.norm_1 = modules.LayerNorm(filter_channels) self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) - # self.norm_2 = Modules.LayerNorm(filter_channels) + # self.norm_2 = modules.LayerNorm(filter_channels) self.dur_proj = nn.Conv1d(1, filter_channels, 1) self.pre_out_conv_1 = nn.Conv1d(2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) - self.pre_out_norm_1 = Modules.LayerNorm(filter_channels) + self.pre_out_norm_1 = modules.LayerNorm(filter_channels) self.pre_out_conv_2 = 
nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) - self.pre_out_norm_2 = Modules.LayerNorm(filter_channels) + self.pre_out_norm_2 = modules.LayerNorm(filter_channels) # if gin_channels != 0: # self.cond = nn.Conv1d(gin_channels, in_channels, 1) @@ -246,21 +246,21 @@ def __init__( self.conv_1 = nn.Conv1d( in_channels, filter_channels, kernel_size, padding=kernel_size // 2 ) - self.norm_1 = Modules.LayerNorm(filter_channels) + self.norm_1 = modules.LayerNorm(filter_channels) self.conv_2 = nn.Conv1d( filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 ) - self.norm_2 = Modules.LayerNorm(filter_channels) + self.norm_2 = modules.LayerNorm(filter_channels) self.dur_proj = nn.Conv1d(1, filter_channels, 1) self.pre_out_conv_1 = nn.Conv1d( 2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 ) - self.pre_out_norm_1 = Modules.LayerNorm(filter_channels) + self.pre_out_norm_1 = modules.LayerNorm(filter_channels) self.pre_out_conv_2 = nn.Conv1d( filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 ) - self.pre_out_norm_2 = Modules.LayerNorm(filter_channels) + self.pre_out_norm_2 = modules.LayerNorm(filter_channels) # if gin_channels != 0: # self.cond = nn.Conv1d(gin_channels, in_channels, 1) @@ -329,7 +329,7 @@ def __init__(self, nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) # Transformer Encoder - self.encoder = Attentions.Encoder( + self.encoder = attentions.Encoder( hidden_channels, filter_channels, n_heads, @@ -343,7 +343,7 @@ def __init__(self, def forward(self, x, x_lengths, g=None): x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(Commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.encoder(x * x_mask, x_mask, g=g) stats = self.proj(x) * x_mask @@ -374,7 +374,7 @@ def __init__(self, self.mean_only = mean_only self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.pre_transformer = Attentions.Encoder( + self.pre_transformer = attentions.Encoder( hidden_channels, hidden_channels, n_heads=2, @@ -383,7 +383,7 @@ def __init__(self, p_dropout=p_dropout, # window_size=None, ) - self.enc = Modules.WN( + self.enc = modules.WN( hidden_channels, kernel_size, dilation_rate, @@ -439,7 +439,7 @@ def __init__(self, self.half_channels = channels // 2 self.mean_only = mean_only # vits2 - self.pre_transformer = Attentions.Encoder( + self.pre_transformer = attentions.Encoder( self.half_channels, self.half_channels, n_heads=2, @@ -450,7 +450,7 @@ def __init__(self, ) self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = Modules.WN( + self.enc = modules.WN( hidden_channels, kernel_size, dilation_rate, @@ -459,7 +459,7 @@ def __init__(self, gin_channels=gin_channels, ) # vits2 - self.post_transformer = Attentions.Encoder( + self.post_transformer = attentions.Encoder( self.hidden_channels, self.hidden_channels, n_heads=2, @@ -523,7 +523,7 @@ def __init__(self, self.mean_only = mean_only self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = Attentions.FFT( + self.enc = attentions.FFT( hidden_channels, filter_channels, n_heads, @@ -576,7 +576,7 @@ def __init__(self, self.mean_only = mean_only self.residual_connection = residual_connection # vits2 - self.pre_transformer = Attentions.Encoder( + self.pre_transformer = attentions.Encoder( self.half_channels, self.half_channels, 
n_heads=2, @@ -679,7 +679,7 @@ def __init__(self, mean_only=True ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) elif transformer_flow_type == "pre_conv2": for i in range(n_flows): self.flows.append( @@ -693,7 +693,7 @@ def __init__(self, mean_only=True, ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) elif transformer_flow_type == "fft": for i in range(n_flows): self.flows.append( @@ -707,11 +707,11 @@ def __init__(self, mean_only=True ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) elif transformer_flow_type == "mono_layer_inter_residual": for i in range(n_flows): self.flows.append( - Modules.ResidualCouplingLayer( + modules.ResidualCouplingLayer( channels, hidden_channels, kernel_size, @@ -721,7 +721,7 @@ def __init__(self, mean_only=True ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) self.flows.append( MonoTransformerFlowLayer( channels, hidden_channels, mean_only=True @@ -730,7 +730,7 @@ def __init__(self, elif transformer_flow_type == "mono_layer_post_residual": for i in range(n_flows): self.flows.append( - Modules.ResidualCouplingLayer( + modules.ResidualCouplingLayer( channels, hidden_channels, kernel_size, @@ -740,7 +740,7 @@ def __init__(self, mean_only=True, ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) self.flows.append( MonoTransformerFlowLayer( channels, @@ -752,7 +752,7 @@ def __init__(self, else: for i in range(n_flows): self.flows.append( - Modules.ResidualCouplingLayer( + modules.ResidualCouplingLayer( channels, hidden_channels, kernel_size, @@ -762,7 +762,7 @@ def __init__(self, mean_only=True ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) def forward(self, x, x_mask, g=None, reverse=False): if not reverse: @@ -794,11 +794,11 @@ def __init__(self, self.gin_channels = gin_channels self.pre = nn.Conv1d(in_channels, hidden_channels, 1) - self.enc = Modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) + self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) def forward(self, x, x_lengths, g=None): # x: LinearSpectrum; g: GlobalCondition - x_mask = torch.unsqueeze(Commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) stats = self.proj(x) * x_mask @@ -822,7 +822,7 @@ def __init__(self, self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) - resblock = Modules.ResBlock1 if resblock == '1' else Modules.ResBlock2 + resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 self.ups = nn.ModuleList() for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): @@ -839,7 +839,7 @@ def __init__(self, self.resblocks.append(resblock(ch, k, d)) self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) - self.ups.apply(Commons.init_weights) + self.ups.apply(commons.init_weights) if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) @@ -850,7 +850,7 @@ def forward(self, x, g=None): x = x + self.cond(g) for i in range(self.num_upsamples): - x = F.leaky_relu(x, Modules.LRELU_SLOPE) + x = F.leaky_relu(x, modules.LRELU_SLOPE) x = self.ups[i](x) xs = None for j in 
range(self.num_kernels): @@ -892,7 +892,7 @@ def __init__(self, 32, (kernel_size, 1), (stride, 1), - padding=(Commons.get_padding(kernel_size, 1), 0), + padding=(commons.get_padding(kernel_size, 1), 0), ) ), norm_f( @@ -901,7 +901,7 @@ def __init__(self, 128, (kernel_size, 1), (stride, 1), - padding=(Commons.get_padding(kernel_size, 1), 0), + padding=(commons.get_padding(kernel_size, 1), 0), ) ), norm_f( @@ -910,7 +910,7 @@ def __init__(self, 512, (kernel_size, 1), (stride, 1), - padding=(Commons.get_padding(kernel_size, 1), 0), + padding=(commons.get_padding(kernel_size, 1), 0), ) ), norm_f( @@ -919,7 +919,7 @@ def __init__(self, 1024, (kernel_size, 1), (stride, 1), - padding=(Commons.get_padding(kernel_size, 1), 0), + padding=(commons.get_padding(kernel_size, 1), 0), ) ), norm_f( @@ -928,7 +928,7 @@ def __init__(self, 1024, (kernel_size, 1), 1, - padding=(Commons.get_padding(kernel_size, 1), 0), + padding=(commons.get_padding(kernel_size, 1), 0), ) ), ] @@ -948,7 +948,7 @@ def forward(self, x): for l in self.convs: x = l(x) - x = F.leaky_relu(x, Modules.LRELU_SLOPE) + x = F.leaky_relu(x, modules.LRELU_SLOPE) fmap.append(x) x = self.conv_post(x) fmap.append(x) @@ -978,7 +978,7 @@ def forward(self, x): for l in self.convs: x = l(x) - x = F.leaky_relu(x, Modules.LRELU_SLOPE) + x = F.leaky_relu(x, modules.LRELU_SLOPE) fmap.append(x) x = self.conv_post(x) fmap.append(x) @@ -1163,7 +1163,7 @@ def forward(self, x, x_lengths, y, y_lengths, sid=None): m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) - z_slice, ids_slice = Commons.rand_slice_segments(z, y_lengths, self.segment_size) + z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size) o = self.dec(z_slice, g=g) return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q), (x, logw, logw_) @@ -1183,9 +1183,9 @@ def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_sca w = torch.exp(logw) * x_mask * length_scale w_ceil = torch.ceil(w) y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() - y_mask = torch.unsqueeze(Commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) - attn = Commons.generate_path(w_ceil, attn_mask) + attn = commons.generate_path(w_ceil, attn_mask) m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] diff --git a/EVT_Core/TTS/VITS/vits/Modules.py b/EVT_Core/TTS/VITS/VITS2_finetuning/Modules.py similarity index 99% rename from EVT_Core/TTS/VITS/vits/Modules.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/Modules.py index 62da8a3..7a845a1 100644 --- a/EVT_Core/TTS/VITS/vits/Modules.py +++ b/EVT_Core/TTS/VITS/VITS2_finetuning/Modules.py @@ -5,8 +5,8 @@ from torch.nn import Conv1d from torch.nn.utils import weight_norm, remove_weight_norm -from .Commons import fused_add_tanh_sigmoid_multiply, init_weights, get_padding -from .Transforms import piecewise_rational_quadratic_transform +from commons import fused_add_tanh_sigmoid_multiply, init_weights, get_padding +from transforms import piecewise_rational_quadratic_transform LRELU_SLOPE = 0.1 diff --git a/EVT_Core/TTS/VITS/vits/Transforms.py 
b/EVT_Core/TTS/VITS/VITS2_finetuning/Transforms.py similarity index 100% rename from EVT_Core/TTS/VITS/vits/Transforms.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/Transforms.py diff --git a/EVT_Core/Train/VITS/vits/Utils.py b/EVT_Core/TTS/VITS/VITS2_finetuning/Utils.py similarity index 90% rename from EVT_Core/Train/VITS/vits/Utils.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/Utils.py index a9e9f1c..d9d6485 100644 --- a/EVT_Core/Train/VITS/vits/Utils.py +++ b/EVT_Core/TTS/VITS/VITS2_finetuning/Utils.py @@ -1,4 +1,5 @@ import os +import re import glob import sys import shutil @@ -11,6 +12,7 @@ import matplotlib.pylab as plt import numpy as np import torch +from typing import Optional from pathlib import Path @@ -151,22 +153,6 @@ def load_audiopaths_sid_text(filename, split = "|"): return audiopaths_sid_text -def get_hparams( - Config_Path: str, - Model_Dir: str -): - if not os.path.exists(Model_Dir): - os.makedirs(Model_Dir) - - with open(Config_Path, 'r', encoding = 'utf-8') as f: - data = f.read() - config = json.loads(data) - - hparams = HParams(**config) - hparams.model_dir = Model_Dir - return hparams - - def add_elements( Iterable1, Iterable2 @@ -242,4 +228,34 @@ def __contains__(self, key): return key in self.__dict__ def __repr__(self): - return self.__dict__.__repr__() \ No newline at end of file + return self.__dict__.__repr__() + + +def get_hparams( + Config_Path: str, + Model_Dir: Optional[str] = None +): + with open(Config_Path, 'r', encoding = 'utf-8') as f: + data = f.read() + config = json.loads(data) + hparams = HParams(**config) + + if Model_Dir is not None: + os.makedirs(Model_Dir) if not Path(Model_Dir).exists() else None + hparams.model_dir = Model_Dir + + return hparams + + +def Get_Config_Path(ConfigPath): + if Path(ConfigPath).is_dir(): + ConfigPaths = [File for File in os.listdir(ConfigPath) if Path(File).suffix == '.json'] + ConfigPath = sorted(ConfigPaths, key = lambda ConfigPath: re.sub(r'[A-Za-z]+', '', Path(ConfigPath).name))[-1] + return ConfigPath + + +def Get_Model_Path(ModelPath): + if Path(ModelPath).is_dir(): + ModelPaths = [File for File in os.listdir(ModelPath) if Path(File).suffix == '.pth' and 'G_' in File] + ModelPath = sorted(ModelPaths, key = lambda ModelPath: re.sub(r'G_[A-Za-z]+', '', Path(ModelPath).name))[-1] + return ModelPath \ No newline at end of file diff --git a/EVT_Core/TTS/VITS/vits/__init__.py b/EVT_Core/TTS/VITS/VITS2_finetuning/__init__.py similarity index 100% rename from EVT_Core/TTS/VITS/vits/__init__.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/__init__.py diff --git a/EVT_Core/TTS/VITS/VITS2_finetuning/inference.py b/EVT_Core/TTS/VITS/VITS2_finetuning/inference.py new file mode 100644 index 0000000..766e628 --- /dev/null +++ b/EVT_Core/TTS/VITS/VITS2_finetuning/inference.py @@ -0,0 +1,98 @@ +import os +import re +import argparse +import langdetect +#import IPython.display as ipd +import torch +from typing import Optional +from pathlib import Path +from scipy.io.wavfile import write +from datetime import datetime + +from commons import intersperse +from utils import get_hparams, load_checkpoint, Get_Config_Path, Get_Model_Path +from models import SynthesizerTrn +from text import text_to_sequence +from text.symbols import symbols + + +if torch.cuda.is_available() is True: + device = 'cuda:0' +else: + device = 'cpu' + + +parser = argparse.ArgumentParser() +parser.add_argument("--Config_Path_Load", type = str, default = "...") +parser.add_argument("--Model_Path_Load", type = str, default = "...") 
+parser.add_argument("--Text", type = str, default = "请输入语句") +parser.add_argument("--Language", type = Optional[str], default = None) +parser.add_argument("--Speaker", type = str, default = "...") +parser.add_argument("--EmotionStrength", type = float, default = .667) +parser.add_argument("--PhonemeDuration", type = float, default = 0.8) +parser.add_argument("--SpeechRate", type = float, default = 1.) +parser.add_argument("--Audio_Path_Save", type = str, default = "audio.wav") +args = parser.parse_args() +#logging.info(str(args)) + +Config_Path_Load = Get_Config_Path(os.environ.get('FileList_Path_Training', args.Config_Path_Load)) +Model_Path_Load = Get_Model_Path(os.environ.get('Model_Path_Load', args.Model_Path_Load)) +Text = str(os.environ.get('Text', args.Text)) +Language = str(os.environ.get('Language', args.Language)) +Speaker = str(os.environ.get('Speaker', args.Speaker)) +EmotionStrength = float(os.environ.get('EmotionStrength', args.EmotionStrength)) +PhonemeDuration = float(os.environ.get('PhonemeDuration', args.PhonemeDuration)) +SpeechRate = float(os.environ.get('SpeechRate', args.SpeechRate)) +Audio_Path_Save = str(os.environ.get('Audio_Path_Save', args.Audio_Path_Save)) + +os.remove(Audio_Path_Save) if Path(Audio_Path_Save).exists() else os.makedirs(Path(Audio_Path_Save).parent.__str__(), exist_ok = True) + + +def Convert(): + hps = get_hparams(Config_Path_Load) + + net_g = SynthesizerTrn( + len(symbols), + 80 if 'use_mel_posterior_encoder' in hps.model.keys() and hps.model.use_mel_posterior_encoder == True else hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + **hps.model).to(device) + _ = net_g.eval() + + _ = load_checkpoint(Model_Path_Load, net_g, None) + + def get_text(text, hps): + text_norm = text_to_sequence(text, hps.data.text_cleaners) + if hps.data.add_blank: + text_norm = intersperse(text_norm, 0) + text_norm = torch.LongTensor(text_norm) + return text_norm + + def langdetector(text): # from PolyLangVITS + try: + LangDict = { + 'zh-cn': 'ZH', + 'en': 'EN', + 'ja': 'JA' + } + Lang = LangDict.get(langdetect.detect(text).lower()) + return f'[{Lang}]{text}[{Lang}]' + except Exception as e: + raise Exception("Failed to detect language!") + + stn_tst = get_text( + langdetector(re.sub(r"[\[\]\(\)\{\}]", "", Text)) if Language is not None else f"[{Language}]{Text}[{Language}]", + hps + ) + + with torch.no_grad(): + x_tst = stn_tst.to(device).unsqueeze(0) + x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device) + speakers = list(hps.speakers.keys()) if hasattr(hps.speakers, 'keys') else hps.speakers + sid = torch.LongTensor([speakers.index(Speaker)]).to(device) if Speaker is not None else 0 + audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=EmotionStrength, noise_scale_w=PhonemeDuration, length_scale=SpeechRate)[0][0,0].data.cpu().float().numpy() + write(os.path.normpath(Audio_Path_Save), hps.data.sampling_rate, audio) #ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False)) + + +if __name__ == "__main__": + Convert() \ No newline at end of file diff --git a/EVT_Core/Train/VITS/vits/monotonic_align/Core.py b/EVT_Core/TTS/VITS/VITS2_finetuning/monotonic_align/Core.py similarity index 100% rename from EVT_Core/Train/VITS/vits/monotonic_align/Core.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/monotonic_align/Core.py diff --git a/EVT_Core/Train/VITS/vits/monotonic_align/__init__.py b/EVT_Core/TTS/VITS/VITS2_finetuning/monotonic_align/__init__.py similarity index 94% 
rename from EVT_Core/Train/VITS/vits/monotonic_align/__init__.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/monotonic_align/__init__.py index 3d3d289..81b52ab 100644 --- a/EVT_Core/Train/VITS/vits/monotonic_align/__init__.py +++ b/EVT_Core/TTS/VITS/VITS2_finetuning/monotonic_align/__init__.py @@ -1,7 +1,7 @@ import numpy as np import torch -from .Core import maximum_path_nb +from .core import maximum_path_nb def maximum_path(neg_cent, mask): diff --git a/EVT_Core/Train/VITS/vits/text/LICENSE b/EVT_Core/TTS/VITS/VITS2_finetuning/text/LICENSE similarity index 100% rename from EVT_Core/Train/VITS/vits/text/LICENSE rename to EVT_Core/TTS/VITS/VITS2_finetuning/text/LICENSE diff --git a/EVT_Core/Train/VITS/vits/text/__init__.py b/EVT_Core/TTS/VITS/VITS2_finetuning/text/__init__.py similarity index 100% rename from EVT_Core/Train/VITS/vits/text/__init__.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/text/__init__.py diff --git a/EVT_Core/TTS/VITS/vits/text/cleaners.py b/EVT_Core/TTS/VITS/VITS2_finetuning/text/cleaners.py similarity index 100% rename from EVT_Core/TTS/VITS/vits/text/cleaners.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/text/cleaners.py diff --git a/EVT_Core/TTS/VITS/vits/text/english.py b/EVT_Core/TTS/VITS/VITS2_finetuning/text/english.py similarity index 100% rename from EVT_Core/TTS/VITS/vits/text/english.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/text/english.py diff --git a/EVT_Core/TTS/VITS/vits/text/japanese.py b/EVT_Core/TTS/VITS/VITS2_finetuning/text/japanese.py similarity index 100% rename from EVT_Core/TTS/VITS/vits/text/japanese.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/text/japanese.py diff --git a/EVT_Core/TTS/VITS/vits/text/mandarin.py b/EVT_Core/TTS/VITS/VITS2_finetuning/text/mandarin.py similarity index 100% rename from EVT_Core/TTS/VITS/vits/text/mandarin.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/text/mandarin.py diff --git a/EVT_Core/TTS/VITS/vits/text/symbols.py b/EVT_Core/TTS/VITS/VITS2_finetuning/text/symbols.py similarity index 100% rename from EVT_Core/TTS/VITS/vits/text/symbols.py rename to EVT_Core/TTS/VITS/VITS2_finetuning/text/symbols.py diff --git a/EVT_Core/TTS/VITS/vits/Commons.py b/EVT_Core/TTS/VITS/vits/Commons.py deleted file mode 100644 index ce8d26f..0000000 --- a/EVT_Core/TTS/VITS/vits/Commons.py +++ /dev/null @@ -1,78 +0,0 @@ -import torch -from torch.nn import functional as F - - -def init_weights(m, mean=0.0, std=0.01): - classname = m.__class__.__name__ - if classname.find("Conv") != -1: - m.weight.data.normal_(mean, std) - - -def get_padding(kernel_size, dilation=1): - return int((kernel_size*dilation - dilation)/2) - - -def intersperse(lst, item): - result = [item] * (len(lst) * 2 + 1) - result[1::2] = lst - return result - - -def slice_segments(x, ids_str, segment_size=4): - ret = torch.zeros_like(x[:, :, :segment_size]) - for i in range(x.size(0)): - idx_str = ids_str[i] - idx_end = idx_str + segment_size - ret[i] = x[i, :, idx_str:idx_end] - return ret - - -def rand_slice_segments(x, x_lengths=None, segment_size=4): - b, d, t = x.size() - if x_lengths is None: - x_lengths = t - ids_str_max = x_lengths - segment_size + 1 - ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) - ret = slice_segments(x, ids_str, segment_size) - return ret, ids_str - - -@torch.jit.script -def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.sigmoid(in_act[:, 
n_channels_int:, :]) - acts = t_act * s_act - return acts - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape - - -def sequence_mask(length, max_length=None): - if max_length is None: - max_length = length.max() - x = torch.arange(max_length, dtype=length.dtype, device=length.device) - return x.unsqueeze(0) < length.unsqueeze(1) - - -def generate_path(duration, mask): - """ - duration: [b, 1, t_x] - mask: [b, 1, t_y, t_x] - """ - device = duration.device - - b, _, t_y, t_x = mask.shape - cum_duration = torch.cumsum(duration, -1) - - cum_duration_flat = cum_duration.view(b * t_x) - path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) - path = path.view(b, t_x, t_y) - path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] - path = path.unsqueeze(1).transpose(2,3) * mask - return path \ No newline at end of file diff --git a/EVT_Core/TTS/VITS/vits/Utils.py b/EVT_Core/TTS/VITS/vits/Utils.py deleted file mode 100644 index 4a6a17c..0000000 --- a/EVT_Core/TTS/VITS/vits/Utils.py +++ /dev/null @@ -1,78 +0,0 @@ -import os -import sys -import logging -logging.basicConfig(stream = sys.stdout, level = logging.DEBUG) -logger = logging -import json -import torch - - -MATPLOTLIB_FLAG = False - - -def load_checkpoint(checkpoint_path, model, optimizer=None): - assert os.path.isfile(checkpoint_path) - checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') - iteration = checkpoint_dict['iteration'] - learning_rate = checkpoint_dict['learning_rate'] - if optimizer is not None: - optimizer.load_state_dict(checkpoint_dict['optimizer']) - saved_state_dict = checkpoint_dict['model'] - if hasattr(model, 'module'): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - new_state_dict= {} - for k, v in state_dict.items(): - try: - new_state_dict[k] = saved_state_dict[k] - except: - logger.info("%s is not in the checkpoint" % k) - new_state_dict[k] = v - if hasattr(model, 'module'): - model.module.load_state_dict(new_state_dict) - else: - model.load_state_dict(new_state_dict) - logger.info(f"Loaded checkpoint '{checkpoint_path}' (iteration {iteration})") - return model, optimizer, learning_rate, iteration - - -def get_hparams_from_file(config_path): - with open(config_path, 'r', encoding = 'utf-8') as f: - data = f.read() - config = json.loads(data) - - hparams = HParams(**config) - return hparams - - -class HParams(): - def __init__(self, **kwargs): - for k, v in kwargs.items(): - if type(v) == dict: - v = HParams(**v) - self[k] = v - - def keys(self): - return self.__dict__.keys() - - def items(self): - return self.__dict__.items() - - def values(self): - return self.__dict__.values() - - def __len__(self): - return len(self.__dict__) - - def __getitem__(self, key): - return getattr(self, key) - - def __setitem__(self, key, value): - return setattr(self, key, value) - - def __contains__(self, key): - return key in self.__dict__ - - def __repr__(self): - return self.__dict__.__repr__() \ No newline at end of file diff --git a/EVT_Core/Train/VITS/Train.py b/EVT_Core/Train/VITS/Train.py index e13517f..843d8a2 100644 --- a/EVT_Core/Train/VITS/Train.py +++ b/EVT_Core/Train/VITS/Train.py @@ -1,718 +1,72 @@ -from typing import Optional -from pathlib import Path -from datetime import datetime import os import sys -import re -import json -import platform -import logging -logging.basicConfig(stream = sys.stdout, encoding = 'utf-8') 
-logging.getLogger('numba').setLevel(logging.WARNING) -import torchaudio -import torch -from torch.nn import functional as F -from torch.utils.data import DataLoader -from torch.utils.tensorboard import SummaryWriter -import torch.multiprocessing as mp -import torch.distributed as dist -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.cuda.amp import autocast, GradScaler -torch.backends.cudnn.benchmark = True -from concurrent.futures import ThreadPoolExecutor -from tqdm import tqdm - -from .vits.Data_Utils import ( - TextAudioSpeakerLoader, - TextAudioSpeakerCollate, - DistributedBucketSampler -) -from .vits.Models import ( - AVAILABLE_FLOW_TYPES, - AVAILABLE_DURATION_DISCRIMINATOR_TYPES, - SynthesizerTrn, - MultiPeriodDiscriminator, - DurationDiscriminatorV1, - DurationDiscriminatorV2 -) -from .vits.Mel_Processing import ( - mel_spectrogram_torch, - spec_to_mel_torch -) -from .vits.Commons import ( - slice_segments, - clip_grad_value_ -) -from .vits.Losses import ( - generator_loss, - discriminator_loss, - feature_loss, - kl_loss -) -from .vits.Utils import ( - load_audiopaths_sid_text, - plot_spectrogram_to_numpy, - summarize, - plot_alignment_to_numpy, - save_checkpoint, - get_logger, - add_elements, - #check_git_hash, - load_checkpoint, - remove_old_checkpoints, - latest_checkpoint_path, - get_hparams -) -from .vits.text import ( - _clean_text, - #symbols -) -from .vits.text.symbols import symbols - - -global_step = 0 - - -class Preprocessing: - ''' - Preprocess - ''' - def __init__(self, - FileList_Path_Training: str, - FileList_Path_Validation: str, - Config_Dir_Save: str = './', - Set_Eval_Interval: int = 1000, - Set_Epochs: int = 10000, - Set_Batch_Size: int = 16, - Set_FP16_Run: bool = True, - Keep_Original_Speakers: bool = False, - Config_Path_Load: Optional[str] = None - ): - self.FileList_Path_Training = FileList_Path_Training - self.FileList_Path_Validation = FileList_Path_Validation - self.Config_Dir_Save = Config_Dir_Save - self.Set_Eval_Interval = Set_Eval_Interval - self.Set_Epochs = Set_Epochs - self.Set_Batch_Size = Set_Batch_Size - self.Set_FP16_Run = Set_FP16_Run - self.Keep_Original_Speakers = Keep_Original_Speakers - self.Config_Path_Load = Config_Path_Load if Keep_Original_Speakers else None - - os.makedirs(self.Config_Dir_Save, exist_ok = True) - self.Config_Path_Edited = Path(Config_Dir_Save).joinpath(f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json").__str__() - self.FileList_Path_Training_Updated = Path(self.Config_Path_Edited).parent.joinpath(Path(self.FileList_Path_Training).name).__str__() - self.FileList_Path_Validation_Updated = Path(self.Config_Path_Edited).parent.joinpath(Path(self.FileList_Path_Validation).name).__str__() - self.Out_Extension = "cleaned" - - def Configurator(self): - ''' - Edit JSON file - ''' - def Get_Languages(Text_Path_Training, Text_Path_Validation): - Languages = [] - for Text_Path in [Text_Path_Training, Text_Path_Validation]: - with open(file = Text_Path, mode = 'r', encoding = 'utf-8') as File: - Lines = File.readlines() - for _, Line in enumerate(Lines): - Line_Text = Line.split('|', maxsplit = 2)[2] - Language = re.split(r'[\[\]]', Line_Text)[1] - Languages.append(Language) if Language not in Languages else None - if set(Languages).issubset({'ZH', 'EN', 'JA'}): - if set(Languages) == {'ZH'}: - return "mandarin" - else: - return "mandarin_english_japanese" - else: - raise Exception('Unsupported language!') - - def Get_NewSpeakers(Text_Path_Training, Text_Path_Validation): - Speakers = [] - for 
Text_Path in [Text_Path_Training, Text_Path_Validation]: - with open(file = Text_Path, mode = 'r', encoding = 'utf-8') as File: - Lines = File.readlines() - for _, Line in enumerate(Lines): - Speaker = Line.split('|', maxsplit = 2)[1] - Speakers.append(Speaker) if Speaker not in Speakers else None - return Speakers - - def Get_OldSpeakers(Config_Path_Load): - if Config_Path_Load is not None and Path(Config_Path_Load).exists(): - with open(file = Config_Path_Load, mode = 'rb') as ConfigFile_Extra: - OldSpeakers = json.load(ConfigFile_Extra)["speakers"] - else: - OldSpeakers = [] - return OldSpeakers - - Language = Get_Languages(self.FileList_Path_Training, self.FileList_Path_Validation) - NewSpeakers = Get_NewSpeakers(self.FileList_Path_Training, self.FileList_Path_Validation) - OldSpeakers = Get_OldSpeakers(self.Config_Path_Load) if self.Keep_Original_Speakers else [] - - with open(file = Path(__file__).parent.joinpath('./configs', f'{Language}_base.json').__str__(), mode = 'rb') as ConfigFile_Default: - Params = json.load(ConfigFile_Default) - try: - Params_Old = Params - Params_Old["train"]["eval_interval"] = self.Set_Eval_Interval - Params_Old["train"]["epochs"] = self.Set_Epochs - Params_Old["train"]["batch_size"] = self.Set_Batch_Size - Params_Old["train"]["fp16_run"] = self.Set_FP16_Run - Params_Old["data"]["training_files"] = f'{self.FileList_Path_Training_Updated}.{self.Out_Extension}' - Params_Old["data"]["validation_files"] = f'{self.FileList_Path_Validation_Updated}.{self.Out_Extension}' - Params_Old["data"]["text_cleaners"] = [(Language + "_cleaners").lower()] - Params_Old["data"]["n_speakers"] = add_elements(OldSpeakers, NewSpeakers).__len__() - Params_Old["speakers"] = add_elements(OldSpeakers, NewSpeakers) - Params_New = Params_Old - except: - raise Exception("Please check if params exist") - with open(self.Config_Path_Edited, 'w', encoding = 'utf-8') as File_New: - json.dump(Params_New, File_New, indent = 4) - print(f"Config created in {self.Config_Dir_Save}") - - def Cleaner(self): - ''' - Convert natural language text to symbols - ''' - def Update_SID(Config_Path, Text_Path, Save_Path): - with open(file = Config_Path, mode = 'rb') as ConfigFile: - NewSpeakers = json.load(ConfigFile)["speakers"] - with open(file = Text_Path, mode = 'r', encoding = 'utf-8') as TextFile: - Lines = TextFile.readlines() - for Index, Line in enumerate(Lines): - Line_Path = Line.split('|', maxsplit = 1)[0] - Line_Path = Path(Text_Path).parent.joinpath(Line_Path).as_posix() if not Path(Line_Path).is_absolute() else Line_Path - Speaker = Line.split('|', maxsplit = 2)[1] - SpeakerID = NewSpeakers.index(Speaker) - Line_Text = Line.split('|', maxsplit = 2)[2] - Line = f"{Line_Path}|{SpeakerID}|{Line_Text}" - Lines[Index] = Line - with open(file = Save_Path, mode = 'w', encoding = 'utf-8') as TextFile: - TextFile.writelines(Lines) - - def Get_Cleaners(Config_Path): - with open(file = Config_Path, mode = 'rb') as ConfigFile: - NewCleaners = json.load(ConfigFile)["data"]["text_cleaners"] - return NewCleaners - - for Index, FileList in enumerate([self.FileList_Path_Training, self.FileList_Path_Validation]): - print("START:", FileList) - FileList_Updated = [self.FileList_Path_Training_Updated, self.FileList_Path_Validation_Updated][Index] - Update_SID(self.Config_Path_Edited, FileList, FileList_Updated) - Path_SID_Text = load_audiopaths_sid_text(FileList_Updated) - for i in range(len(Path_SID_Text)): - Path_SID_Text[i][2] = _clean_text(Path_SID_Text[i][2], Get_Cleaners(self.Config_Path_Edited)) - 
Filelist_Cleaned = FileList_Updated + "." + self.Out_Extension - with open(Filelist_Cleaned, 'w', encoding = 'utf-8') as f: - f.writelines(["|".join(x) + "\n" for x in Path_SID_Text]) - - def Resampler(self): - ''' - Resample dataset audio to fit the sampling rate setting in config - ''' - def Get_Resample_List(Config_Path, Text_Path): - ResampleList = [] - with open(file = Config_Path, mode = 'rb') as ConfigFile: - SampleRate_New = json.load(ConfigFile)['data']['sampling_rate'] - with open(file = Text_Path, mode = 'r', encoding = 'utf-8') as TextFile: - Lines = TextFile.readlines() - for Line in Lines: - Line_Path = Line.split('|', maxsplit = 1)[0] - ResampleList.append((Line_Path, SampleRate_New)) - return ResampleList - - def Resample(Audio_Path, SampleRate_New): - AudioData_Old, SampleRate_Old = torchaudio.load(Audio_Path) - AudioData_New = torchaudio.transforms.Resample(orig_freq = SampleRate_Old, new_freq = SampleRate_New)(AudioData_Old) - torchaudio.save(Audio_Path, src = AudioData_New, sample_rate = SampleRate_New) - - for FileList in (self.FileList_Path_Validation, self.FileList_Path_Training): - print("Resampling audio according to", FileList) - with ThreadPoolExecutor(max_workers = os.cpu_count()) as Executor: - Executor.map( - Resample, - *zip(*Get_Resample_List(self.Config_Path_Edited, FileList)) - ) - - -class Training: - ''' - Train - ''' - def __init__(self, - Num_Workers: int = 4, - Model_Path_Pretrained_G: Optional[str] = None, - Model_Path_Pretrained_D: Optional[str] = None, - Keep_Original_Speakers: bool = False, - Log_Dir: str = "./" - ): - self.Num_Workers = Num_Workers - self.Model_Path_Pretrained_G = Model_Path_Pretrained_G - self.Model_Path_Pretrained_D = Model_Path_Pretrained_D - self.Keep_Original_Speakers = Keep_Original_Speakers - self.Log_Dir = Log_Dir - - self.UsePretrainedModel = False if None in (self.Model_Path_Pretrained_G, self.Model_Path_Pretrained_D) else True - - def evaluate(self, - hps, generator, eval_loader, writer_eval - ): - generator.eval() - with torch.no_grad(): - for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(eval_loader): - x, x_lengths = x.cuda(0), x_lengths.cuda(0) - spec, spec_lengths = spec.cuda(0), spec_lengths.cuda(0) - y, y_lengths = y.cuda(0), y_lengths.cuda(0) - speakers = speakers.cuda(0) - - # remove else - x = x[:1] - x_lengths = x_lengths[:1] - spec = spec[:1] - spec_lengths = spec_lengths[:1] - y = y[:1] - y_lengths = y_lengths[:1] - speakers = speakers[:1] - break - y_hat, attn, mask, *_ = generator.module.infer(x, x_lengths, speakers, max_len=1000) - y_hat_lengths = mask.sum([1, 2]).long() * hps.data.hop_length - - mel = spec_to_mel_torch( - spec, - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.mel_fmin, - hps.data.mel_fmax - ) if not (hps.model.use_mel_posterior_encoder or hps.data.use_mel_posterior_encoder) else spec - y_hat_mel = mel_spectrogram_torch( - y_hat.squeeze(1).float(), - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.hop_length, - hps.data.win_length, - hps.data.mel_fmin, - hps.data.mel_fmax - ) - image_dict = {"gen/mel": plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy())} - audio_dict = {"gen/audio": y_hat[0, :, :y_hat_lengths[0]]} - if global_step == 0: - image_dict.update({"gt/mel": plot_spectrogram_to_numpy(mel[0].cpu().numpy())}) - audio_dict.update({"gt/audio": y[0, :, :y_lengths[0]]}) - - summarize( - writer=writer_eval, - global_step=global_step, - images=image_dict, - 
audios=audio_dict, - audio_sampling_rate=hps.data.sampling_rate - ) - generator.train() - - def train_and_evaluate(self, - rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers - ): - net_g, net_d, net_dur_disc = nets - optim_g, optim_d, optim_dur_disc = optims - scheduler_g, scheduler_d, scheduler_dur_disc = schedulers - train_loader, eval_loader = loaders - if writers is not None: - writer, writer_eval = writers - - train_loader.batch_sampler.set_epoch(epoch) - global global_step - - net_g.train() - net_d.train() - net_dur_disc.train() if net_dur_disc is not None else None - - if rank == 0: - loader = tqdm(train_loader, desc='Loading train data') - else: - loader = train_loader - - for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(loader): - if net_g.module.use_noise_scaled_mas: - current_mas_noise_scale = net_g.module.mas_noise_scale_initial - net_g.module.noise_scale_delta * global_step - net_g.module.current_mas_noise_scale = max(current_mas_noise_scale, 0.0) - x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True) - spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True) - y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True) - speakers = speakers.cuda(rank, non_blocking=True) - - with autocast(enabled=hps.train.fp16_run): - y_hat, l_length, attn, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q), (hidden_x, logw, logw_) = net_g(x, x_lengths, spec, spec_lengths, speakers) - - mel = spec_to_mel_torch( - spec.float(), - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.mel_fmin, - hps.data.mel_fmax - ) if not (hps.model.use_mel_posterior_encoder or hps.data.use_mel_posterior_encoder) else spec - y_mel = slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) - y_hat_mel = mel_spectrogram_torch( - y_hat.squeeze(1), - hps.data.filter_length, - hps.data.n_mel_channels, - hps.data.sampling_rate, - hps.data.hop_length, - hps.data.win_length, - hps.data.mel_fmin, - hps.data.mel_fmax - ) - y = slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice - - # Discriminator - y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) - with autocast(enabled=False): - loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) - loss_disc_all = loss_disc - - # Duration Discriminator - if net_dur_disc is not None: - y_dur_hat_r, y_dur_hat_g = net_dur_disc(hidden_x.detach(), x_mask.detach(), logw_.detach(), logw.detach()) - with autocast(enabled=False): - # TODO: I think need to mean using the mask, but for now, just mean all - loss_dur_disc, losses_dur_disc_r, losses_dur_disc_g = discriminator_loss(y_dur_hat_r, y_dur_hat_g) - loss_dur_disc_all = loss_dur_disc - optim_dur_disc.zero_grad() - scaler.scale(loss_dur_disc_all).backward() - scaler.unscale_(optim_dur_disc) - grad_norm_dur_disc = clip_grad_value_(net_dur_disc.parameters(), None) - scaler.step(optim_dur_disc) - - optim_d.zero_grad() - scaler.scale(loss_disc_all).backward() - scaler.unscale_(optim_d) - grad_norm_d = clip_grad_value_(net_d.parameters(), None) - scaler.step(optim_d) - - with autocast(enabled=hps.train.fp16_run): - # Generator - y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) - if net_dur_disc is not None: - y_dur_hat_r, y_dur_hat_g = net_dur_disc(hidden_x, x_mask, logw_, logw) - with autocast(enabled=False): - loss_dur = 
torch.sum(l_length.float()) - loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel - loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl - - loss_fm = feature_loss(fmap_r, fmap_g) - loss_gen, losses_gen = generator_loss(y_d_hat_g) - loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl - if net_dur_disc is not None: - loss_dur_gen, losses_dur_gen = generator_loss(y_dur_hat_g) - loss_gen_all += loss_dur_gen - - optim_g.zero_grad() - scaler.scale(loss_gen_all).backward() - scaler.unscale_(optim_g) - grad_norm_g = clip_grad_value_(net_g.parameters(), None) - scaler.step(optim_g) - scaler.update() - - if rank == 0: - if global_step % hps.train.log_interval == 0: - lr = optim_g.param_groups[0]['lr'] - losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl] - logger.info('Train Epoch: {} [{:.0f}%]'.format(epoch, 100. * batch_idx / len(train_loader))) - logger.info([x.item() for x in losses] + [global_step, lr]) - - scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g} - scalar_dict.update({"loss/dur_disc/total": loss_dur_disc_all, "grad_norm_dur_disc": grad_norm_dur_disc}) if net_dur_disc is not None else None - scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl}) - - scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) - scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) - scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) - - # if net_dur_disc is not None: - # scalar_dict.update({"loss/dur_disc_r" : f"{losses_dur_disc_r}"}) - # scalar_dict.update({"loss/dur_disc_g" : f"{losses_dur_disc_g}"}) - # scalar_dict.update({"loss/dur_gen" : f"{loss_dur_gen}"}) - - image_dict = { - "slice/mel_org": plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), - "slice/mel_gen": plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), - "all/mel": plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), - "all/attn": plot_alignment_to_numpy(attn[0,0].data.cpu().numpy()) - } - summarize( - writer=writer, - global_step=global_step, - images=image_dict, - scalars=scalar_dict) - - if global_step % hps.train.eval_interval == 0: - self.evaluate(hps, net_g, eval_loader, writer_eval) - save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, Path(hps.model_dir).joinpath("G_{}.pth".format(global_step)).__str__()) - save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, Path(hps.model_dir).joinpath("D_{}.pth".format(global_step)).__str__()) - save_checkpoint(net_dur_disc, optim_dur_disc, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "DUR_{}.pth".format(global_step))) if net_dur_disc is not None else None - - remove_old_checkpoints(hps.model_dir, prefixes=["G_*.pth", "D_*.pth", "DUR_*.pth"]) - global_step += 1 - - if rank == 0: - logger.info('====> Epoch: {}'.format(epoch)) - - def run(self, rank, n_gpus, hps): - global global_step - net_dur_disc = None - if rank == 0: - logger = get_logger(hps.model_dir) - #logger.info(hps) - #check_git_hash(hps.model_dir) - writer = SummaryWriter(log_dir = self.Log_Dir) - writer_eval = SummaryWriter(log_dir = Path(self.Log_Dir).joinpath("eval").__str__()) - - dist.init_process_group( - backend = 'gloo' if platform.system() == 'Windows' else 'nccl', # Windows不支持NCCL backend,故使用GLOO - init_method = 'env://', - world_size = n_gpus, - rank = rank - ) - - 
torch.manual_seed(hps.train.seed) - torch.cuda.set_device(rank) - - if "use_mel_posterior_encoder" in hps.model.keys() and hps.model.use_mel_posterior_encoder == True: - print("Using mel posterior encoder for VITS2") - posterior_channels = 80 # vits2 - hps.data.use_mel_posterior_encoder = True - else: - print("Using lin posterior encoder for VITS1") - posterior_channels = hps.data.filter_length // 2 + 1 - hps.data.use_mel_posterior_encoder = False - - train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data) - train_sampler = DistributedBucketSampler( - train_dataset, - hps.train.batch_size, - [32,300,400,500,600,700,800,900,1000], - num_replicas=n_gpus, - rank=rank, - shuffle=True - ) - collate_fn = TextAudioSpeakerCollate() - train_loader = DataLoader( - train_dataset, - num_workers=self.Num_Workers, - shuffle=False, - pin_memory=True, - collate_fn=collate_fn, - batch_sampler=train_sampler - ) - if rank == 0: - eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data) - eval_loader = DataLoader( - eval_dataset, - num_workers=0, - shuffle=False, - batch_size=hps.train.batch_size, - pin_memory=True, - drop_last=False, - collate_fn=collate_fn - ) - - # some of these flags are not being used in the code and directly set in hps json file. - # they are kept here for reference and prototyping. - if "use_transformer_flows" in hps.model.keys() and hps.model.use_transformer_flows == True: - use_transformer_flows = True - transformer_flow_type = hps.model.transformer_flow_type - print(f"Using transformer flows {transformer_flow_type} for VITS2") - assert transformer_flow_type in AVAILABLE_FLOW_TYPES, f"transformer_flow_type must be one of {AVAILABLE_FLOW_TYPES}" - else: - print("Using normal flows for VITS1") - use_transformer_flows = False - - if "use_spk_conditioned_encoder" in hps.model.keys() and hps.model.use_spk_conditioned_encoder == True: - if hps.data.n_speakers == 0: - raise ValueError("n_speakers must be > 0 when using spk conditioned encoder to train multi-speaker model") - use_spk_conditioned_encoder = True - else: - print("Using normal encoder for VITS1") - use_spk_conditioned_encoder = False - - if "use_noise_scaled_mas" in hps.model.keys() and hps.model.use_noise_scaled_mas == True: - print("Using noise scaled MAS for VITS2") - use_noise_scaled_mas = True - mas_noise_scale_initial = 0.01 - noise_scale_delta = 2e-6 - else: - print("Using normal MAS for VITS1") - use_noise_scaled_mas = False - mas_noise_scale_initial = 0.0 - noise_scale_delta = 0.0 - - # Initialize VITS models and move to GPU - net_g = SynthesizerTrn( - len(symbols), - posterior_channels, - hps.train.segment_size // hps.data.hop_length, - n_speakers=hps.data.n_speakers, - mas_noise_scale_initial=mas_noise_scale_initial, - noise_scale_delta=noise_scale_delta, - **hps.model - ).cuda(rank) - net_d = MultiPeriodDiscriminator( - hps.model.use_spectral_norm - ).cuda(rank) - if "use_duration_discriminator" in hps.model.keys() and hps.model.use_duration_discriminator == True: - use_duration_discriminator = True - - # add duration discriminator type here - duration_discriminator_type = getattr(hps.model, "duration_discriminator_type", "dur_disc_1") - print(f"Using duration_discriminator {duration_discriminator_type} for VITS2") - assert duration_discriminator_type in AVAILABLE_DURATION_DISCRIMINATOR_TYPES, f"duration_discriminator_type must be one of {AVAILABLE_DURATION_DISCRIMINATOR_TYPES}" - duration_discriminator_type = AVAILABLE_DURATION_DISCRIMINATOR_TYPES - if duration_discriminator_type 
== "dur_disc_1": - net_dur_disc = DurationDiscriminatorV1( - hps.model.hidden_channels, - hps.model.hidden_channels, - 3, - 0.1, - gin_channels=hps.model.gin_channels if hps.data.n_speakers != 0 else 0, - ).cuda(rank) - elif duration_discriminator_type == "dur_disc_2": - net_dur_disc = DurationDiscriminatorV2( - hps.model.hidden_channels, - hps.model.hidden_channels, - 3, - 0.1, - gin_channels=hps.model.gin_channels if hps.data.n_speakers != 0 else 0, - ).cuda(rank) - else: - print("NOT using any duration discriminator like VITS1") - net_dur_disc = None - use_duration_discriminator = False - - # Build optimizers for the initialized VITS models - optim_g = torch.optim.AdamW( - filter(lambda net_g_params: net_g_params.requires_grad, net_g.parameters()), # Filter out params which don't require gradient - hps.train.learning_rate, - betas=hps.train.betas, - eps=hps.train.eps - ) - optim_d = torch.optim.AdamW( - net_d.parameters(), - hps.train.learning_rate, - betas=hps.train.betas, - eps=hps.train.eps - ) - optim_dur_disc = torch.optim.AdamW( - net_dur_disc.parameters(), - hps.train.learning_rate, - betas=hps.train.betas, - eps=hps.train.eps - ) if net_dur_disc is not None else None - - # Build DDP models for the initialized VITS models - net_g = DDP(net_g, device_ids = [rank], find_unused_parameters = True) - net_d = DDP(net_d, device_ids = [rank], find_unused_parameters = False) - net_dur_disc = DDP(net_dur_disc, device_ids=[rank]) if net_dur_disc is not None else None - - # Load state dict from checkpoint for the initialized VITS models and get the optimizer, learning rate and iteration - try: - _, optim_g, lr_g, epoch_str = load_checkpoint( - self.Model_Path_Pretrained_G if self.UsePretrainedModel else latest_checkpoint_path(hps.model_dir, "G_*.pth"), - net_g, - optim_g, - self.Keep_Original_Speakers if self.UsePretrainedModel else True - ) - _, optim_d, lr_d, epoch_str = load_checkpoint( - self.Model_Path_Pretrained_D if self.UsePretrainedModel else latest_checkpoint_path(hps.model_dir, "D_*.pth"), - net_d, - optim_d, - self.Keep_Original_Speakers if self.UsePretrainedModel else True - ) - _, _, _, epoch_str = load_checkpoint( - latest_checkpoint_path(hps.model_dir, "DUR_*.pth"), - net_dur_disc, - optim_dur_disc - ) if net_dur_disc is not None else (_, _, _, epoch_str) - - # To prevent KeyError: "param 'initial_lr' is not specified in param_groups[0] when resuming an optimizer" - if optim_g.param_groups[0].get('initial_lr') is None: - optim_g.param_groups[0]['initial_lr'] = lr_g - if optim_d.param_groups[0].get('initial_lr') is None: - optim_d.param_groups[0]['initial_lr'] = lr_d +from typing import Optional +from subprocess import Popen +from pathlib import Path - global_step = (epoch_str - 1) * len(train_loader) # > 0 - print(f"Continue from step {global_step}") - except Exception as e: - epoch_str = 1 - global_step = 0 - print(f"Got Exception: {e}. 
Start from step 0") +current_dir = Path(__file__).absolute().parent.as_posix() +os.chdir(current_dir) +sys.path.insert(0, f"{current_dir}/VITS2_finetuning") - # Build learning rate schedulers for optimizers - scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma = hps.train.lr_decay, last_epoch = epoch_str - 2) - scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma = hps.train.lr_decay, last_epoch = epoch_str - 2) - scheduler_dur_disc = torch.optim.lr_scheduler.ExponentialLR(optim_dur_disc, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) if net_dur_disc is not None else None - # Build gradient scaler - scaler = GradScaler(enabled = hps.train.fp16_run) +python_exec = sys.executable or "python" - # Start training (and evaluating) - for epoch in range(epoch_str, hps.train.epochs + 1): - if rank == 0: - self.train_and_evaluate( - rank, epoch, hps, [net_g, net_d, net_dur_disc], [optim_g, optim_d, optim_dur_disc], [scheduler_g, scheduler_d, scheduler_dur_disc], scaler, - [train_loader, eval_loader], logger, [writer, writer_eval] - ) - else: - self.train_and_evaluate( - rank, epoch, hps, [net_g, net_d, net_dur_disc], [optim_g, optim_d, optim_dur_disc], [scheduler_g, scheduler_d, scheduler_dur_disc], scaler, - [train_loader, None], None, None - ) - scheduler_g.step() - scheduler_d.step() - scheduler_dur_disc.step() if net_dur_disc is not None else None +p_preprocess = None +p_train = None -class Voice_Training(Preprocessing, Training): +def Train( + FileList_Path_Training: str = 'train.txt', + FileList_Path_Validation: str = 'val.txt', + Set_Epochs: int = 10000, + Set_Eval_Interval: int = 1000, + Set_Batch_Size: int = 16, + Set_FP16_Run: bool = True, + Keep_Original_Speakers: bool = False, + Config_Path_Load: Optional[str] = None, + Num_Workers: int = 4, + Use_PretrainedModels: bool = False, + Model_Path_Pretrained_G: str = 'pretrained_G.pth', + Model_Path_Pretrained_D: str = 'pretrained_D.pth', + Output_Root: str = './', + Output_Dir_Name: str = 'Output', + Output_Config_Name: str = 'Config.json', + Output_LogDir: str = './' +): ''' - 1. Preprocess - 2. Train & Evaluate + Train speech models ''' - def __init__(self, - FileList_Path_Training: str, - FileList_Path_Validation: str, - Set_Epochs: int = 10000, - Set_Eval_Interval: int = 1000, - Set_Batch_Size: int = 16, - Set_FP16_Run: bool = True, - Keep_Original_Speakers: bool = False, - Config_Path_Load: Optional[str] = None, - Num_Workers: int = 4, - Use_PretrainedModels: bool = True, - Model_Path_Pretrained_G: Optional[str] = None, - Model_Path_Pretrained_D: Optional[str] = None, - Output_Root: str = "./", - Output_Dir_Name: str = "", - Output_LogDir: str = "" - ): - Dir_Output = Path(Output_Root).joinpath(Output_Dir_Name).as_posix() - Preprocessing.__init__(self, FileList_Path_Training, FileList_Path_Validation, Dir_Output, Set_Eval_Interval, Set_Epochs, Set_Batch_Size, Set_FP16_Run, Keep_Original_Speakers, Config_Path_Load) - Training.__init__(self, Num_Workers, Model_Path_Pretrained_G if Use_PretrainedModels else None, Model_Path_Pretrained_D if Use_PretrainedModels else None, Keep_Original_Speakers, Output_LogDir) - self.Model_Dir_Save = Dir_Output - - def Preprocessing_and_Training(self): - # Preprocess - self.Configurator() - self.Cleaner() - self.Resampler() - - # Train & Evaluate - """Assume Single Node Multi GPUs Training Only""" - assert torch.cuda.is_available(), "CPU training is not allowed." 
- - n_gpus = torch.cuda.device_count() - os.environ['MASTER_ADDR'] = 'localhost' - os.environ['MASTER_PORT'] = '8000' - - hps = get_hparams( - Config_Path = self.Config_Path_Edited, - Model_Dir = self.Model_Dir_Save - ) - mp.spawn(super().run, args = (n_gpus, hps,), nprocs = n_gpus) \ No newline at end of file + global p_preprocess + if p_preprocess is None: + os.environ['FileList_Path_Training'] = str(FileList_Path_Training) + os.environ['FileList_Path_Validation'] = str(FileList_Path_Validation) + os.environ['Set_Epochs'] = str(Set_Epochs) + os.environ['Set_Eval_Interval'] = str(Set_Eval_Interval) + os.environ['Set_Batch_Size'] = str(Set_Batch_Size) + os.environ['Set_FP16_Run'] = str(Set_FP16_Run) + os.environ['Keep_Original_Speakers'] = str(Keep_Original_Speakers) + os.environ['Config_Path_Load'] = str(Config_Path_Load) + os.environ['Output_Root'] = str(Output_Root) + os.environ['Output_Dir_Name'] = str(Output_Dir_Name) + os.environ['Output_Config_Name'] = str(Output_Config_Name) + print("Start preprocessing...") + p_preprocess = Popen(f'"{python_exec}" "VITS2_finetuning/preprocess.py"', shell = True) + p_preprocess.wait() + p_preprocess = None + else: + print("已有正在进行的预处理任务,需先终止才能开启下一次任务") + + global p_train + if p_train is None: + os.environ['Num_Workers'] = str(Num_Workers) + os.environ['Use_PretrainedModels'] = str(Use_PretrainedModels) + os.environ['Model_Path_Pretrained_G'] = str(Model_Path_Pretrained_G) + os.environ['Model_Path_Pretrained_D'] = str(Model_Path_Pretrained_D) + os.environ['Output_LogDir'] = str(Output_LogDir) + print("Start training...") + p_train = Popen(f'"{python_exec}" "VITS2_finetuning/train.py"', shell = True) + p_train.wait() + p_train = None + else: + print("已有正在进行的训练任务,需先终止才能开启下一次任务") \ No newline at end of file diff --git a/EVT_Core/TTS/VITS/vits/Attentions.py b/EVT_Core/Train/VITS/VITS2_finetuning/Attentions.py similarity index 58% rename from EVT_Core/TTS/VITS/vits/Attentions.py rename to EVT_Core/Train/VITS/VITS2_finetuning/Attentions.py index a0b43a3..4fc2a33 100644 --- a/EVT_Core/TTS/VITS/vits/Attentions.py +++ b/EVT_Core/Train/VITS/VITS2_finetuning/Attentions.py @@ -2,9 +2,10 @@ import torch from torch import nn from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, weight_norm -from .Modules import LayerNorm -from .Commons import convert_pad_shape +from modules import LayerNorm +from commons import subsequent_mask, convert_pad_shape, fused_add_tanh_sigmoid_multiply class MultiHeadAttention(nn.Module): @@ -222,6 +223,150 @@ def _same_padding(self, x): return x +class Depthwise_Separable_Conv1D(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride = 1, + padding = 0, + dilation = 1, + bias = True, + padding_mode = 'zeros', # TODO: refine this type + device=None, + dtype=None + ): + super().__init__() + self.depth_conv = nn.Conv1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, groups=in_channels,stride = stride,padding=padding,dilation=dilation,bias=bias,padding_mode=padding_mode,device=device,dtype=dtype) + self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias, device=device,dtype=dtype) + + def forward(self, input): + return self.point_conv(self.depth_conv(input)) + + def weight_norm(self): + self.depth_conv = weight_norm(self.depth_conv, name = 'weight') + self.point_conv = weight_norm(self.point_conv, name = 'weight') + + def remove_weight_norm(self): + self.depth_conv = remove_weight_norm(self.depth_conv, 
name = 'weight') + self.point_conv = remove_weight_norm(self.point_conv, name = 'weight') + + +class Depthwise_Separable_TransposeConv1D(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride = 1, + padding = 0, + output_padding = 0, + bias = True, + dilation = 1, + padding_mode = 'zeros', # TODO: refine this type + device=None, + dtype=None + ): + super().__init__() + self.depth_conv = nn.ConvTranspose1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, groups=in_channels,stride = stride,output_padding=output_padding,padding=padding,dilation=dilation,bias=bias,padding_mode=padding_mode,device=device,dtype=dtype) + self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias, device=device,dtype=dtype) + + def forward(self, input): + return self.point_conv(self.depth_conv(input)) + + def weight_norm(self): + self.depth_conv = weight_norm(self.depth_conv, name = 'weight') + self.point_conv = weight_norm(self.point_conv, name = 'weight') + + def remove_weight_norm(self): + remove_weight_norm(self.depth_conv, name = 'weight') + remove_weight_norm(self.point_conv, name = 'weight') + + +def weight_norm_modules(module, name = 'weight', dim = 0): + if isinstance(module,Depthwise_Separable_Conv1D) or isinstance(module,Depthwise_Separable_TransposeConv1D): + module.weight_norm() + return module + else: + return weight_norm(module,name,dim) + +def remove_weight_norm_modules(module, name = 'weight'): + if isinstance(module,Depthwise_Separable_Conv1D) or isinstance(module,Depthwise_Separable_TransposeConv1D): + module.remove_weight_norm() + else: + remove_weight_norm(module,name) + + +class FFT(nn.Module): + def __init__(self, + hidden_channels, + filter_channels, + n_heads, + n_layers=1, + kernel_size=1, + p_dropout=0., + proximal_bias=False, + proximal_init=True, + isflow = False, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + if isflow and 'gin_channels' in kwargs and kwargs["gin_channels"] > 0: + cond_layer = torch.nn.Conv1d(kwargs["gin_channels"], 2*hidden_channels*n_layers, 1) + self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1) + self.cond_layer = weight_norm_modules(cond_layer, name='weight') + self.gin_channels = kwargs["gin_channels"] + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append( + MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, + proximal_init=proximal_init)) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, g = None): + """ + x: decoder input + h: encoder output + """ + if g is not None: + g = self.cond_layer(g) + + self_attn_mask = subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) + x = x * x_mask + for i in range(self.n_layers): + if g is not None: + x = self.cond_pre(x) + cond_offset = i * 2 * 
self.hidden_channels + g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] + x = fused_add_tanh_sigmoid_multiply( + x, + g_l, + torch.IntTensor([self.hidden_channels])) + y = self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + x = x * x_mask + return x + + class Encoder(nn.Module): #backward compatible vits2 encoder def __init__(self, hidden_channels, @@ -282,4 +427,64 @@ def forward(self, x, x_mask, g = None): y = self.drop(y) x = self.norm_layers_2[i](x + y) x = x * x_mask + return x + +class Decoder(nn.Module): + def __init__(self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0., + proximal_bias=False, + proximal_init=True, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + self.encdec_attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, h, h_mask): + """ + x: decoder input + h: encoder output + """ + self_attn_mask = subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) + encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) + + y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask return x \ No newline at end of file diff --git a/EVT_Core/Train/VITS/VITS2_finetuning/Commons.py b/EVT_Core/Train/VITS/VITS2_finetuning/Commons.py new file mode 100644 index 0000000..6990c50 --- /dev/null +++ b/EVT_Core/Train/VITS/VITS2_finetuning/Commons.py @@ -0,0 +1,153 @@ +import math +import torch +from torch.nn import functional as F + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size*dilation - dilation)/2) + + +def intersperse(lst, item): + result = [item] * (len(lst) * 2 + 1) + result[1::2] = lst + return result + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. 
* logs_q) + return kl + + +def rand_gumbel(shape): + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) + + +def rand_gumbel_like(x): + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret + + +def rand_slice_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def get_timing_signal_1d( + length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = ( + math.log(float(max_timescale) / float(min_timescale)) / + (num_timescales - 1)) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + signal.to(dtype=x.dtype, device=x.device) + + +def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + device = duration.device + + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2,3) * mask + return path + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): 
+ parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1. / norm_type) + return total_norm \ No newline at end of file diff --git a/EVT_Core/Train/VITS/vits/Data_Utils.py b/EVT_Core/Train/VITS/VITS2_finetuning/Data_Utils.py similarity index 98% rename from EVT_Core/Train/VITS/vits/Data_Utils.py rename to EVT_Core/Train/VITS/VITS2_finetuning/Data_Utils.py index 86bb0b7..b9f4d27 100644 --- a/EVT_Core/Train/VITS/vits/Data_Utils.py +++ b/EVT_Core/Train/VITS/VITS2_finetuning/Data_Utils.py @@ -3,10 +3,10 @@ import torch import torchaudio -from .Commons import intersperse -from .Mel_Processing import spectrogram_torch, mel_spectrogram_torch -from .Utils import load_audiopaths_sid_text -from .text import text_to_sequence, cleaned_text_to_sequence +from commons import intersperse +from mel_processing import spectrogram_torch, mel_spectrogram_torch +from utils import load_audiopaths_sid_text +from text import text_to_sequence, cleaned_text_to_sequence class TextAudioSpeakerLoader(torch.utils.data.Dataset): diff --git a/EVT_Core/Train/VITS/vits/Losses.py b/EVT_Core/Train/VITS/VITS2_finetuning/Losses.py similarity index 100% rename from EVT_Core/Train/VITS/vits/Losses.py rename to EVT_Core/Train/VITS/VITS2_finetuning/Losses.py diff --git a/EVT_Core/Train/VITS/vits/Mel_Processing.py b/EVT_Core/Train/VITS/VITS2_finetuning/Mel_Processing.py similarity index 100% rename from EVT_Core/Train/VITS/vits/Mel_Processing.py rename to EVT_Core/Train/VITS/VITS2_finetuning/Mel_Processing.py diff --git a/EVT_Core/TTS/VITS/vits/Models.py b/EVT_Core/Train/VITS/VITS2_finetuning/Models.py similarity index 71% rename from EVT_Core/TTS/VITS/vits/Models.py rename to EVT_Core/Train/VITS/VITS2_finetuning/Models.py index eda0914..c207b3c 100644 --- a/EVT_Core/TTS/VITS/vits/Models.py +++ b/EVT_Core/Train/VITS/VITS2_finetuning/Models.py @@ -2,13 +2,13 @@ import torch from torch import nn from torch.nn import functional as F -from torch.nn import Conv1d, ConvTranspose1d -from torch.nn.utils import weight_norm, remove_weight_norm +from torch.nn import Conv1d, ConvTranspose1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from . import Modules -from . import Attentions -from . import Commons -from . 
import monotonic_align +import modules +import attentions +import commons +import monotonic_align AVAILABLE_FLOW_TYPES = [ @@ -43,25 +43,25 @@ def __init__(self, self.n_flows = n_flows self.gin_channels = gin_channels - self.log_flow = Modules.Log() + self.log_flow = modules.Log() self.flows = nn.ModuleList() - self.flows.append(Modules.ElementwiseAffine(2)) + self.flows.append(modules.ElementwiseAffine(2)) for i in range(n_flows): - self.flows.append(Modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) - self.flows.append(Modules.Flip()) + self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) + self.flows.append(modules.Flip()) self.post_pre = nn.Conv1d(1, filter_channels, 1) self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.post_convs = Modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) + self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) self.post_flows = nn.ModuleList() - self.post_flows.append(Modules.ElementwiseAffine(2)) + self.post_flows.append(modules.ElementwiseAffine(2)) for i in range(4): - self.post_flows.append(Modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) - self.post_flows.append(Modules.Flip()) + self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)) + self.post_flows.append(modules.Flip()) self.pre = nn.Conv1d(in_channels, filter_channels, 1) self.proj = nn.Conv1d(filter_channels, filter_channels, 1) - self.convs = Modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) + self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout) if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, filter_channels, 1) @@ -131,9 +131,9 @@ def __init__(self, self.drop = nn.Dropout(p_dropout) self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2) - self.norm_1 = Modules.LayerNorm(filter_channels) + self.norm_1 = modules.LayerNorm(filter_channels) self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2) - self.norm_2 = Modules.LayerNorm(filter_channels) + self.norm_2 = modules.LayerNorm(filter_channels) self.proj = nn.Conv1d(filter_channels, 1, 1) if gin_channels != 0: @@ -156,6 +156,151 @@ def forward(self, x, x_mask, g=None): return x * x_mask +class DurationDiscriminatorV1(nn.Module): # vits2 + # TODO : not using "spk conditioning" for now according to the paper. + # Can be a better discriminator if we use it. 
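# ---------------------------------------------------------------------------
# Illustrative, simplified stand-in (not the patch's code) for the
# DurationDiscriminatorV1/V2 interface added below for VITS2.  The real classes
# use extra convolutions and LayerNorms; this sketch only shows the calling
# convention used by the training loop removed from Train.py earlier in this
# diff: the same detached text encoding is scored once against the real
# (log-)durations and once against the predicted ones.
import torch
from torch import nn

class ToyDurationDiscriminator(nn.Module):
    def __init__(self, in_channels, filter_channels, kernel_size=3):
        super().__init__()
        self.pre = nn.Conv1d(in_channels, filter_channels, kernel_size,
                             padding=kernel_size // 2)
        self.dur_proj = nn.Conv1d(1, filter_channels, 1)
        self.out = nn.Sequential(nn.Linear(2 * filter_channels, 1), nn.Sigmoid())

    def forward(self, x, x_mask, dur_r, dur_hat):
        x = self.pre(torch.detach(x) * x_mask)          # encoder output is detached
        probs = []
        for dur in (dur_r, dur_hat):                     # real first, generated second
            h = torch.cat([x, self.dur_proj(dur)], dim=1) * x_mask
            probs.append(self.out(h.transpose(1, 2)))    # [b, t, 1] per-frame prob
        return probs

b, h, t = 2, 192, 37
disc = ToyDurationDiscriminator(h, 256)
x, mask = torch.randn(b, h, t), torch.ones(b, 1, t)
dur_r, dur_hat = torch.rand(b, 1, t), torch.rand(b, 1, t)
p_real, p_fake = disc(x, mask, dur_r, dur_hat)           # each has shape [2, 37, 1]
# ---------------------------------------------------------------------------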
+ def __init__(self, + in_channels, + filter_channels, + kernel_size, + p_dropout, + gin_channels=0 + ): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.gin_channels = gin_channels + + self.drop = nn.Dropout(p_dropout) + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2) + # self.norm_1 = modules.LayerNorm(filter_channels) + self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) + # self.norm_2 = modules.LayerNorm(filter_channels) + self.dur_proj = nn.Conv1d(1, filter_channels, 1) + + self.pre_out_conv_1 = nn.Conv1d(2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) + self.pre_out_norm_1 = modules.LayerNorm(filter_channels) + self.pre_out_conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) + self.pre_out_norm_2 = modules.LayerNorm(filter_channels) + + # if gin_channels != 0: + # self.cond = nn.Conv1d(gin_channels, in_channels, 1) + + self.output_layer = nn.Sequential(nn.Linear(filter_channels, 1), nn.Sigmoid()) + + def forward_probability(self, x, x_mask, dur, g=None): + dur = self.dur_proj(dur) + x = torch.cat([x, dur], dim=1) + x = self.pre_out_conv_1(x * x_mask) + # x = torch.relu(x) + # x = self.pre_out_norm_1(x) + # x = self.drop(x) + x = self.pre_out_conv_2(x * x_mask) + # x = torch.relu(x) + # x = self.pre_out_norm_2(x) + # x = self.drop(x) + x = x * x_mask + x = x.transpose(1, 2) + output_prob = self.output_layer(x) + return output_prob + + def forward(self, x, x_mask, dur_r, dur_hat, g=None): + x = torch.detach(x) + # if g is not None: + # g = torch.detach(g) + # x = x + self.cond(g) + x = self.conv_1(x * x_mask) + # x = torch.relu(x) + # x = self.norm_1(x) + # x = self.drop(x) + x = self.conv_2(x * x_mask) + # x = torch.relu(x) + # x = self.norm_2(x) + # x = self.drop(x) + + output_probs = [] + for dur in [dur_r, dur_hat]: + output_prob = self.forward_probability(x, x_mask, dur, g) + output_probs.append(output_prob) + + return output_probs + + +class DurationDiscriminatorV2(nn.Module): # vits2 + # TODO : not using "spk conditioning" for now according to the paper. + # Can be a better discriminator if we use it. 
+ def __init__( + self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0 + ): + super().__init__() + + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.gin_channels = gin_channels + + self.conv_1 = nn.Conv1d( + in_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_1 = modules.LayerNorm(filter_channels) + self.conv_2 = nn.Conv1d( + filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.norm_2 = modules.LayerNorm(filter_channels) + self.dur_proj = nn.Conv1d(1, filter_channels, 1) + + self.pre_out_conv_1 = nn.Conv1d( + 2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.pre_out_norm_1 = modules.LayerNorm(filter_channels) + self.pre_out_conv_2 = nn.Conv1d( + filter_channels, filter_channels, kernel_size, padding=kernel_size // 2 + ) + self.pre_out_norm_2 = modules.LayerNorm(filter_channels) + + # if gin_channels != 0: + # self.cond = nn.Conv1d(gin_channels, in_channels, 1) + + self.output_layer = nn.Sequential(nn.Linear(filter_channels, 1), nn.Sigmoid()) + + def forward_probability(self, x, x_mask, dur, g=None): + dur = self.dur_proj(dur) + x = torch.cat([x, dur], dim=1) + x = self.pre_out_conv_1(x * x_mask) + x = torch.relu(x) + x = self.pre_out_norm_1(x) + x = self.pre_out_conv_2(x * x_mask) + x = torch.relu(x) + x = self.pre_out_norm_2(x) + x = x * x_mask + x = x.transpose(1, 2) + output_prob = self.output_layer(x) + return output_prob + + def forward(self, x, x_mask, dur_r, dur_hat, g=None): + x = torch.detach(x) + # if g is not None: + # g = torch.detach(g) + # x = x + self.cond(g) + x = self.conv_1(x * x_mask) + x = torch.relu(x) + x = self.norm_1(x) + x = self.conv_2(x * x_mask) + x = torch.relu(x) + x = self.norm_2(x) + + output_probs = [] + for dur in [dur_r, dur_hat]: + output_prob = self.forward_probability(x, x_mask, dur, g) + output_probs.append([output_prob]) + + return output_probs + + class TextEncoder(nn.Module): def __init__(self, n_vocab, @@ -184,7 +329,7 @@ def __init__(self, nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) # Transformer Encoder - self.encoder = Attentions.Encoder( + self.encoder = attentions.Encoder( hidden_channels, filter_channels, n_heads, @@ -198,7 +343,7 @@ def __init__(self, def forward(self, x, x_lengths, g=None): x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(Commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.encoder(x * x_mask, x_mask, g=g) stats = self.proj(x) * x_mask @@ -229,7 +374,7 @@ def __init__(self, self.mean_only = mean_only self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.pre_transformer = Attentions.Encoder( + self.pre_transformer = attentions.Encoder( hidden_channels, hidden_channels, n_heads=2, @@ -238,7 +383,7 @@ def __init__(self, p_dropout=p_dropout, # window_size=None, ) - self.enc = Modules.WN( + self.enc = modules.WN( hidden_channels, kernel_size, dilation_rate, @@ -294,7 +439,7 @@ def __init__(self, self.half_channels = channels // 2 self.mean_only = mean_only # vits2 - self.pre_transformer = Attentions.Encoder( + self.pre_transformer = attentions.Encoder( self.half_channels, self.half_channels, n_heads=2, @@ -305,7 +450,7 @@ def __init__(self, ) self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - 
self.enc = Modules.WN( + self.enc = modules.WN( hidden_channels, kernel_size, dilation_rate, @@ -314,7 +459,7 @@ def __init__(self, gin_channels=gin_channels, ) # vits2 - self.post_transformer = Attentions.Encoder( + self.post_transformer = attentions.Encoder( self.hidden_channels, self.hidden_channels, n_heads=2, @@ -378,7 +523,7 @@ def __init__(self, self.mean_only = mean_only self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = Attentions.FFT( + self.enc = attentions.FFT( hidden_channels, filter_channels, n_heads, @@ -431,7 +576,7 @@ def __init__(self, self.mean_only = mean_only self.residual_connection = residual_connection # vits2 - self.pre_transformer = Attentions.Encoder( + self.pre_transformer = attentions.Encoder( self.half_channels, self.half_channels, n_heads=2, @@ -534,7 +679,7 @@ def __init__(self, mean_only=True ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) elif transformer_flow_type == "pre_conv2": for i in range(n_flows): self.flows.append( @@ -548,7 +693,7 @@ def __init__(self, mean_only=True, ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) elif transformer_flow_type == "fft": for i in range(n_flows): self.flows.append( @@ -562,11 +707,11 @@ def __init__(self, mean_only=True ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) elif transformer_flow_type == "mono_layer_inter_residual": for i in range(n_flows): self.flows.append( - Modules.ResidualCouplingLayer( + modules.ResidualCouplingLayer( channels, hidden_channels, kernel_size, @@ -576,7 +721,7 @@ def __init__(self, mean_only=True ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) self.flows.append( MonoTransformerFlowLayer( channels, hidden_channels, mean_only=True @@ -585,7 +730,7 @@ def __init__(self, elif transformer_flow_type == "mono_layer_post_residual": for i in range(n_flows): self.flows.append( - Modules.ResidualCouplingLayer( + modules.ResidualCouplingLayer( channels, hidden_channels, kernel_size, @@ -595,7 +740,7 @@ def __init__(self, mean_only=True, ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) self.flows.append( MonoTransformerFlowLayer( channels, @@ -607,7 +752,7 @@ def __init__(self, else: for i in range(n_flows): self.flows.append( - Modules.ResidualCouplingLayer( + modules.ResidualCouplingLayer( channels, hidden_channels, kernel_size, @@ -617,7 +762,7 @@ def __init__(self, mean_only=True ) ) - self.flows.append(Modules.Flip()) + self.flows.append(modules.Flip()) def forward(self, x, x_mask, g=None, reverse=False): if not reverse: @@ -649,11 +794,11 @@ def __init__(self, self.gin_channels = gin_channels self.pre = nn.Conv1d(in_channels, hidden_channels, 1) - self.enc = Modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) + self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) def forward(self, x, x_lengths, g=None): # x: LinearSpectrum; g: GlobalCondition - x_mask = torch.unsqueeze(Commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) x = self.pre(x) * x_mask x = self.enc(x, x_mask, g=g) stats = self.proj(x) * x_mask @@ -677,7 +822,7 @@ def __init__(self, self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 
1, padding=3) - resblock = Modules.ResBlock1 if resblock == '1' else Modules.ResBlock2 + resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 self.ups = nn.ModuleList() for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): @@ -694,7 +839,7 @@ def __init__(self, self.resblocks.append(resblock(ch, k, d)) self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) - self.ups.apply(Commons.init_weights) + self.ups.apply(commons.init_weights) if gin_channels != 0: self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) @@ -705,7 +850,7 @@ def forward(self, x, g=None): x = x + self.cond(g) for i in range(self.num_upsamples): - x = F.leaky_relu(x, Modules.LRELU_SLOPE) + x = F.leaky_relu(x, modules.LRELU_SLOPE) x = self.ups[i](x) xs = None for j in range(self.num_kernels): @@ -728,6 +873,145 @@ def remove_weight_norm(self): l.remove_weight_norm() +class DiscriminatorP(torch.nn.Module): + def __init__(self, + period, + kernel_size=5, + stride=3, + use_spectral_norm=False + ): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(commons.get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(commons.get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(commons.get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(commons.get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(commons.get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [DiscriminatorP(i, 
use_spectral_norm=use_spectral_norm) for i in periods] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + class SynthesizerTrn(nn.Module): """ Synthesizer for Training @@ -879,7 +1163,7 @@ def forward(self, x, x_lengths, y, y_lengths, sid=None): m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) - z_slice, ids_slice = Commons.rand_slice_segments(z, y_lengths, self.segment_size) + z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size) o = self.dec(z_slice, g=g) return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q), (x, logw, logw_) @@ -899,9 +1183,9 @@ def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_sca w = torch.exp(logw) * x_mask * length_scale w_ceil = torch.ceil(w) y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() - y_mask = torch.unsqueeze(Commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) + y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype) attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) - attn = Commons.generate_path(w_ceil, attn_mask) + attn = commons.generate_path(w_ceil, attn_mask) m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] @@ -909,4 +1193,17 @@ def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_sca z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale z = self.flow(z_p, y_mask, g=g, reverse=True) o = self.dec((z * y_mask)[:, :, :max_len], g=g) - return o, attn, y_mask, (z, z_p, m_p, logs_p) \ No newline at end of file + return o, attn, y_mask, (z, z_p, m_p, logs_p) + + ''' + ## (obsolete) currently vits-2 is not capable of voice conversion + def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): + assert self.n_speakers > 0, "n_speakers have to be larger than 0." 
+ g_src = self.emb_g(sid_src).unsqueeze(-1) + g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) + z_p = self.flow(z, y_mask, g=g_src) + z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) + o_hat, o_hat_mb = self.dec(z_hat * y_mask, g=g_tgt) + return o_hat, o_hat_mb, y_mask, (z, z_p, z_hat) + ''' \ No newline at end of file diff --git a/EVT_Core/Train/VITS/vits/Modules.py b/EVT_Core/Train/VITS/VITS2_finetuning/Modules.py similarity index 99% rename from EVT_Core/Train/VITS/vits/Modules.py rename to EVT_Core/Train/VITS/VITS2_finetuning/Modules.py index 62da8a3..7a845a1 100644 --- a/EVT_Core/Train/VITS/vits/Modules.py +++ b/EVT_Core/Train/VITS/VITS2_finetuning/Modules.py @@ -5,8 +5,8 @@ from torch.nn import Conv1d from torch.nn.utils import weight_norm, remove_weight_norm -from .Commons import fused_add_tanh_sigmoid_multiply, init_weights, get_padding -from .Transforms import piecewise_rational_quadratic_transform +from commons import fused_add_tanh_sigmoid_multiply, init_weights, get_padding +from transforms import piecewise_rational_quadratic_transform LRELU_SLOPE = 0.1 diff --git a/EVT_Core/Train/VITS/vits/Transforms.py b/EVT_Core/Train/VITS/VITS2_finetuning/Transforms.py similarity index 100% rename from EVT_Core/Train/VITS/vits/Transforms.py rename to EVT_Core/Train/VITS/VITS2_finetuning/Transforms.py diff --git a/EVT_Core/Train/VITS/VITS2_finetuning/Utils.py b/EVT_Core/Train/VITS/VITS2_finetuning/Utils.py new file mode 100644 index 0000000..d9d6485 --- /dev/null +++ b/EVT_Core/Train/VITS/VITS2_finetuning/Utils.py @@ -0,0 +1,261 @@ +import os +import re +import glob +import sys +import shutil +import logging +logging.basicConfig(stream = sys.stdout, encoding = 'utf-8') +logger = logging +import json +import subprocess +import matplotlib +import matplotlib.pylab as plt +import numpy as np +import torch +from typing import Optional +from pathlib import Path + + +MATPLOTLIB_FLAG = False + + +def load_checkpoint(checkpoint_path, model, optimizer, keep_speaker_emb: bool = False): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') + iteration = checkpoint_dict['iteration'] + learning_rate = checkpoint_dict['learning_rate'] + optimizer.load_state_dict(checkpoint_dict['optimizer']) if optimizer is not None else None + def get_new_state_dict(state_dict, saved_state_dict, keep_speaker_emb): + new_state_dict = {} + for layer_param, tensor in state_dict.items(): + try: # Assign tensor of layer param from saved state dict to new state dict while layer param is not embedding's weight, otherwise use the current tensor + if layer_param == 'emb_g.weight': + if keep_speaker_emb: # Keep the original speaker embedding, otherwise drop it + tensor[:saved_state_dict[layer_param].shape[0], :] = saved_state_dict[layer_param] + new_state_dict[layer_param] = tensor + else: + new_state_dict[layer_param] = saved_state_dict[layer_param] + except: + logger.info("%s is not in the checkpoint" % layer_param) + new_state_dict[layer_param] = tensor + return new_state_dict + if hasattr(model, 'module'): + model.module.load_state_dict(get_new_state_dict(model.module.state_dict(), checkpoint_dict['model'], keep_speaker_emb)) + else: + model.load_state_dict(get_new_state_dict(model.state_dict(), checkpoint_dict['model'], keep_speaker_emb)) + logger.info(f"Loaded checkpoint '{checkpoint_path}' (iteration {iteration})") + return model, optimizer, learning_rate, iteration + + +def save_checkpoint(model, optimizer, 
learning_rate, iteration, checkpoint_path): # fix issue: torch.save doesn't support chinese path + logger.info(f"Saving model and optimizer state at iteration {iteration} to {checkpoint_path}") + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + checkpoint_path_tmp = Path(Path(checkpoint_path).root).joinpath("checkpoint_tmp").as_posix() + torch.save( + { + 'model': state_dict, + 'iteration': iteration, + 'optimizer': optimizer.state_dict(), + 'learning_rate': learning_rate + }, + checkpoint_path_tmp + ) + shutil.move(checkpoint_path_tmp, Path(checkpoint_path).parent.as_posix()) + + +def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats='HWC') + for k, v in audios.items(): + writer.add_audio(k, v, global_step, audio_sampling_rate) + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + f_list = glob.glob(os.path.normpath(os.path.join(dir_path, regex))) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + print(x) + return x + + +def remove_old_checkpoints(cp_dir, prefixes=['G_*.pth', 'D_*.pth', 'DUR_*.pth']): + def scan_checkpoint(dir_path, regex): + f_list = glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + if len(f_list) == 0: + return None + return f_list + for prefix in prefixes: + sorted_ckpts = scan_checkpoint(cp_dir, prefix) + if sorted_ckpts and len(sorted_ckpts) > 3: + for ckpt_path in sorted_ckpts[:-3]: + os.remove(ckpt_path) + print("removed {}".format(ckpt_path)) + + +def plot_spectrogram_to_numpy(spectrogram): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + + fig, ax = plt.subplots(figsize=(10,2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation='none') + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger('matplotlib') + mpl_logger.setLevel(logging.WARNING) + + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower', interpolation='none') + fig.colorbar(im, ax=ax) + xlabel = 'Decoder timestep' + if info is not None: + xlabel += '\n\n' + info + plt.xlabel(xlabel) + plt.ylabel('Encoder timestep') + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_audiopaths_sid_text(filename, split = "|"): + with open(filename, 'r', encoding = 'utf-8') as f: + audiopaths_sid_text = [line.strip().split(split) for line in f] + return audiopaths_sid_text + + +def add_elements( + Iterable1, + Iterable2 +): + ''' + Add unique elements form Iterable2 to Iterable1 + 
''' + def GetDictKeys(Iterable): + return sorted(Iterable.keys(), key = lambda Key: Iterable[Key]) if isinstance(Iterable, dict) else Iterable + Iterable1, Iterable2 = GetDictKeys(Iterable1), GetDictKeys(Iterable2) + for Element in Iterable2: + Iterable1.append(Element) if Element not in Iterable1 else None + return Iterable1 + + +def check_git_hash(model_dir): + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.normpath(os.path.join(source_dir, ".git"))): + logger.warn(f"{source_dir} is not a git repository, therefore hash value comparison will be ignored.") + return + + cur_hash = subprocess.getoutput("git rev-parse HEAD") + + path = os.path.normpath(os.path.join(model_dir, "githash")) + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warn("git hash values are different. {}(saved) != {}(current)".format(saved_hash[:8], cur_hash[:8])) + else: + open(path, "w").write(cur_hash) + + +def get_logger(model_dir, filename="train.log"): + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.DEBUG) + + if not os.path.exists(model_dir): + os.makedirs(model_dir) + handler = logging.FileHandler(os.path.normpath(os.path.join(model_dir, filename))) + handler.setLevel(logging.DEBUG) + handler.setFormatter(logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")) + logger.addHandler(handler) + return logger + + +class HParams(): + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() + + +def get_hparams( + Config_Path: str, + Model_Dir: Optional[str] = None +): + with open(Config_Path, 'r', encoding = 'utf-8') as f: + data = f.read() + config = json.loads(data) + hparams = HParams(**config) + + if Model_Dir is not None: + os.makedirs(Model_Dir) if not Path(Model_Dir).exists() else None + hparams.model_dir = Model_Dir + + return hparams + + +def Get_Config_Path(ConfigPath): + if Path(ConfigPath).is_dir(): + ConfigPaths = [File for File in os.listdir(ConfigPath) if Path(File).suffix == '.json'] + ConfigPath = sorted(ConfigPaths, key = lambda ConfigPath: re.sub(r'[A-Za-z]+', '', Path(ConfigPath).name))[-1] + return ConfigPath + + +def Get_Model_Path(ModelPath): + if Path(ModelPath).is_dir(): + ModelPaths = [File for File in os.listdir(ModelPath) if Path(File).suffix == '.pth' and 'G_' in File] + ModelPath = sorted(ModelPaths, key = lambda ModelPath: re.sub(r'G_[A-Za-z]+', '', Path(ModelPath).name))[-1] + return ModelPath \ No newline at end of file diff --git a/EVT_Core/Train/VITS/vits/__init__.py b/EVT_Core/Train/VITS/VITS2_finetuning/__init__.py similarity index 100% rename from EVT_Core/Train/VITS/vits/__init__.py rename to EVT_Core/Train/VITS/VITS2_finetuning/__init__.py diff --git a/EVT_Core/Train/VITS/configs/mandarin_base.json b/EVT_Core/Train/VITS/VITS2_finetuning/configs/mandarin_base.json similarity index 100% rename from EVT_Core/Train/VITS/configs/mandarin_base.json rename to EVT_Core/Train/VITS/VITS2_finetuning/configs/mandarin_base.json 
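The new `VITS2_finetuning/Utils.py` above bundles the config and checkpoint helpers (`HParams`, `get_hparams`, `latest_checkpoint_path`, `Get_Config_Path`, `Get_Model_Path`) that `preprocess.py` and `train.py` rely on. A minimal usage sketch, not part of this patch: it assumes a hypothetical `Output` directory holding `Config.json` and `G_*.pth` checkpoints, and imports from `utils` the way `train.py` in this patch does (the file itself is added as `Utils.py`).

```python
# Sketch only: how the helpers added in Utils.py fit together.
from pathlib import Path

from utils import get_hparams, latest_checkpoint_path, Get_Config_Path, Get_Model_Path

output_dir = "Output"  # hypothetical training output directory

# Pick the newest *.json in the directory, then load it into an attribute-style HParams tree
config_name = Get_Config_Path(output_dir)
hps = get_hparams(Config_Path=Path(output_dir).joinpath(config_name).as_posix(), Model_Dir=output_dir)
print(hps.train.batch_size, hps.data.sampling_rate)   # nested dicts become nested HParams

# Locate the newest generator checkpoint for resuming or inference
print(Get_Model_Path(output_dir))                      # newest G_*.pth, returned as a file name
print(latest_checkpoint_path(output_dir, "G_*.pth"))   # full-path variant used by train.py
```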
diff --git a/EVT_Core/Train/VITS/configs/mandarin_english_japanese_base.json b/EVT_Core/Train/VITS/VITS2_finetuning/configs/mandarin_english_japanese_base.json similarity index 100% rename from EVT_Core/Train/VITS/configs/mandarin_english_japanese_base.json rename to EVT_Core/Train/VITS/VITS2_finetuning/configs/mandarin_english_japanese_base.json diff --git a/EVT_Core/TTS/VITS/vits/monotonic_align/Core.py b/EVT_Core/Train/VITS/VITS2_finetuning/monotonic_align/Core.py similarity index 77% rename from EVT_Core/TTS/VITS/vits/monotonic_align/Core.py rename to EVT_Core/Train/VITS/VITS2_finetuning/monotonic_align/Core.py index 11aba18..1c44515 100644 --- a/EVT_Core/TTS/VITS/vits/monotonic_align/Core.py +++ b/EVT_Core/Train/VITS/VITS2_finetuning/monotonic_align/Core.py @@ -3,8 +3,8 @@ # Compile a Python function into native code @nb.jit( - # func( paths, values, t_ys, t_xs ) - nb.void(nb.int32[:,:,::1], nb.float32[:,:,::1], nb.int32[::1], nb.int32[::1]), + # func( paths, values, t_ys, t_xs ) + nb.void(nb.int32[:, :, ::1], nb.float32[:, :, ::1], nb.int32[::1], nb.int32[::1]), nogil = True, nopython = True ) @@ -39,17 +39,17 @@ def maximum_path_nb(paths, values, t_ys, t_xs): if x == y: v_cur = max_neg_val else: - v_cur = value[y-1, x] + v_cur = value[y - 1, x] if x == 0: if y == 0: v_prev = 0. else: v_prev = max_neg_val else: - v_prev = value[y-1, x-1] + v_prev = value[y - 1, x - 1] value[y, x] += max(v_prev, v_cur) for y in range(t_y - 1, -1, -1): path[y, index] = 1 - if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]): + if index != 0 and (index == y or value[y - 1, index] < value[y - 1, index - 1]): index = index - 1 \ No newline at end of file diff --git a/EVT_Core/TTS/VITS/vits/monotonic_align/__init__.py b/EVT_Core/Train/VITS/VITS2_finetuning/monotonic_align/__init__.py similarity index 89% rename from EVT_Core/TTS/VITS/vits/monotonic_align/__init__.py rename to EVT_Core/Train/VITS/VITS2_finetuning/monotonic_align/__init__.py index 19b25b7..81b52ab 100644 --- a/EVT_Core/TTS/VITS/vits/monotonic_align/__init__.py +++ b/EVT_Core/Train/VITS/VITS2_finetuning/monotonic_align/__init__.py @@ -1,11 +1,11 @@ import numpy as np import torch -from .Core import maximum_path_nb +from .core import maximum_path_nb def maximum_path(neg_cent, mask): - """ Cython optimized version. + """ Numba optimized version. 
neg_cent: [b, t_t, t_s] mask: [b, t_t, t_s] """ diff --git a/EVT_Core/Train/VITS/VITS2_finetuning/preprocess.py b/EVT_Core/Train/VITS/VITS2_finetuning/preprocess.py new file mode 100644 index 0000000..7b585bb --- /dev/null +++ b/EVT_Core/Train/VITS/VITS2_finetuning/preprocess.py @@ -0,0 +1,186 @@ +import os +import re +import argparse +import json +import torchaudio +from typing import Optional +from pathlib import Path +from concurrent.futures import ThreadPoolExecutor + +from utils import ( + load_audiopaths_sid_text, + add_elements +) +from text import ( + _clean_text +) + + +parser = argparse.ArgumentParser() +parser.add_argument("--FileList_Path_Training", type = str, default = "train.txt") +parser.add_argument("--FileList_Path_Validation", type = str, default = "val.txt") +parser.add_argument("--Set_Epochs", type = int, default = 10000) +parser.add_argument("--Set_Eval_Interval", type = int, default = 1000) +parser.add_argument("--Set_Batch_Size", type = int, default = 16) +parser.add_argument("--Set_FP16_Run", type = bool, default = True) +parser.add_argument("--Keep_Original_Speakers", type = bool, default = False) +parser.add_argument("--Config_Path_Load", type = Optional[str], default = None) +parser.add_argument("--Output_Root", type = str, default = "./") +parser.add_argument("--Output_Dir_Name", type = str, default = "Output") +parser.add_argument("--Output_Config_Name", type = str, default = "Config.json") +args = parser.parse_args() + +FileList_Path_Training = str(os.environ.get('FileList_Path_Training', str(args.FileList_Path_Training))) +FileList_Path_Validation = str(os.environ.get('FileList_Path_Validation', str(args.FileList_Path_Validation))) +Set_Epochs = int(os.environ.get('Set_Epochs', str(args.Set_Epochs))) +Set_Eval_Interval = int(os.environ.get('Set_Eval_Interval', str(args.Set_Eval_Interval))) +Set_Batch_Size = int(os.environ.get('Set_Batch_Size', str(args.Set_Batch_Size))) +Set_FP16_Run = eval(os.environ.get('Set_FP16_Run', str(args.Set_FP16_Run))) +Keep_Original_Speakers = eval(os.environ.get('Keep_Original_Speakers', str(args.Keep_Original_Speakers))) +Config_Path_Load = str(os.environ.get('Config_Path_Load', str(args.Config_Path_Load))) if Keep_Original_Speakers else None +Output_Root = str(os.environ.get('Output_Root', str(args.Output_Root))) +Output_Dir_Name = str(os.environ.get('Output_Dir_Name', str(args.Output_Dir_Name))) +Output_Config_Name = str(os.environ.get('Output_Config_Name', str(args.Output_Config_Name))) + +Dir_Output = Path(Output_Root).joinpath(Output_Dir_Name).as_posix() +os.makedirs(Dir_Output, exist_ok = True) +Config_Path_Edited = Path(Dir_Output).joinpath(Output_Config_Name).__str__() +FileList_Path_Training_Updated = Path(Config_Path_Edited).parent.joinpath(Path(FileList_Path_Training).name).__str__() +FileList_Path_Validation_Updated = Path(Config_Path_Edited).parent.joinpath(Path(FileList_Path_Validation).name).__str__() +Out_Extension = "cleaned" + + +def Configurator(): + ''' + Edit JSON file + ''' + def Get_Languages(Text_Path_Training, Text_Path_Validation): + Languages = [] + for Text_Path in [Text_Path_Training, Text_Path_Validation]: + with open(file = Text_Path, mode = 'r', encoding = 'utf-8') as File: + Lines = File.readlines() + for _, Line in enumerate(Lines): + Line_Text = Line.split('|', maxsplit = 2)[2] + Language = re.split(r'[\[\]]', Line_Text)[1] + Languages.append(Language) if Language not in Languages else None + if set(Languages).issubset({'ZH', 'EN', 'JA'}): + if set(Languages) == {'ZH'}: + return "mandarin" + 
else: + return "mandarin_english_japanese" + else: + raise Exception('Unsupported language!') + + def Get_NewSpeakers(Text_Path_Training, Text_Path_Validation): + Speakers = [] + for Text_Path in [Text_Path_Training, Text_Path_Validation]: + with open(file = Text_Path, mode = 'r', encoding = 'utf-8') as File: + Lines = File.readlines() + for _, Line in enumerate(Lines): + Speaker = Line.split('|', maxsplit = 2)[1] + Speakers.append(Speaker) if Speaker not in Speakers else None + return Speakers + + def Get_OldSpeakers(Config_Path_Load): + if Config_Path_Load is not None and Path(Config_Path_Load).exists(): + with open(file = Config_Path_Load, mode = 'rb') as ConfigFile_Extra: + OldSpeakers = json.load(ConfigFile_Extra)["speakers"] + else: + OldSpeakers = [] + return OldSpeakers + + Language = Get_Languages(FileList_Path_Training, FileList_Path_Validation) + NewSpeakers = Get_NewSpeakers(FileList_Path_Training, FileList_Path_Validation) + OldSpeakers = Get_OldSpeakers(Config_Path_Load) if Keep_Original_Speakers else [] + + with open(file = Path(__file__).parent.joinpath('./configs', f'{Language}_base.json').__str__(), mode = 'rb') as ConfigFile_Default: + Params = json.load(ConfigFile_Default) + try: + Params_Old = Params + Params_Old["train"]["eval_interval"] = Set_Eval_Interval + Params_Old["train"]["epochs"] = Set_Epochs + Params_Old["train"]["batch_size"] = Set_Batch_Size + Params_Old["train"]["fp16_run"] = Set_FP16_Run + Params_Old["data"]["training_files"] = f'{FileList_Path_Training_Updated}.{Out_Extension}' + Params_Old["data"]["validation_files"] = f'{FileList_Path_Validation_Updated}.{Out_Extension}' + Params_Old["data"]["text_cleaners"] = [(Language + "_cleaners").lower()] + Params_Old["data"]["n_speakers"] = add_elements(OldSpeakers, NewSpeakers).__len__() + Params_Old["speakers"] = add_elements(OldSpeakers, NewSpeakers) + Params_New = Params_Old + except: + raise Exception("Please check if params exist") + with open(Config_Path_Edited, 'w', encoding = 'utf-8') as File_New: + json.dump(Params_New, File_New, indent = 4) + print(f"Config created in {Dir_Output}") + + +def Cleaner(): + ''' + Convert natural language text to symbols + ''' + def Update_SID(Config_Path, Text_Path, Save_Path): + with open(file = Config_Path, mode = 'rb') as ConfigFile: + NewSpeakers = json.load(ConfigFile)["speakers"] + with open(file = Text_Path, mode = 'r', encoding = 'utf-8') as TextFile: + Lines = TextFile.readlines() + for Index, Line in enumerate(Lines): + Line_Path = Line.split('|', maxsplit = 1)[0] + Line_Path = Path(Text_Path).parent.joinpath(Line_Path).as_posix() if not Path(Line_Path).is_absolute() else Line_Path + Speaker = Line.split('|', maxsplit = 2)[1] + SpeakerID = NewSpeakers.index(Speaker) + Line_Text = Line.split('|', maxsplit = 2)[2] + Line = f"{Line_Path}|{SpeakerID}|{Line_Text}" + Lines[Index] = Line + with open(file = Save_Path, mode = 'w', encoding = 'utf-8') as TextFile: + TextFile.writelines(Lines) + + def Get_Cleaners(Config_Path): + with open(file = Config_Path, mode = 'rb') as ConfigFile: + NewCleaners = json.load(ConfigFile)["data"]["text_cleaners"] + return NewCleaners + + for Index, FileList in enumerate([FileList_Path_Training, FileList_Path_Validation]): + print("START:", FileList) + FileList_Updated = [FileList_Path_Training_Updated, FileList_Path_Validation_Updated][Index] + Update_SID(Config_Path_Edited, FileList, FileList_Updated) + Path_SID_Text = load_audiopaths_sid_text(FileList_Updated) + for i in range(len(Path_SID_Text)): + Path_SID_Text[i][2] = 
_clean_text(Path_SID_Text[i][2], Get_Cleaners(Config_Path_Edited)) + Filelist_Cleaned = FileList_Updated + "." + Out_Extension + with open(Filelist_Cleaned, 'w', encoding = 'utf-8') as f: + f.writelines(["|".join(x) + "\n" for x in Path_SID_Text]) + + +def Resampler(): + ''' + Resample dataset audio to fit the sampling rate setting in config + ''' + def Get_Resample_List(Config_Path, Text_Path): + ResampleList = [] + with open(file = Config_Path, mode = 'rb') as ConfigFile: + SampleRate_New = json.load(ConfigFile)['data']['sampling_rate'] + with open(file = Text_Path, mode = 'r', encoding = 'utf-8') as TextFile: + Lines = TextFile.readlines() + for Line in Lines: + Line_Path = Line.split('|', maxsplit = 1)[0] + ResampleList.append((Line_Path, SampleRate_New)) + return ResampleList + + def Resample(Audio_Path, SampleRate_New): + AudioData_Old, SampleRate_Old = torchaudio.load(Audio_Path) + AudioData_New = torchaudio.transforms.Resample(orig_freq = SampleRate_Old, new_freq = SampleRate_New)(AudioData_Old) + torchaudio.save(Audio_Path, src = AudioData_New, sample_rate = SampleRate_New) + + for FileList in (FileList_Path_Validation, FileList_Path_Training): + print("Resampling audio according to", FileList) + with ThreadPoolExecutor(max_workers = os.cpu_count()) as Executor: + Executor.map( + Resample, + *zip(*Get_Resample_List(Config_Path_Edited, FileList)) + ) + + +if __name__ == "__main__": + Configurator() + Cleaner() + Resampler() \ No newline at end of file diff --git a/EVT_Core/Train/VITS/VITS2_finetuning/text/LICENSE b/EVT_Core/Train/VITS/VITS2_finetuning/text/LICENSE new file mode 100644 index 0000000..4ad4ed1 --- /dev/null +++ b/EVT_Core/Train/VITS/VITS2_finetuning/text/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2017 Keith Ito + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
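To make the `Configurator`/`Cleaner` flow in `preprocess.py` above concrete, here is an illustrative sketch (hypothetical paths, speakers and text, not part of this patch) of the filelist format it expects and of the speaker-to-ID substitution that `Update_SID` performs before the text cleaners run and `<filelist>.cleaned` is written.

```python
# Each filelist line is "<audio path>|<speaker name>|[<LANG>]<text>[<LANG>]":
# Get_Languages() reads the [<LANG>] tags, Get_NewSpeakers() collects the speaker names.
raw_lines = [
    "wavs/a_001.wav|Speaker_A|[ZH]你好[ZH]\n",          # hypothetical entries
    "wavs/b_001.wav|Speaker_B|[EN]Hello there.[EN]\n",
]
speakers = ["Speaker_A", "Speaker_B"]  # what Get_NewSpeakers() would return here

# Update_SID() swaps the speaker name for its index in the config's "speakers" list
# (it also resolves relative audio paths against the filelist location).
updated = []
for line in raw_lines:
    path, speaker, text = line.split("|", maxsplit=2)
    updated.append(f"{path}|{speakers.index(speaker)}|{text}")

print(updated[1], end="")  # wavs/b_001.wav|1|[EN]Hello there.[EN]
```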
diff --git a/EVT_Core/TTS/VITS/vits/text/__init__.py b/EVT_Core/Train/VITS/VITS2_finetuning/text/__init__.py similarity index 60% rename from EVT_Core/TTS/VITS/vits/text/__init__.py rename to EVT_Core/Train/VITS/VITS2_finetuning/text/__init__.py index 17057e4..e56b1e5 100644 --- a/EVT_Core/TTS/VITS/vits/text/__init__.py +++ b/EVT_Core/Train/VITS/VITS2_finetuning/text/__init__.py @@ -5,6 +5,7 @@ # Mappings from symbol to numeric ID and vice versa: _symbol_to_id = {s: i for i, s in enumerate(symbols)} +_id_to_symbol = {i: s for i, s in enumerate(symbols)} def _clean_text(text, cleaner_names): @@ -32,4 +33,24 @@ def text_to_sequence(text, cleaner_names): continue symbol_id = _symbol_to_id[symbol] sequence += [symbol_id] - return sequence \ No newline at end of file + return sequence + + +def cleaned_text_to_sequence(cleaned_text): + '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. + Args: + text: string to convert to a sequence + Returns: + List of integers corresponding to the symbols in the text + ''' + sequence = [_symbol_to_id[symbol] for symbol in cleaned_text if symbol in _symbol_to_id.keys()] + return sequence + + +def sequence_to_text(sequence): + '''Converts a sequence of IDs back to a string''' + result = '' + for symbol_id in sequence: + s = _id_to_symbol[symbol_id] + result += s + return result \ No newline at end of file diff --git a/EVT_Core/Train/VITS/vits/text/chinesedialect.py b/EVT_Core/Train/VITS/VITS2_finetuning/text/chinesedialect.py similarity index 100% rename from EVT_Core/Train/VITS/vits/text/chinesedialect.py rename to EVT_Core/Train/VITS/VITS2_finetuning/text/chinesedialect.py diff --git a/EVT_Core/Train/VITS/vits/text/cleaners.py b/EVT_Core/Train/VITS/VITS2_finetuning/text/cleaners.py similarity index 100% rename from EVT_Core/Train/VITS/vits/text/cleaners.py rename to EVT_Core/Train/VITS/VITS2_finetuning/text/cleaners.py diff --git a/EVT_Core/Train/VITS/vits/text/english.py b/EVT_Core/Train/VITS/VITS2_finetuning/text/english.py similarity index 100% rename from EVT_Core/Train/VITS/vits/text/english.py rename to EVT_Core/Train/VITS/VITS2_finetuning/text/english.py diff --git a/EVT_Core/Train/VITS/vits/text/japanese.py b/EVT_Core/Train/VITS/VITS2_finetuning/text/japanese.py similarity index 100% rename from EVT_Core/Train/VITS/vits/text/japanese.py rename to EVT_Core/Train/VITS/VITS2_finetuning/text/japanese.py diff --git a/EVT_Core/Train/VITS/vits/text/mandarin.py b/EVT_Core/Train/VITS/VITS2_finetuning/text/mandarin.py similarity index 100% rename from EVT_Core/Train/VITS/vits/text/mandarin.py rename to EVT_Core/Train/VITS/VITS2_finetuning/text/mandarin.py diff --git a/EVT_Core/Train/VITS/vits/text/symbols.py b/EVT_Core/Train/VITS/VITS2_finetuning/text/symbols.py similarity index 100% rename from EVT_Core/Train/VITS/vits/text/symbols.py rename to EVT_Core/Train/VITS/VITS2_finetuning/text/symbols.py diff --git a/EVT_Core/Train/VITS/VITS2_finetuning/train.py b/EVT_Core/Train/VITS/VITS2_finetuning/train.py new file mode 100644 index 0000000..11ba3a0 --- /dev/null +++ b/EVT_Core/Train/VITS/VITS2_finetuning/train.py @@ -0,0 +1,520 @@ +import os +import sys +import platform +import argparse +import logging +logging.basicConfig(stream = sys.stdout, encoding = 'utf-8') +logging.getLogger('numba').setLevel(logging.WARNING) +import torch +from torch.nn import functional as F +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter +import torch.multiprocessing as mp +import torch.distributed as 
dist +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.cuda.amp import autocast, GradScaler +torch.backends.cudnn.benchmark = True +from typing import Optional +from pathlib import Path +from tqdm import tqdm + +from data_utils import ( + TextAudioSpeakerLoader, + TextAudioSpeakerCollate, + DistributedBucketSampler +) +from models import ( + AVAILABLE_FLOW_TYPES, + AVAILABLE_DURATION_DISCRIMINATOR_TYPES, + SynthesizerTrn, + MultiPeriodDiscriminator, + DurationDiscriminatorV1, + DurationDiscriminatorV2 +) +from mel_processing import ( + mel_spectrogram_torch, + spec_to_mel_torch +) +from commons import ( + slice_segments, + clip_grad_value_ +) +from losses import ( + generator_loss, + discriminator_loss, + feature_loss, + kl_loss +) +from utils import ( + plot_spectrogram_to_numpy, + summarize, + plot_alignment_to_numpy, + save_checkpoint, + get_logger, + #check_git_hash, + load_checkpoint, + remove_old_checkpoints, + latest_checkpoint_path, + get_hparams +) +#from text import symbols +from text.symbols import symbols +from preprocess import args as preprocess_args + + +parser = argparse.ArgumentParser() +parser.add_argument("--Num_Workers", type = int, default = 4) +parser.add_argument("--Use_PretrainedModels", type = bool, default = True) +parser.add_argument("--Model_Path_Pretrained_G", type = Optional[str], default = None) +parser.add_argument("--Model_Path_Pretrained_D", type = Optional[str], default = None) +parser.add_argument("--Keep_Original_Speakers", type = bool, default = preprocess_args.Keep_Original_Speakers) +parser.add_argument("--Output_Root", type = str, default = preprocess_args.Output_Root) +parser.add_argument("--Output_Dir_Name", type = str, default = preprocess_args.Output_Dir_Name) +parser.add_argument("--Output_Config_Name", type = str, default = preprocess_args.Output_Config_Name) +parser.add_argument("--Output_LogDir", type = str, default = "./") +args = parser.parse_args() + +Num_Workers = int(os.environ.get('Num_Workers', str(args.Num_Workers))) +Use_PretrainedModels = eval(os.environ.get('Use_PretrainedModels', str(args.Use_PretrainedModels))) +Model_Path_Pretrained_G = str(os.environ.get('Model_Path_Pretrained_G', str(args.Model_Path_Pretrained_G))) if Use_PretrainedModels else None +Model_Path_Pretrained_D = str(os.environ.get('Model_Path_Pretrained_D', str(args.Model_Path_Pretrained_D))) if Use_PretrainedModels else None +Keep_Original_Speakers = eval(os.environ.get('Keep_Original_Speakers', str(args.Keep_Original_Speakers))) +Output_Root = str(os.environ.get('Output_Root', str(args.Output_Root))) +Output_Dir_Name = str(os.environ.get('Output_Dir_Name', str(args.Output_Dir_Name))) +Output_Config_Name = str(os.environ.get('Output_Config_Name', str(args.Output_Config_Name))) +Log_Dir = str(os.environ.get('Output_LogDir', str(args.Output_LogDir))) + +Dir_Output = Path(Output_Root).joinpath(Output_Dir_Name).as_posix() +Config_Path = Path(Dir_Output).joinpath(Output_Config_Name).__str__() + +global_step = 0 + + +def evaluate(hps, generator, eval_loader, writer_eval): + generator.eval() + with torch.no_grad(): + for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(eval_loader): + x, x_lengths = x.cuda(0), x_lengths.cuda(0) + spec, spec_lengths = spec.cuda(0), spec_lengths.cuda(0) + y, y_lengths = y.cuda(0), y_lengths.cuda(0) + speakers = speakers.cuda(0) + + # remove else + x = x[:1] + x_lengths = x_lengths[:1] + spec = spec[:1] + spec_lengths = spec_lengths[:1] + y = y[:1] + y_lengths = y_lengths[:1] + 
speakers = speakers[:1] + break + y_hat, attn, mask, *_ = generator.module.infer(x, x_lengths, speakers, max_len=1000) + y_hat_lengths = mask.sum([1, 2]).long() * hps.data.hop_length + + mel = spec_to_mel_torch( + spec, + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.mel_fmin, + hps.data.mel_fmax + ) if not (hps.model.use_mel_posterior_encoder or hps.data.use_mel_posterior_encoder) else spec + y_hat_mel = mel_spectrogram_torch( + y_hat.squeeze(1).float(), + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + hps.data.mel_fmin, + hps.data.mel_fmax + ) + image_dict = {"gen/mel": plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy())} + audio_dict = {"gen/audio": y_hat[0, :, :y_hat_lengths[0]]} + if global_step == 0: + image_dict.update({"gt/mel": plot_spectrogram_to_numpy(mel[0].cpu().numpy())}) + audio_dict.update({"gt/audio": y[0, :, :y_lengths[0]]}) + + summarize( + writer=writer_eval, + global_step=global_step, + images=image_dict, + audios=audio_dict, + audio_sampling_rate=hps.data.sampling_rate + ) + generator.train() + + +def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers): + net_g, net_d, net_dur_disc = nets + optim_g, optim_d, optim_dur_disc = optims + scheduler_g, scheduler_d, scheduler_dur_disc = schedulers + train_loader, eval_loader = loaders + if writers is not None: + writer, writer_eval = writers + + train_loader.batch_sampler.set_epoch(epoch) + global global_step + + net_g.train() + net_d.train() + net_dur_disc.train() if net_dur_disc is not None else None + + if rank == 0: + loader = tqdm(train_loader, desc='Loading train data') + else: + loader = train_loader + + for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers) in enumerate(loader): + if net_g.module.use_noise_scaled_mas: + current_mas_noise_scale = net_g.module.mas_noise_scale_initial - net_g.module.noise_scale_delta * global_step + net_g.module.current_mas_noise_scale = max(current_mas_noise_scale, 0.0) + x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True) + spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True) + y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True) + speakers = speakers.cuda(rank, non_blocking=True) + + with autocast(enabled=hps.train.fp16_run): + y_hat, l_length, attn, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q), (hidden_x, logw, logw_) = net_g(x, x_lengths, spec, spec_lengths, speakers) + + mel = spec_to_mel_torch( + spec.float(), + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.mel_fmin, + hps.data.mel_fmax + ) if not (hps.model.use_mel_posterior_encoder or hps.data.use_mel_posterior_encoder) else spec + y_mel = slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length) + y_hat_mel = mel_spectrogram_torch( + y_hat.squeeze(1), + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + hps.data.mel_fmin, + hps.data.mel_fmax + ) + y = slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice + + # Discriminator + y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach()) + with autocast(enabled=False): + loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g) + loss_disc_all = loss_disc + + # Duration Discriminator 
+ if net_dur_disc is not None: + y_dur_hat_r, y_dur_hat_g = net_dur_disc(hidden_x.detach(), x_mask.detach(), logw_.detach(), logw.detach()) + with autocast(enabled=False): + # TODO: I think need to mean using the mask, but for now, just mean all + loss_dur_disc, losses_dur_disc_r, losses_dur_disc_g = discriminator_loss(y_dur_hat_r, y_dur_hat_g) + loss_dur_disc_all = loss_dur_disc + optim_dur_disc.zero_grad() + scaler.scale(loss_dur_disc_all).backward() + scaler.unscale_(optim_dur_disc) + grad_norm_dur_disc = clip_grad_value_(net_dur_disc.parameters(), None) + scaler.step(optim_dur_disc) + + optim_d.zero_grad() + scaler.scale(loss_disc_all).backward() + scaler.unscale_(optim_d) + grad_norm_d = clip_grad_value_(net_d.parameters(), None) + scaler.step(optim_d) + + with autocast(enabled=hps.train.fp16_run): + # Generator + y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat) + if net_dur_disc is not None: + y_dur_hat_r, y_dur_hat_g = net_dur_disc(hidden_x, x_mask, logw_, logw) + with autocast(enabled=False): + loss_dur = torch.sum(l_length.float()) + loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel + loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl + + loss_fm = feature_loss(fmap_r, fmap_g) + loss_gen, losses_gen = generator_loss(y_d_hat_g) + loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl + if net_dur_disc is not None: + loss_dur_gen, losses_dur_gen = generator_loss(y_dur_hat_g) + loss_gen_all += loss_dur_gen + + optim_g.zero_grad() + scaler.scale(loss_gen_all).backward() + scaler.unscale_(optim_g) + grad_norm_g = clip_grad_value_(net_g.parameters(), None) + scaler.step(optim_g) + scaler.update() + + if rank == 0: + if global_step % hps.train.log_interval == 0: + lr = optim_g.param_groups[0]['lr'] + losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl] + logger.info('Train Epoch: {} [{:.0f}%]'.format(epoch, 100. 
* batch_idx / len(train_loader))) + logger.info([x.item() for x in losses] + [global_step, lr]) + + scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr, "grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g} + scalar_dict.update({"loss/dur_disc/total": loss_dur_disc_all, "grad_norm_dur_disc": grad_norm_dur_disc}) if net_dur_disc is not None else None + scalar_dict.update({"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl}) + + scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) + scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) + scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) + + # if net_dur_disc is not None: + # scalar_dict.update({"loss/dur_disc_r" : f"{losses_dur_disc_r}"}) + # scalar_dict.update({"loss/dur_disc_g" : f"{losses_dur_disc_g}"}) + # scalar_dict.update({"loss/dur_gen" : f"{loss_dur_gen}"}) + + image_dict = { + "slice/mel_org": plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()), + "slice/mel_gen": plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()), + "all/mel": plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()), + "all/attn": plot_alignment_to_numpy(attn[0,0].data.cpu().numpy()) + } + summarize( + writer=writer, + global_step=global_step, + images=image_dict, + scalars=scalar_dict) + + if global_step % hps.train.eval_interval == 0: + evaluate(hps, net_g, eval_loader, writer_eval) + save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch, Path(hps.model_dir).joinpath("G_{}.pth".format(global_step)).__str__()) + save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch, Path(hps.model_dir).joinpath("D_{}.pth".format(global_step)).__str__()) + save_checkpoint(net_dur_disc, optim_dur_disc, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "DUR_{}.pth".format(global_step))) if net_dur_disc is not None else None + + remove_old_checkpoints(hps.model_dir, prefixes=["G_*.pth", "D_*.pth", "DUR_*.pth"]) + global_step += 1 + + if rank == 0: + logger.info('====> Epoch: {}'.format(epoch)) + + +def run(rank, n_gpus, hps): + global global_step + net_dur_disc = None + if rank == 0: + logger = get_logger(hps.model_dir) + #logger.info(hps) + #check_git_hash(hps.model_dir) + writer = SummaryWriter(log_dir = Log_Dir) + writer_eval = SummaryWriter(log_dir = Path(Log_Dir).joinpath("eval").__str__()) + + dist.init_process_group( + backend = 'gloo' if platform.system() == 'Windows' else 'nccl', # Windows does not support the NCCL backend, so GLOO is used instead + init_method = 'env://', + world_size = n_gpus, + rank = rank + ) + + torch.manual_seed(hps.train.seed) + torch.cuda.set_device(rank) + + if "use_mel_posterior_encoder" in hps.model.keys() and hps.model.use_mel_posterior_encoder == True: + print("Using mel posterior encoder for VITS2") + posterior_channels = 80 # vits2 + hps.data.use_mel_posterior_encoder = True + else: + print("Using lin posterior encoder for VITS1") + posterior_channels = hps.data.filter_length // 2 + 1 + hps.data.use_mel_posterior_encoder = False + + train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data) + train_sampler = DistributedBucketSampler( + train_dataset, + hps.train.batch_size, + [32,300,400,500,600,700,800,900,1000], + num_replicas=n_gpus, + rank=rank, + shuffle=True + ) + collate_fn = TextAudioSpeakerCollate() + train_loader = DataLoader( + train_dataset, + num_workers=Num_Workers, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + 
batch_sampler=train_sampler + ) + if rank == 0: + eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data) + eval_loader = DataLoader( + eval_dataset, + num_workers=0, + shuffle=False, + batch_size=hps.train.batch_size, + pin_memory=True, + drop_last=False, + collate_fn=collate_fn + ) + + # some of these flags are not being used in the code and directly set in hps json file. + # they are kept here for reference and prototyping. + if "use_transformer_flows" in hps.model.keys() and hps.model.use_transformer_flows == True: + use_transformer_flows = True + transformer_flow_type = hps.model.transformer_flow_type + print(f"Using transformer flows {transformer_flow_type} for VITS2") + assert transformer_flow_type in AVAILABLE_FLOW_TYPES, f"transformer_flow_type must be one of {AVAILABLE_FLOW_TYPES}" + else: + print("Using normal flows for VITS1") + use_transformer_flows = False + + if "use_spk_conditioned_encoder" in hps.model.keys() and hps.model.use_spk_conditioned_encoder == True: + if hps.data.n_speakers == 0: + raise ValueError("n_speakers must be > 0 when using spk conditioned encoder to train multi-speaker model") + use_spk_conditioned_encoder = True + else: + print("Using normal encoder for VITS1") + use_spk_conditioned_encoder = False + + if "use_noise_scaled_mas" in hps.model.keys() and hps.model.use_noise_scaled_mas == True: + print("Using noise scaled MAS for VITS2") + use_noise_scaled_mas = True + mas_noise_scale_initial = 0.01 + noise_scale_delta = 2e-6 + else: + print("Using normal MAS for VITS1") + use_noise_scaled_mas = False + mas_noise_scale_initial = 0.0 + noise_scale_delta = 0.0 + + # Initialize VITS models and move to GPU + net_g = SynthesizerTrn( + len(symbols), + posterior_channels, + hps.train.segment_size // hps.data.hop_length, + n_speakers=hps.data.n_speakers, + mas_noise_scale_initial=mas_noise_scale_initial, + noise_scale_delta=noise_scale_delta, + **hps.model + ).cuda(rank) + net_d = MultiPeriodDiscriminator( + hps.model.use_spectral_norm + ).cuda(rank) + if "use_duration_discriminator" in hps.model.keys() and hps.model.use_duration_discriminator == True: + use_duration_discriminator = True + # add duration discriminator type here + duration_discriminator_type = getattr(hps.model, "duration_discriminator_type", "dur_disc_1") + print(f"Using duration_discriminator {duration_discriminator_type} for VITS2") + assert duration_discriminator_type in AVAILABLE_DURATION_DISCRIMINATOR_TYPES, f"duration_discriminator_type must be one of {AVAILABLE_DURATION_DISCRIMINATOR_TYPES}" + # duration_discriminator_type = AVAILABLE_DURATION_DISCRIMINATOR_TYPES  # disabled: this assignment would overwrite the selected type with the whole list, so no duration discriminator would ever be built + if duration_discriminator_type == "dur_disc_1": + net_dur_disc = DurationDiscriminatorV1( + hps.model.hidden_channels, + hps.model.hidden_channels, + 3, + 0.1, + gin_channels=hps.model.gin_channels if hps.data.n_speakers != 0 else 0, + ).cuda(rank) + elif duration_discriminator_type == "dur_disc_2": + net_dur_disc = DurationDiscriminatorV2( + hps.model.hidden_channels, + hps.model.hidden_channels, + 3, + 0.1, + gin_channels=hps.model.gin_channels if hps.data.n_speakers != 0 else 0, + ).cuda(rank) + else: + print("NOT using any duration discriminator like VITS1") + use_duration_discriminator = False + net_dur_disc = None + + # Build optimizers for the initialized VITS models + optim_g = torch.optim.AdamW( + filter(lambda net_g_params: net_g_params.requires_grad, net_g.parameters()), # Filter out params which don't require gradient + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps + ) + 
optim_d = torch.optim.AdamW( + net_d.parameters(), + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps + ) + optim_dur_disc = torch.optim.AdamW( + net_dur_disc.parameters(), + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps + ) if net_dur_disc is not None else None + + # Build DDP models for the initialized VITS models + net_g = DDP(net_g, device_ids = [rank], find_unused_parameters = True) + net_d = DDP(net_d, device_ids = [rank], find_unused_parameters = False) + net_dur_disc = DDP(net_dur_disc, device_ids=[rank]) if net_dur_disc is not None else None + + # Load state dict from checkpoint for the initialized VITS models and get the optimizer, learning rate and iteration + try: + _, optim_g, lr_g, epoch_str = load_checkpoint( + Model_Path_Pretrained_G if Use_PretrainedModels else latest_checkpoint_path(hps.model_dir, "G_*.pth"), + net_g, + optim_g, + Keep_Original_Speakers if Use_PretrainedModels else True + ) + _, optim_d, lr_d, epoch_str = load_checkpoint( + Model_Path_Pretrained_D if Use_PretrainedModels else latest_checkpoint_path(hps.model_dir, "D_*.pth"), + net_d, + optim_d, + Keep_Original_Speakers if Use_PretrainedModels else True + ) + _, _, _, epoch_str = load_checkpoint( + latest_checkpoint_path(hps.model_dir, "DUR_*.pth"), + net_dur_disc, + optim_dur_disc + ) if net_dur_disc is not None else (_, _, _, epoch_str) + + # To prevent KeyError: "param 'initial_lr' is not specified in param_groups[0] when resuming an optimizer" + if optim_g.param_groups[0].get('initial_lr') is None: + optim_g.param_groups[0]['initial_lr'] = lr_g + if optim_d.param_groups[0].get('initial_lr') is None: + optim_d.param_groups[0]['initial_lr'] = lr_d + + global_step = (epoch_str - 1) * len(train_loader) # > 0 + print(f"Continue from step {global_step}") + + except Exception as e: + epoch_str = 1 + global_step = 0 + print(f"Got Exception: {e}. Start from step 0") + + # Build learning rate schedulers for optimizers + scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma = hps.train.lr_decay, last_epoch = epoch_str - 2) + scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma = hps.train.lr_decay, last_epoch = epoch_str - 2) + scheduler_dur_disc = torch.optim.lr_scheduler.ExponentialLR(optim_dur_disc, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2) if net_dur_disc is not None else None + + # Build gradient scaler + scaler = GradScaler(enabled = hps.train.fp16_run) + + # Start training (and evaluating) + for epoch in range(epoch_str, hps.train.epochs + 1): + if rank == 0: + train_and_evaluate( + rank, epoch, hps, [net_g, net_d, net_dur_disc], [optim_g, optim_d, optim_dur_disc], [scheduler_g, scheduler_d, scheduler_dur_disc], scaler, + [train_loader, eval_loader], logger, [writer, writer_eval] + ) + else: + train_and_evaluate( + rank, epoch, hps, [net_g, net_d, net_dur_disc], [optim_g, optim_d, optim_dur_disc], [scheduler_g, scheduler_d, scheduler_dur_disc], scaler, + [train_loader, None], None, None + ) + scheduler_g.step() + scheduler_d.step() + scheduler_dur_disc.step() if net_dur_disc is not None else None + + +if __name__ == "__main__": + # Assume Single Node Multi GPUs Training Only + assert torch.cuda.is_available(), "CPU training is not allowed." 
+ n_gpus = torch.cuda.device_count() + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '8000' + + hps = get_hparams( + Config_Path = Config_Path, + Model_Dir = Dir_Output + ) + mp.spawn(run, args = (n_gpus, hps,), nprocs = n_gpus) \ No newline at end of file diff --git a/EVT_GUI/EnvConfigurator.py b/EVT_GUI/EnvConfigurator.py index f866d2d..b15f1af 100644 --- a/EVT_GUI/EnvConfigurator.py +++ b/EVT_GUI/EnvConfigurator.py @@ -501,7 +501,10 @@ def Check_Pytorch(self, Package: str): def Install_Pytorch(self, Package: str, Reinstall: bool): DisplayCommand = 'cmd /c start cmd /k ' if platform.system() == 'Windows' else 'x-terminal-emulator -e ' if Package in ('torch', 'torchvision', 'torchaudio'): - pynvml.nvmlInit() + try: + pynvml.nvmlInit() + except: + raise Exception("Failed to get NVIDIA GPUs' info.") CudaList = [117, 118, 121] CudaVersion = min(CudaList, key = lambda Cuda: abs(Cuda - pynvml.nvmlSystemGetCudaDriverVersion()//100)) MirrorList = [f'https://download.pytorch.org/whl/cu{CudaVersion}', ''] diff --git a/README.md b/README.md index 14ddc0c..bdce07e 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Functions that are currently included in the toolkit are as follows: - [Voice Recognition](/docs/EN/Voice-Recognizer.md) -- [Voice Transcribing](/docs/EN/Voice-Transcriber.md) +- [Voice Transcription](/docs/EN/Voice-Transcriber.md) - [Dataset Creating (SRT Converting & WAV Splitting)](/docs/EN/Dataset-Creator.md) @@ -152,7 +152,7 @@ Please make sure that you've installed [Python](https://www.python.org/downloads - Install pytorch (Command can be get from the [official site](https://pytorch.org/get-started/locally/)) ```shell - # e.g. (注意自己的cuda版本,这里以11.8为例) + # e.g. (Mind your cuda version,here we take 11.8 as an example) pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118 ``` diff --git a/Run.py b/Run.py index 1c25437..5a42c6d 100644 --- a/Run.py +++ b/Run.py @@ -20,7 +20,7 @@ ############################################################################################################################## # Set current version -CurrentVersion = "v1.1.3" +CurrentVersion = "v1.1.4" ############################################################################################################################## @@ -391,9 +391,8 @@ def Execute(self, Params: tuple): Args = [ f'cd "{ResourceDir}"', 'python -c "' - 'from EVT_Core.Train.VITS.Train import Voice_Training; ' - f"PreprocessandTrain = Voice_Training{str(Params)}; " - 'PreprocessandTrain.Preprocessing_and_Training()"' + 'from EVT_Core.Train.VITS.Train import Train; ' + f'Train{str(Params)}"' ] ) Output, Error = CMD.monitor( @@ -492,9 +491,8 @@ def Execute(self, Params: tuple): Args = [ f'cd "{ResourceDir}"', 'python -c "' - 'from EVT_Core.TTS.VITS.Convert import Voice_Converting; ' - f"TTS = Voice_Converting{str(ItemReplacer(LANGUAGES, Params))}; " - 'TTS.Converting()"' + 'from EVT_Core.TTS.VITS.Convert import Convert; ' + f'Convert{str(ItemReplacer(LANGUAGES, Params))}"' ] ) Output, Error = CMD.monitor( @@ -3514,6 +3512,7 @@ def SetText_LineEdit_DAT_GPTSoVITS_FileListPath(): QMessageBox.Yes: lambda: ( DATResult_Save( ChildWindow_DAT_GPTSoVITS.ui.Table.GetValue(), + LineEdit_DAT_GPTSoVITS_FileListPath.text() ), ChildWindow_DAT_GPTSoVITS.close() ) @@ -5426,10 +5425,11 @@ def SetText_LineEdit_Train_VITS_OutputDir(): self.ui.LineEdit_Train_VITS_ModelPathPretrainedD, self.ui.LineEdit_Train_VITS_OutputRoot, self.ui.LineEdit_Train_VITS_OutputDirName, + 'Config.json', 
self.ui.LineEdit_Train_VITS_LogDir ], EmptyAllowed = [ - DialogBox_KeepOriginalSpeakers.LineEdit, + self.ui.LineEdit_Train_VITS_ConfigPathLoad, self.ui.LineEdit_Train_VITS_ModelPathPretrainedG, self.ui.LineEdit_Train_VITS_ModelPathPretrainedD ],