Client improvement & GPT-SoVITS support
1. [Core] Add GPT-SoVITS model training & inference
2. [Core] Simplify DatasetCreator to make it more readable and to support GPT-SoVITS
3. [Core] Split the 'OutputDir' param into 'OutputRoot' & 'OutputDirName' (see the sketch just after this list)
4. [GUI] Simplify all tools' output param options and allow users to manage the output root directory on the settings page
5. [GUI] Fix the blank border issue (on Windows 10) for WindowBase
6. [GUI] Add a rect-monitoring signal and mask effect for WindowBase and LineEditBase
7. [GUI] Support loading dict-type info from the manifest and appending local files' folder names for model management
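
For illustration only (not part of the diff), the two new params from item 3 are joined into the final output directory roughly as follows; the values shown are placeholders:

from pathlib import Path

Output_Root = "./Output"          # root directory, managed globally on the settings page
Output_DirName = "MyTask_Result"  # per-task folder name
Output_Dir = Path(Output_Root).joinpath(Output_DirName).as_posix()
# -> "Output/MyTask_Result"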
Spr-Aachen committed Mar 29, 2024
1 parent 9538c54 commit c7bb295
Showing 47 changed files with 17,103 additions and 7,554 deletions.
19 changes: 18 additions & 1 deletion .gitignore
@@ -11,4 +11,21 @@
**/*venv/
**/*.ico
**/*MANIFEST.in
**/*Setup.py
**/*Setup.py

#
**/uvr5/lib/
**/GPT_SoVITS/config.py
**/GPT_SoVITS/GPT_SoVITS/AR/
**/GPT_SoVITS/GPT_SoVITS/configs/
**/GPT_SoVITS/GPT_SoVITS/module/
**/GPT_SoVITS/GPT_SoVITS/pretrained_models/
**/GPT_SoVITS/GPT_SoVITS/text/
**/GPT_SoVITS/GPT_SoVITS/tools/i18n/locale/
**/GPT_SoVITS/GPT_SoVITS/inference_webui.py
**/GPT_SoVITS/GPT_SoVITS/my_utils.py
**/GPT_SoVITS/GPT_SoVITS/onnx_export.py
**/GPT_SoVITS/GPT_SoVITS/process_ckpt.py
**/GPT_SoVITS/GPT_SoVITS/s1_train.py
**/GPT_SoVITS/GPT_SoVITS/s2_train.py
**/GPT_SoVITS/GPT_SoVITS/utils.py
9 changes: 6 additions & 3 deletions EVT_Core/ASR/VPR/Identify.py
@@ -17,23 +17,26 @@ class Voice_Identifying:
def __init__(self,
StdAudioSpeaker: dict,
Audio_Dir_Input: str,
AudioSpeakersData_Path: str = './AudioSpeakerData.txt',
Model_Path: str = './Models/.pth',
Model_Type: str = 'Ecapa-Tdnn',
Feature_Method: str = 'melspectrogram',
DecisionThreshold: float = 0.60,
Duration_of_Audio: float = 4.20
Duration_of_Audio: float = 4.20,
Output_Root: str = "./",
Output_DirName: str = "",
AudioSpeakersData_Name: str = "AudioSpeakerData"
):
self.StdAudioSpeaker = StdAudioSpeaker
self.Audio_Dir_Input = Audio_Dir_Input
self.AudioSpeakersData_Path = AudioSpeakersData_Path
self.Model_Path = Model_Path
self.Model_Dir = Path(Model_Path).parent.__str__()
self.Model_Name = Path(Model_Path).stem.__str__()
self.Model_Type = Model_Type
self.Feature_Method = Feature_Method
self.DecisionThreshold = DecisionThreshold
self.Duration_of_Audio = Duration_of_Audio
self.Output_Dir = Path(Output_Root).joinpath(Output_DirName).as_posix()
self.AudioSpeakersData_Path = Path(self.Output_Dir).joinpath(AudioSpeakersData_Name).as_posix() + ".txt"

os.makedirs(os.path.dirname(self.AudioSpeakersData_Path), exist_ok = True)

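A minimal usage sketch of the updated signature, assuming the remaining parameters keep their defaults; the import path, model path, input dir and speaker dict below are placeholders:

from EVT_Core.ASR.VPR.Identify import Voice_Identifying  # import path assumed from the repo layout

Identifier = Voice_Identifying(
    StdAudioSpeaker = {'Speaker_A': './Std/Speaker_A.wav'},  # hypothetical mapping
    Audio_Dir_Input = './Audio_To_Identify',
    Model_Path = './Models/Ecapa-Tdnn.pth',
    Output_Root = './Output',
    Output_DirName = 'VPR_Result'
)
# AudioSpeakersData_Path is now derived internally as Output/VPR_Result/AudioSpeakerData.txt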
135 changes: 135 additions & 0 deletions EVT_Core/Dataset/GPT_SoVITS/Create.py
@@ -0,0 +1,135 @@
import os, sys, shutil
from typing import Union, Optional
from glob import glob
from pathlib import Path

from .utils.Creating_Directories import create_directories
from .utils.Convert_SRT_to_CSV import change_encoding, convert_srt_to_csv
from .utils.Split_Audio import split_files
from .utils.Create_DS_CSV import create_DS_csv
from .utils.Merge_CSV import merge_csv
from .utils.Merge_Transcripts_and_Files import merge_transcripts_and_wav_files
from .utils.Clean import clean_unwanted_characters
from .utils.Create_Dataset_Loading_Script import Transcript_Writer


class Dataset_Creating:
'''
1. Convert SRT to CSV
2. Reorganize CSV content
3. Split and downsample WAV
'''
def __init__(self,
SRT_Dir: str,
AudioSpeakersData_Path: str,
#WAV_Time_Limitation: float = 10.00,
DataFormat: str = 'PATH|NAME|LANG|TEXT',
#Add_AuxiliaryData: bool = False,
#AuxiliaryData_Path: str = './AuxiliaryData/AuxiliaryData.txt',
Output_Root: str = "./",
Output_DirName: str = "",
FileList_Name: str = 'FileList'
):
self.SRT_Dir = SRT_Dir
def Get_WAV_Paths_Input():
WAV_Paths_Input = []
if Path(AudioSpeakersData_Path).is_dir():
for SubPath in glob(Path(AudioSpeakersData_Path).joinpath('**', '*.wav').__str__(), recursive = True):
Audio = Path(SubPath).as_posix()
WAV_Paths_Input.append(Audio)
if Path(AudioSpeakersData_Path).is_file():
with open(file = AudioSpeakersData_Path, mode = 'r', encoding = 'utf-8') as AudioSpeakersData:
AudioSpeakerLines = AudioSpeakersData.readlines()
for AudioSpeakerLine in AudioSpeakerLines:
Audio = AudioSpeakerLine.split('|')[0]
WAV_Paths_Input.append(Audio)
return WAV_Paths_Input
self.WAV_Paths_Input = Get_WAV_Paths_Input()
self.WAV_Dir_Split = Path(Output_Root).joinpath(Output_DirName).as_posix()
def Get_AudioSpeakers():
AudioSpeakers = {}
if Path(AudioSpeakersData_Path).is_dir():
for SubPath in glob(Path(AudioSpeakersData_Path).joinpath('**', '*.wav').__str__(), recursive = True):
Audio = Path(self.WAV_Dir_Split).joinpath(Path(SubPath).name).as_posix()
Speaker = Path(SubPath).parent.name
AudioSpeakers[Audio] = Speaker
if Path(AudioSpeakersData_Path).is_file():
with open(file = AudioSpeakersData_Path, mode = 'r', encoding = 'utf-8') as AudioSpeakersData:
AudioSpeakerLines = AudioSpeakersData.readlines()
for AudioSpeakerLine in AudioSpeakerLines:
Audio = Path(self.WAV_Dir_Split).joinpath(Path(AudioSpeakerLine.split('|')[0]).name).as_posix()
Speaker = AudioSpeakerLine.split('|')[1].strip()
AudioSpeakers[Audio] = Speaker
return AudioSpeakers
self.AudioSpeakers = Get_AudioSpeakers()
#self.WAV_Time_Limitation = WAV_Time_Limitation
self.DataFormat = DataFormat.replace('路径', 'PATH').replace('人名', 'NAME').replace('语言', 'LANG').replace('文本', 'TEXT')
self.FileList_Path = Path(self.WAV_Dir_Split).joinpath(FileList_Name).as_posix() + ".txt"

def CallingFunctions(self):
SRT_Counter = len(glob(os.path.join(self.SRT_Dir, '*.srt')))

if SRT_Counter == 0:
print('!!! Please add srt_file(s) to %s-folder' %self.SRT_Dir)
sys.exit()

# Create directories
CSV_Dir_Prepared = './Temp/ready_for_merging'
CSV_Dir_Merged = './Temp/merged_csv'
CSV_Dir_Final = './Temp/final_csv'
create_directories(self.WAV_Dir_Split, CSV_Dir_Prepared, CSV_Dir_Merged, CSV_Dir_Final)

# Changing encoding from utf-8 to utf-8-sig
print('Encoding srt_file(s) to utf-8-sig...')
for SRT in glob(os.path.join(self.SRT_Dir, '*.srt')):
change_encoding(SRT)
print('Encoding of %s-file(s) changed' %SRT_Counter)
print('---------------------------------------------------------------------')

# Extracting information from srt-files to csv
print('Extracting information from srt_file(s) to csv_files')
for File in glob(os.path.join(self.SRT_Dir, '*.srt')):
convert_srt_to_csv(File, CSV_Dir_Prepared)
print('%s-file(s) converted and saved as csv-files to ./csv' %SRT_Counter)
print('---------------------------------------------------------------------')

# Now slice audio according to start- and end-times in csv
print('Slicing audio according to start- and end-times of transcript_csvs...')
split_files(CSV_Dir_Prepared, self.WAV_Paths_Input, self.WAV_Dir_Split)
WAV_Counter = len(glob(os.path.join(self.WAV_Dir_Split, '*.wav')))
print('Slicing complete. {} files in dir {}'.format(WAV_Counter, self.WAV_Dir_Split))
print('---------------------------------------------------------------------')

# Now create a list of filepaths and filesizes for the split-audio output dir
create_DS_csv(self.WAV_Dir_Split, CSV_Dir_Merged)
print('DS_csv with Filepaths - and sizes created.')
print('---------------------------------------------------------------------')

# Now join all separate csv files
merge_csv(CSV_Dir_Prepared, CSV_Dir_Merged)
print('Merged csv with all transcriptions created.')
print('---------------------------------------------------------------------')

# Merge the csv with transcriptions and the file-csv with paths and sizes
CSV_Name_Final = 'DS_training_final.csv'
merge_transcripts_and_wav_files(CSV_Dir_Merged, CSV_Dir_Final, CSV_Name_Final)
print('Final DS csv generated.')
print('---------------------------------------------------------------------')

# Clean the transcripts of unwanted characters
CSV_Path_Final_Cleaned = clean_unwanted_characters(CSV_Dir_Final, CSV_Name_Final)
print('Unwanted characters cleaned.')
print('---------------------------------------------------------------------')

# Write transcript to text-file for model training
Transcript_Writer(self.AudioSpeakers, self.DataFormat, CSV_Path_Final_Cleaned, self.WAV_Dir_Split, self.FileList_Path)
print('Transcript written.')
print('---------------------------------------------------------------------')

# Now remove the created folders
for folders in [CSV_Dir_Prepared, CSV_Dir_Merged, CSV_Dir_Final]:
shutil.rmtree(folders, ignore_errors = True)
print('Temp files removed.')
print('********************************************** FINISHED ************************************************')

print(f'Final processed audio is in {self.WAV_Dir_Split}')
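
A hedged usage sketch of the new dataset creator, assuming the SRT transcripts and the audio/speaker list already exist (the import path and all file paths below are placeholders):

from EVT_Core.Dataset.GPT_SoVITS.Create import Dataset_Creating  # import path assumed from the repo layout

# AudioSpeakersData_Path may be a directory of per-speaker wav folders,
# or a text file whose lines look like: /path/to/clip.wav|SpeakerName
Creator = Dataset_Creating(
    SRT_Dir = './SRT',
    AudioSpeakersData_Path = './AudioSpeakerData.txt',
    DataFormat = 'PATH|NAME|LANG|TEXT',
    Output_Root = './Output',
    Output_DirName = 'GPT-SoVITS_Dataset'
)
Creator.CallingFunctions()
# Produces the split wavs plus FileList.txt under Output/GPT-SoVITS_Dataset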
76 changes: 76 additions & 0 deletions EVT_Core/Dataset/GPT_SoVITS/utils/Clean.py
@@ -0,0 +1,76 @@
import os
import pandas as pd
#from unidecode import unidecode


def Read_CSV(CSV_Path):
try:
return pd.read_csv(CSV_Path, engine = 'python')
except Exception as e:
print(e, type(e))
if (isinstance(e, pd.errors.EmptyDataError)):
pass


def clean_unwanted_characters(CSV_Dir_Final, CSV_Name_Final):
'''
Remove unwanted characters from the transcripts and save the cleaned data to a new csv for the later training steps.
'''
DF_DS_Final = Read_CSV(os.path.join(CSV_Dir_Final, CSV_Name_Final))

# some srt files contain font tags, which are removed here
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('<font color=#91FFFF>', '', regex=True)
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('<font color=#72FD59>', '', regex=True)
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('<font color=#E8E858>', '', regex=True)
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('<font color=#FFFFFF>', '', regex=True)
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('</font>', '', regex=True)

'''
# Characters to be removed
punct = str(['.!"#$%&\'()*+,-/:;<–=>?@[\\]^_°`{}~ ̀ ̆ ̃ ́'])
transtab = str.maketrans(dict.fromkeys(punct, ' '))
'''
DF_DS_Final = DF_DS_Final.dropna()
'''
DF_DS_Final['transcript'] = '£'.join(DF_DS_Final['transcript'].tolist()).translate(transtab).split('£')
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].str.lower()
'''
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('\s+', '', regex = True) # Remove line breaks and other whitespace (replaced with nothing)
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].str.strip()
'''
# Further remove unwanted characters
remove_char = '鄚氏鐷顤鐰鄣酹輐霵鐼羦鄜酲酺酺礫飉舣δφℳˁカᛠᛏˁːɣ\ʿʻʾŋ\ʹªьʺъˀˇʼʔˊˈ!"#$%&\()*+,-./:;<=>?@[]^_`{|}~'
table_2 = str.maketrans('','', remove_char)
DF_DS_Final['transcript'] = [w.translate(table_2) for w in DF_DS_Final['transcript']]
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('ä','ae', regex=True)
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('ö','oe', regex=True)
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('ü','ue', regex=True)
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('α','alpha', regex=True)
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('ə','e', regex=True)
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('ё','e', regex=True)
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('γ','gamma', regex=True)
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('µ','mikro', regex=True)
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('π','pi', regex=True)
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('β','beta', regex=True)
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('ζ','zeta', regex=True)
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].replace('ß','ss', regex=True)
# to get rid of final unwanted characters transform characters to strictly unicode
def to_ASCII(text):
text = unidecode(text)
return text
DF_DS_Final['transcript'] = DF_DS_Final['transcript'].apply(to_ASCII)
'''

# Save cleaned files
CSV_Name_Final_Cleaned = CSV_Name_Final[:-4]
CSV_Path_Final_Cleaned = os.path.join(CSV_Dir_Final, (CSV_Name_Final_Cleaned + '_cleaned.csv'))
DF_DS_Final.to_csv(CSV_Path_Final_Cleaned, header = True, index = False, encoding = 'utf-8') #DF_DS_Final.to_csv('./merged_csv/' + final_path + '_char_removed.csv', header = True, index = False, encoding = 'utf-8-sig')

print('Length of ds_final: {}'.format(len(DF_DS_Final)))
print('Final Files cleaned of unwanted characters')

return CSV_Path_Final_Cleaned
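
For reference, a minimal invocation of the cleaner under the same assumptions as the pipeline above (import path, directory and filename are placeholders):

from EVT_Core.Dataset.GPT_SoVITS.utils.Clean import clean_unwanted_characters  # import path assumed

# Writes DS_training_final_cleaned.csv next to the input csv and returns its path
CSV_Path_Final_Cleaned = clean_unwanted_characters('./Temp/final_csv', 'DS_training_final.csv')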
84 changes: 84 additions & 0 deletions EVT_Core/Dataset/GPT_SoVITS/utils/Convert_SRT_to_CSV.py
@@ -0,0 +1,84 @@
import pandas as pd
import os
import io
import re
import numpy as np


def change_encoding(SRT_Path):
'''
Change encoding from utf-8 to utf-8-sig to keep Umlaute (e.g. ä, ö, ü)
'''
with io.open(SRT_Path, 'r', encoding = 'utf-8') as f:
text = f.read()
# process Unicode text
with io.open(SRT_Path, 'w', encoding = 'utf-8-sig') as f:
f.write(text)


def convert_srt_to_csv(
SRT_Path,
CSV_Dir
):
'''
Extract the start time, end time and subtitle text from the SRT file and store them in a csv. In preparation for audio splitting, an 'id' column is generated from the filename plus a unique number.
'''
with open(SRT_Path, 'r', encoding = 'utf-8-sig') as h:
Sub = h.readlines() #returns list of all lines

Re_Pattern = r'[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3} --> [0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3}'
# Get start times
Times = list(filter(re.compile(Re_Pattern).search, Sub))
Start_Times = [time.split('-->')[0].strip() for time in Times]
End_Times = [time.split('-->')[1].strip() for time in Times]

# Get lines
Lines = [[]]
for Sentence in Sub:
if re.match(Re_Pattern, Sentence):
Lines[-1].pop()
Lines.append([])
else:
Lines[-1].append(Sentence)

Lines = Lines[1:] # all text in lists

Column_Names = ['id', 'start_times', 'end_times', 'transcript']
DF_Text = pd.DataFrame(columns = Column_Names)

DF_Text['start_times'] = Start_Times
DF_Text['end_times'] = End_Times
DF_Text['transcript'] = [" ".join(i).strip() for i in Lines]

DF_Text['id'] = np.arange(len(DF_Text))
ID_Extension = os.path.basename(SRT_Path).replace('.srt', '_')
'''
ID_Extension = ID_Extension.replace(' ', '_')
ID_Extension = ID_Extension.replace('-', '_')
ID_Extension = ID_Extension.replace('.', '_')
ID_Extension = ID_Extension.replace('__', '_')
ID_Extension = ID_Extension.replace('___', '_')
'''
DF_Text['id'] = ID_Extension + DF_Text['id'].map(str)

file_extension = ID_Extension[:-1] # base filename (trailing underscore removed), reused for the output csv name

# converting the times to milliseconds
def convert_to_ms(time):
h_ms = int(time[:2])*3600000
m_ms = int(time[3:5])*60000
s_ms = int(time[6:8])*1000
ms = int(time[9:12])
ms_total = h_ms + m_ms + s_ms + ms
return(ms_total)

def conv_int(start):
new_start = int(start)
return(new_start)

DF_Text['start_times'] = DF_Text['start_times'].apply(convert_to_ms)
DF_Text['start_times'] = DF_Text['start_times'].apply(conv_int)

DF_Text['end_times'] = DF_Text['end_times'].apply(convert_to_ms)

DF_Text.to_csv(os.path.join(CSV_Dir, (file_extension + '.csv')), index = False, header = True, encoding = 'utf-8-sig')
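
A hedged sketch for a single subtitle file (the import path and file paths are placeholders, and the output directory is assumed to already exist); as a worked example of the helper above, the timestamp '00:01:02,500' converts to 0*3600000 + 1*60000 + 2*1000 + 500 = 62500 ms:

from EVT_Core.Dataset.GPT_SoVITS.utils.Convert_SRT_to_CSV import change_encoding, convert_srt_to_csv  # import path assumed

change_encoding('./SRT/Episode_01.srt')                                # re-save as utf-8-sig
convert_srt_to_csv('./SRT/Episode_01.srt', './Temp/ready_for_merging')
# Result: ./Temp/ready_for_merging/Episode_01.csv with columns id, start_times, end_times, transcript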
32 changes: 32 additions & 0 deletions EVT_Core/Dataset/GPT_SoVITS/utils/Create_DS_CSV.py
@@ -0,0 +1,32 @@
import pandas as pd
import os
from glob import glob
import wave
import contextlib


def create_DS_csv(
WAV_Dir_Extract,
CSV_Dir
):
'''
Create a csv with the filepath, filesize and duration of every wav in preparation for the final DS training csv
'''
# extract the filepath, filesize and duration of every audio file in the given directory
print(f'Extracting filepath and -size for every .wav file in {WAV_Dir_Extract}')
Data = pd.DataFrame(columns = ['wav_filename', 'wav_filesize', 'duration'])
DF = pd.DataFrame(columns = ['wav_filename', 'wav_filesize', 'duration'])

for entry in glob(os.path.join(WAV_Dir_Extract, '*.wav')):
filepath = os.path.abspath(entry)
filesize = os.path.getsize(entry)
with contextlib.closing(wave.open(entry, 'rb')) as f:
frames = f.getnframes()
rate = f.getframerate()
duration = frames / float(rate)
DF['wav_filename'] = [filepath]
DF['wav_filesize'] = [filesize]
DF['duration'] = [duration]
Data = pd.concat([Data, DF], ignore_index = True)

Data.to_csv(os.path.join(CSV_Dir, 'Filepath_Filesize.csv'), header = True, index = False, encoding = 'utf-8-sig')
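
A minimal call sketch, assuming both directories already exist and the wav files are plain PCM readable by the wave module (the import path and both paths are placeholders):

from EVT_Core.Dataset.GPT_SoVITS.utils.Create_DS_CSV import create_DS_csv  # import path assumed

# Writes Filepath_Filesize.csv (wav_filename, wav_filesize, duration) into the csv dir
create_DS_csv('./Output/GPT-SoVITS_Dataset', './Temp/merged_csv')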