-
Notifications
You must be signed in to change notification settings - Fork 1
/
make_spect.py
96 lines (75 loc) · 3.04 KB
/
make_spect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import pickle
import numpy as np
import soundfile as sf
from scipy import signal
from scipy.signal import get_window
from librosa.filters import mel
from librosa.core import load
import matplotlib.pyplot as plt
from numpy.random import RandomState
import argparse
import tqdm
def butter_highpass(cutoff, fs, order=5):
    """Design a digital Butterworth high-pass filter.

    Args:
        cutoff: Cutoff frequency, in the same units as ``fs``.
        fs: Sampling frequency of the signal the filter will be applied to.
        order: Order of the Butterworth filter.

    Returns:
        Tuple ``(b, a)`` of numerator and denominator coefficients as
        returned by ``scipy.signal.butter``.

    Raises:
        ValueError: If ``fs`` is not positive, or ``cutoff`` does not lie
            strictly between 0 and the Nyquist frequency ``fs / 2``.
    """
    if fs <= 0:
        raise ValueError(f"fs must be positive, got {fs!r}")
    nyq = 0.5 * fs
    if not 0 < cutoff < nyq:
        raise ValueError(
            f"cutoff must lie strictly between 0 and fs/2 ({nyq!r}), "
            f"got {cutoff!r}"
        )
    # scipy expects the critical frequency normalized to the Nyquist rate.
    normal_cutoff = cutoff / nyq
    b, a = signal.butter(order, normal_cutoff, btype='high', analog=False)
    return b, a
def pySTFT(x, fft_length=1024, hop_length=256):
    """Compute the magnitude short-time Fourier transform of a signal.

    The input is reflect-padded by ``fft_length // 2`` samples on each side,
    cut into overlapping frames of ``fft_length`` samples taken every
    ``hop_length`` samples, multiplied by a periodic Hann window and
    transformed with a real FFT.

    Args:
        x: Input samples as a NumPy array; framing is done along the last
            axis.
        fft_length: Frame length and FFT size.
        hop_length: Number of samples between the starts of adjacent frames.

    Returns:
        Array of FFT magnitudes. For 1-D input the shape is
        ``(fft_length // 2 + 1, n_frames)``.
    """
    x = np.pad(x, int(fft_length // 2), mode='reflect')

    noverlap = fft_length - hop_length
    shape = x.shape[:-1] + ((x.shape[-1] - noverlap) // hop_length, fft_length)
    strides = x.strides[:-1] + (hop_length * x.strides[-1], x.strides[-1])
    # The frames overlap in memory, so expose them as a read-only view to
    # rule out accidental writes that would corrupt neighbouring frames.
    frames = np.lib.stride_tricks.as_strided(
        x, shape=shape, strides=strides, writeable=False
    )

    fft_window = get_window('hann', fft_length, fftbins=True)
    spectrum = np.fft.rfft(fft_window * frames, n=fft_length).T

    return np.abs(spectrum)
def to_spec(wav_path, target_path, a, b, mel_basis, min_level):
    """Convert one audio file into a normalized mel-spectrogram ``.npy`` file.

    Args:
        wav_path: Path of the audio file to read.
        target_path: Destination handed to ``np.save`` (which appends
            ``.npy`` when the name does not already end with it).
        a: Denominator coefficients of the filter applied with ``filtfilt``.
        b: Numerator coefficients of the filter applied with ``filtfilt``.
        mel_basis: Matrix applied as ``np.dot(D, mel_basis)`` to the
            magnitude spectrogram ``D`` (frames along the first axis).
        min_level: Lower bound applied to the mel magnitudes before taking
            the logarithm.

    Raises:
        ValueError: If the audio is too short for ``filtfilt``'s default
            edge padding.
    """
    # Fixed seed: every file receives the same deterministic dither.
    prng = RandomState(1)

    # Read audio file, downmixed to mono and resampled to 16 kHz
    x, _ = load(wav_path, mono=True, sr=16000)

    # filtfilt pads each edge with 3 * max(len(a), len(b)) samples by default
    # and rejects shorter inputs; fail early with the offending path instead.
    min_samples = 3 * max(len(a), len(b))
    if x.shape[0] <= min_samples:
        raise ValueError(
            f"{wav_path}: audio has {x.shape[0]} samples, "
            f"need more than {min_samples} for filtering"
        )

    # Remove drifting noise
    y = signal.filtfilt(b, a, x)

    # Add a little random noise for model robustness
    wav = y * 0.96 + (prng.rand(y.shape[0]) - 0.5) * 1e-06

    # Compute spect
    D = pySTFT(wav).T

    # Convert to mel and normalize: dB scale, then map [-100, 0] dB to [0, 1]
    D_mel = np.dot(D, mel_basis)
    D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
    S = np.clip((D_db + 100) / 100, 0, 1)

    # save spect
    np.save(target_path, S.astype(np.float32), allow_pickle=False)
def make_spec(datasetDir="training_set"):
    """Generate mel-spectrograms for every audio file of a dataset.

    Audio is read from ``<datasetDir>/wavs/<speaker>/...`` and one ``.npy``
    file per audio file is written to ``<datasetDir>/spmel/<speaker>/``. The
    output name is the name of the directory that directly contains the
    audio file (empty for files located directly in the speaker directory)
    followed by the audio file name without its extension.

    Args:
        datasetDir: Dataset root containing the ``wavs`` directory.
    """
    # Keyword arguments work on old librosa releases and are mandatory on
    # releases where ``mel`` no longer accepts positional arguments.
    mel_basis = mel(sr=16000, n_fft=1024, fmin=90, fmax=7600, n_mels=80).T
    # Equals 10 ** (-100 / 20): the magnitude corresponding to -100 dB.
    min_level = np.exp(-100 / 20 * np.log(10))
    b, a = butter_highpass(30, 16000, order=5)

    # audio file directory
    rootDir = datasetDir + '/wavs'
    # spectrogram directory
    targetDir = datasetDir + '/spmel'
    os.makedirs(targetDir, exist_ok=True)

    speakers = os.listdir(rootDir)
    print('Processing speakers :')
    for speaker in tqdm.tqdm(speakers):
        rootDirName = f"{rootDir}/{speaker}/"
        # Stray files next to the speaker folders are not speakers.
        if not os.path.isdir(rootDirName):
            continue
        targetDirName = f"{targetDir}/{speaker}/"
        os.makedirs(targetDirName, exist_ok=True)

        for dirName, _subdirs, files in os.walk(rootDirName):
            # Last path component; empty for the speaker directory itself
            # because rootDirName ends with a separator.
            subfolder = os.path.basename(dirName)
            for fileName in files:
                # splitext handles extensions of any length (.wav, .flac, ...)
                stem = os.path.splitext(fileName)[0]
                to_spec(
                    os.path.join(dirName, fileName),
                    os.path.join(targetDirName, subfolder + stem),
                    a, b, mel_basis, min_level,
                )
if __name__ == '__main__':
    # Command-line entry point: the only option is the dataset root
    # directory, which is forwarded to make_spec.
    cli = argparse.ArgumentParser()
    cli.add_argument('--dataset', type=str, default="voxceleb", help='dataset dir')
    make_spec(cli.parse_args().dataset)