-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmel_spectrogram.py
74 lines (62 loc) · 2.38 KB
/
mel_spectrogram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""Code to calculate mel spectrograms."""
import logging
import math

import librosa
import numpy as np
import scipy.signal
def calculate_mel_spectrogram(
    audio_path,
    target_fs=64,
    fmin=-4.2735,
    fmax=5444,
    nb_filters=28,
    hop_length=None,
    win_length=None,
):
    """Calculate a compressed mel spectrogram of a stored speech signal.

    The file at ``audio_path`` is read with ``numpy.load`` and must provide
    the keys ``"audio"`` (the waveform) and ``"fs"`` (its sampling rate),
    e.g. an ``.npz`` archive. The waveform is mean-centred, converted to a
    magnitude mel spectrogram with a Hamming window, transposed, and finally
    compressed by raising every value to the power 0.6.

    Parameters
    ----------
    audio_path: str
        Path of the archive holding the ``"audio"`` and ``"fs"`` entries
    target_fs: int
        Sampling frequency of the calculated mel spectrogram. Only used to
        derive ``hop_length`` when that argument is not given
    fmin: Union[float, int]
        Minimum center frequency used in mel filter matrix
    fmax: Union[float, int]
        Maximum center frequency used in mel filter matrix
    nb_filters: int
        Number of mel spectrogram frequency bands
    hop_length: int
        Hop length (in samples) used for calculation of the spectrogram.
        A falsy value (``None`` or ``0``) selects ``fs / target_fs``
    win_length: int
        Window length (in samples) of each frame. A falsy value (``None``
        or ``0``) selects 25 ms worth of samples

    Returns
    -------
    numpy.ndarray
        Compressed mel spectrogram, i.e. the transpose of what
        ``librosa.feature.melspectrogram`` returns (librosa documents its
        output as ``(n_mels, frames)``, so frames should come first here).
    """
    logger = logging.getLogger(__name__)

    # np.load returns an NpzFile for .npz archives, which keeps the file open
    # until it is closed; pull both entries out inside a with-block so the
    # handle is released deterministically.
    with np.load(audio_path) as speech:
        audio, fs = speech["audio"], speech["fs"]
    logger.debug("audio shape: %s, fs: %s", audio.shape, fs)

    if not hop_length:
        # One output frame every 1 / target_fs seconds. Divide directly
        # instead of multiplying by the reciprocal: (1 / target_fs) * fs can
        # land just below an exact integer ratio and int() would then
        # truncate the hop by one sample.
        hop_length = int(fs / target_fs)
    if not win_length:
        win_length = int(0.025 * fs)  # 25 milliseconds
    logger.debug("hop_length: %s, win_length: %s", hop_length, win_length)

    # Smallest power of two that is greater than or equal to win_length.
    n_fft = 2 ** math.ceil(math.log2(win_length))
    logger.debug("n_fft: %s", n_fft)

    # DC removal
    audio = audio - np.mean(audio)

    # NOTE: for the mel spectrograms of the final test data (test set 1 and 2)
    # the raw audio signals were padded with zeros so that the result has a
    # duration of 192 samples (3 sec), matching the training data:
    #     audio = np.pad(audio, (0, 1298), 'constant')
    # 1298 zeros lead to exactly 192 mel-spectrogram samples.

    # The waveform is passed as y=...: librosa >= 0.10 made it keyword-only,
    # and the keyword form is also accepted by older releases.
    # power=1.0 requests a magnitude (not power) spectrogram; center=False
    # disables librosa's padding of the signal around each frame.
    mel_spectrogram = librosa.feature.melspectrogram(
        y=audio,
        sr=fs,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=scipy.signal.windows.hamming(win_length),
        center=False,
        power=1.0,
        n_mels=nb_filters,
        fmin=fmin,
        fmax=fmax,
        htk=True,
        norm=None,
    ).T
    logger.debug("mel spectrogram shape: %s", mel_spectrogram.shape)

    # Power-law amplitude compression with exponent 0.6.
    return np.power(mel_spectrogram, 0.6)