Forked from ruohoruotsi/LSTM-Music-Genre-Classification
File: lstm_genre_classifier_keras.py (127 lines, 108 loc, 4.91 KB)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Keras implementation of a simple 2-layer-deep LSTM for genre classification of musical audio.
Feeding the LSTM stack are spectral {centroid, contrast}, chromagram and MFCC features
Model Summary:
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm_1 (LSTM) (None, 128, 128) 82944
_________________________________________________________________
lstm_2 (LSTM) (None, 32) 20608
_________________________________________________________________
dense_1 (Dense) (None, 8) 264
=================================================================
Total params: 103,816
Trainable params: 103,816
Non-trainable params: 0
X shape (total # of training examples, sequence_length, input_dim)
Y shape (total # of training examples, # output classes)
________________________________
Training X shape: (420, 128, 33)
Training Y shape: (420, 8)
________________________________
Dev X shape: (120, 128, 33)
Dev Y shape: (120, 8)
________________________________
Test X shape: (60, 128, 33)
Test Y shape: (60, 8)
420 is the total number of 30 second training files, across all genres. From each file, we
extract a 128-length sequence of inputs (it could be longer). Each input is 33 element column
vector comprising mfcc, centroid, contrast & chroma features.
An epoch, containing all training data (420 sequences), is divided into 12 mini-batches of
length 35, i.e. each mini-batch has 35 sequences. An LSTM RNN (stack) loops over each sequence
for sequence_length (128) steps computing an output value (of dimension 32) which is
transformed (via Dense layer) into 8 output classes
"""
import logging
import os

from keras.models import Sequential
# NOTE: import LSTM from the public `keras.layers` namespace; the old
# `keras.layers.recurrent` module path is private and was removed in
# modern Keras / tf.keras releases, breaking this script on import.
from keras.layers import LSTM, Dense
from keras.optimizers import Adam

from GenreFeatureData import (
    GenreFeatureData,
)  # local python class with Audio feature extraction (librosa)

# Silence TensorFlow's verbose INFO/WARNING chatter; only errors are logged.
logging.getLogger("tensorflow").setLevel(logging.ERROR)
genre_features = GenreFeatureData()

# Reuse the cached .npy feature files only when the complete set is present;
# a partial cache is regenerated wholesale so train/dev/test stay consistent.
_cached_feature_files = [
    genre_features.train_X_preprocessed_data,
    genre_features.train_Y_preprocessed_data,
    genre_features.dev_X_preprocessed_data,
    genre_features.dev_Y_preprocessed_data,
    genre_features.test_X_preprocessed_data,
    genre_features.test_Y_preprocessed_data,
]
if all(os.path.isfile(path) for path in _cached_feature_files):
    print("Preprocessed files exist, deserializing npy files")
    genre_features.load_deserialize_data()
else:
    print("Preprocessing raw audio files")
    genre_features.load_preprocess_data()

# Report the split shapes: X is (examples, sequence_length, input_dim),
# Y is (examples, num_classes).
for _label, _array in (
    ("Training X", genre_features.train_X),
    ("Training Y", genre_features.train_Y),
    ("Dev X", genre_features.dev_X),
    ("Dev Y", genre_features.dev_Y),
    ("Test X", genre_features.test_X),
    ("Test Y", genre_features.test_Y),
):
    print(_label + " shape: " + str(_array.shape))

# (sequence_length, input_dim) — the per-example shape fed to the first LSTM.
input_shape = (genre_features.train_X.shape[1], genre_features.train_X.shape[2])
print("Build LSTM RNN model ...")

# Two stacked LSTMs: the first returns its full output sequence so the
# second can consume it step by step; the second emits only its final
# 32-dim state, which a softmax Dense layer maps onto the genre classes.
model = Sequential()
model.add(
    LSTM(
        units=128,
        dropout=0.05,
        recurrent_dropout=0.35,
        return_sequences=True,
        input_shape=input_shape,
    )
)
model.add(
    LSTM(units=32, dropout=0.05, recurrent_dropout=0.35, return_sequences=False)
)
model.add(Dense(units=genre_features.train_Y.shape[1], activation="softmax"))

print("Compiling ...")
# Keras optimizer defaults:
#   Adam   : lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, decay=0.
#   RMSprop: lr=0.001, rho=0.9, epsilon=1e-8, decay=0.
#   SGD    : lr=0.01, momentum=0., decay=0.
opt = Adam()
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
model.summary()
print("Training ...")
batch_size = 35  # 420 training sequences -> 12 mini-batches per epoch
num_epochs = 400
model.fit(
    genre_features.train_X,
    genre_features.train_Y,
    batch_size=batch_size,
    epochs=num_epochs,
)

# Score the held-out dev split after training.
print("\nValidating ...")
score, accuracy = model.evaluate(
    genre_features.dev_X, genre_features.dev_Y, batch_size=batch_size, verbose=1
)
print("Dev loss: ", score)
print("Dev accuracy: ", accuracy)

# Final generalization estimate on the untouched test split.
print("\nTesting ...")
score, accuracy = model.evaluate(
    genre_features.test_X, genre_features.test_Y, batch_size=batch_size, verbose=1
)
print("Test loss: ", score)
print("Test accuracy: ", accuracy)

# Persist architecture + weights as an HDF5 file
# (creates "lstm_genre_classifier_lstm.h5" in the working directory).
model_filename = "lstm_genre_classifier_lstm.h5"
print("\nSaving model: " + model_filename)
model.save(model_filename)