-
Notifications
You must be signed in to change notification settings - Fork 1
/
mnist.py
215 lines (176 loc) · 7.88 KB
/
mnist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
from struct import unpack
from collections import Counter
from scipy.stats import multivariate_normal
import numpy as np
import matplotlib.pylab as plt
def loadmnist(imagefile, labelfile):
    """Load an MNIST image/label file pair in IDX format.

    Params:
        imagefile: path to an idx3-ubyte image file.
        labelfile: path to an idx1-ubyte label file.
    Returns:
        (x, y): x is an (N, rows*cols) uint8 array of flattened images,
        y is an (N,) uint8 array of labels.
    """
    # 'with' guarantees both files are closed even if parsing fails
    # (the original left them open on any exception).
    with open(imagefile, 'rb') as images, open(labelfile, 'rb') as labels:
        # Image header: magic, image count, rows, cols (big-endian uint32 each).
        _magic, number_of_images, rows, cols = unpack('>IIII', images.read(16))
        # Label header: magic, label count.
        _magic, N = unpack('>II', labels.read(8))
        # Bulk-read the payload instead of one byte at a time: identical
        # result, orders of magnitude faster. .copy() detaches the arrays
        # from the read-only buffer.
        x = np.frombuffer(images.read(N * rows * cols),
                          dtype=np.uint8).reshape(N, rows * cols).copy()
        y = np.frombuffer(labels.read(N), dtype=np.uint8).copy()
    return (x, y)
def displaychar(image):
    """Render a flattened 784-pixel MNIST image as a 28x28 grayscale plot."""
    pixels = np.reshape(image, (28, 28))
    plt.imshow(pixels, cmap=plt.cm.gray)
    plt.axis('off')
    plt.show()
def splitTrainingSet(training_data, training_labels, split_size):
    """Split labelled data into a training set and a validation set.

    The first split_size samples (and labels) become the training set;
    everything after becomes the validation set.

    Params:
        training_data: indexable sequence/array of samples.
        training_labels: indexable sequence/array of matching labels.
        split_size: number of leading samples to keep for training.
    Returns:
        4-tuple (training_set, training_set_labels,
                 validation_set, validation_set_labels).
    """
    # Removed the unused 'validation_set_len' local from the original.
    training_set = training_data[:split_size]
    training_set_labels = training_labels[:split_size]
    validation_set = training_data[split_size:]
    validation_set_labels = training_labels[split_size:]
    return (training_set, training_set_labels, validation_set, validation_set_labels)
def classifyTrainingData(training_set, training_labels):
    """Group training samples by their class label.

    Params:
        training_set: sequence of samples.
        training_labels: sequence of labels, parallel to training_set.
    Returns:
        A dictionary where the key is the class [0-9] and the value
        is the list of training data associated with that label.
    """
    classified_training_data = {}
    # setdefault replaces the original's manual membership check, and
    # zip pairs each sample with its label without index bookkeeping.
    for data, label in zip(training_set, training_labels):
        classified_training_data.setdefault(label, []).append(data)
    return classified_training_data
def _findClassProbabilities(training_labels):
"""
Given the training labels, find the distribution of each class [0-9]
occurring within the list.
Returns:
label_count_dict: dictionary with class [0-9] as key, and
the value as the fraction of class probabilities pi0-pi9.
"""
label_count_dict = Counter(training_labels)
total_label_size = len(training_labels)
for label, count in label_count_dict.iteritems():
label_count_dict[label] = count / float(total_label_size)
return label_count_dict
def findGaussianParams(training_set, training_labels):
    """Estimate per-class Gaussian parameters from labelled data.

    Given the training data and its labels, find the parameters for the
    Gaussian distribution, namely class prior, mean and covariance for
    each class [0-9].

    Returns:
        3-tuple of dictionaries (class priors, means, covariances),
        each keyed by class label.
    """
    classified_data = classifyTrainingData(training_set, training_labels)
    class_prob_distribution = _findClassProbabilities(training_labels)
    mean_j, cov_j = {}, {}
    for curr_class, list_of_data in classified_data.items():
        # BUG FIX: the original assigned into undefined names 'mean' and
        # 'cov', raising NameError at runtime; the results must go into
        # the mean_j / cov_j dicts that are returned below.
        mean_j[curr_class] = np.mean(list_of_data, axis=0)
        cov_j[curr_class] = np.cov(list_of_data, rowvar=False)
    return (class_prob_distribution, mean_j, cov_j)
def fitGaussian(data, class_prob_dict, mean_dict, cov_dict, c_val):
    """Score a sample under each class-conditional Gaussian.

    Given a feature vector, evaluate log(pi_j) + log N(data; mean_j, cov_j)
    for every class j and return the list of log-scores.

    Params:
        data: feature vector to score.
        class_prob_dict: class priors pi_j, keyed 0..k-1.
        mean_dict: per-class mean vectors, keyed 0..k-1.
        cov_dict: per-class covariance matrices, keyed 0..k-1.
        c_val: smoothing constant; each covariance is regularized as
               cov_j + c_val * I before evaluation.
    Returns:
        gaussian_prob_list: list of log-probability scores, one per class,
        in class order 0..k-1.
    """
    gaussian_prob_list = []
    for j in range(len(class_prob_dict)):
        cov_j = cov_dict[j]
        # Size the identity to the actual dimensionality instead of the
        # original hard-coded 784, so this works for any feature size
        # (identical behavior for 784-dim MNIST covariances).
        smoothed_cov = cov_j + (c_val * np.identity(cov_j.shape[0]))
        # Work in log space: raw densities underflow in high dimensions.
        logP_j = multivariate_normal.logpdf(data, mean_dict[j], smoothed_cov)
        gaussian_prob_list.append(np.log(class_prob_dict[j]) + logP_j)
    return gaussian_prob_list
def getTestError(validation_set, validation_labels, c_val,
                 class_prob_dict, mean_dict, cov_dict):
    """Evaluate the Gaussian classifier on a labelled set.

    Each sample is assigned the class with the highest score from
    fitGaussian; a mismatch against the true label counts as an error.

    Returns:
        2-tuple (error_rate, prediction_errors):
        error_rate: float, fraction of misclassified samples.
        prediction_errors: list of (predicted_label, data) pairs, one
                           per misclassified sample.
    """
    mistakes = []
    for sample, true_label in zip(validation_set, validation_labels):
        scores = fitGaussian(sample, class_prob_dict, mean_dict, cov_dict, c_val)
        # argmax over the per-class log-scores
        guess = scores.index(max(scores))
        if guess != true_label:
            mistakes.append((guess, sample))
    error_rate = len(mistakes) / float(len(validation_set))
    return (error_rate, mistakes)
def main():
    """Train a class-conditional Gaussian model on MNIST and print test error."""
    # Load the 60k training images and hold out the last 10k for validation.
    x, y = loadmnist('train-images.idx3-ubyte', 'train-labels.idx1-ubyte')
    train_x, train_y, val_x, val_y = splitTrainingSet(x, y, 50000)
    # Per-class priors, means and covariances for classes 0-9.
    class_prob, mean_dict, cov_dict = findGaussianParams(train_x, train_y)
    # Sweep smoothing constants and keep the one minimizing validation error.
    candidate_c = [0.1, 10, 50, 100, 1000, 5000, 10000]
    error_rates = []
    for cval in candidate_c:
        rate, _ = getTestError(val_x, val_y, cval,
                               class_prob, mean_dict, cov_dict)
        error_rates.append(rate)
    best_c_value = candidate_c[error_rates.index(min(error_rates))]
    # Final evaluation on the held-out 10k test set.
    test_x, test_label = loadmnist('t10k-images.idx3-ubyte', 't10k-labels.idx1-ubyte')
    test_error_rate, _ = getTestError(test_x, test_label, best_c_value,
                                      class_prob, mean_dict, cov_dict)
    print(test_error_rate)
if __name__ == '__main__':
    main()