rbm_tf_k.py
# https://udemy.com/recommender-systems
# https://deeplearningcourses.com/recommender-systems
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import pandas as pd
from scipy.sparse import lil_matrix, csr_matrix, save_npz, load_npz
from datetime import datetime

if tf.__version__.startswith('2'):
  tf.compat.v1.disable_eager_execution()
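
# (the rest of this file uses the TF1-style graph/Session API via tf.compat.v1,
# which is why eager execution is disabled when running under TensorFlow 2)
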
# is it possible to one-hot encode the data prior to feeding it
# into the neural network, so that we don't have to do it on the fly?
# yes, but:
# 1) scipy sparse doesn't support N-D matrices
# 2) you can use the 'sparse' standalone package, but it takes very long
# and you will run out of RAM

def one_hot_encode(X, K):
  # input is N x D
  # output is N x D x K
  N, D = X.shape
  Y = np.zeros((N, D, K))
  for n, d in zip(*X.nonzero()):
    # 0.5...5 --> 1..10 --> 0..9
    k = int(X[n,d]*2 - 1)
    Y[n,d,k] = 1
  return Y
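
# a minimal sketch of how one_hot_encode behaves on a tiny hypothetical ratings
# matrix (the numbers below are made up purely for illustration):
#   X = np.array([[4.5, 0.0],
#                 [3.0, 2.5]])   # N=2 users, D=2 movies, 0 = not rated
#   Y = one_hot_encode(X, 10)
#   Y[0, 0] is one-hot at index int(4.5*2 - 1) = 8
#   Y[0, 1] is all zeros, because that rating is missing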

def one_hot_mask(X, K):
  # input is N x D
  # output is N x D x K
  N, D = X.shape
  Y = np.zeros((N, D, K))
  # if X[n,d] == 0, there's a missing rating
  # so the mask should be all zeros
  # else, it should be all ones
  for n, d in zip(*X.nonzero()):
    Y[n,d,:] = 1
  return Y

one_to_ten = np.arange(10) + 1 # [1, 2, 3, ..., 10]

def convert_probs_to_ratings(probs):
  # probs is N x D x K
  # output is N x D matrix of predicted ratings
  # N, D, K = probs.shape
  # out = np.zeros((N, D))
  # each predicted rating is a weighted average using the probabilities
  # for n in range(N):
  #   for d in range(D):
  #     out[n,d] = probs[n,d].dot(one_to_ten) / 2
  # return out
  return probs.dot(one_to_ten) / 2
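
# quick sanity check of the formula above (hypothetical numbers): if probs[n,d]
# puts all of its mass on index 7, the prediction is one_to_ten[7] / 2 = 8 / 2 = 4.0
# stars; in general it is the expected rating, sum_k probs[n,d,k] * (k + 1) / 2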

def dot1(V, W):
  # V is N x D x K (batch of visible units)
  # W is D x K x M (weights)
  # returns N x M (hidden layer size)
  return tf.tensordot(V, W, axes=[[1,2], [0,1]])

def dot2(H, W):
  # H is N x M (batch of hiddens)
  # W is D x K x M (weights transposed)
  # returns N x D x K (visible)
  return tf.tensordot(H, W, axes=[[1], [2]])
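
# shape summary for the two tensordot helpers above (N = batch size, D = movies,
# K = rating levels, M = hidden units):
#   dot1: V (N, D, K) . W (D, K, M) -> (N, M)     contracts V axes (1, 2) with W axes (0, 1)
#   dot2: H (N, M)    . W (D, K, M) -> (N, D, K)  contracts H axis 1 with W axis 2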

class RBM(object):
  def __init__(self, D, M, K):
    self.D = D # input feature size
    self.M = M # hidden size
    self.K = K # number of ratings
    self.build(D, M, K)

  def build(self, D, M, K):
    # params
    self.W = tf.Variable(tf.random.normal(shape=(D, K, M)) * np.sqrt(2.0 / M))
    self.c = tf.Variable(np.zeros(M).astype(np.float32))
    self.b = tf.Variable(np.zeros((D, K)).astype(np.float32))

    # data
    self.X_in = tf.compat.v1.placeholder(tf.float32, shape=(None, D, K))
    self.mask = tf.compat.v1.placeholder(tf.float32, shape=(None, D, K))

    # conditional probabilities
    # NOTE: tf.contrib.distributions.Bernoulli API has changed in Tensorflow v1.2
    V = self.X_in
    p_h_given_v = tf.nn.sigmoid(dot1(V, self.W) + self.c)
    self.p_h_given_v = p_h_given_v # save for later

    # draw a sample from p(h | v)
    r = tf.random.uniform(shape=tf.shape(input=p_h_given_v))
    H = tf.cast(r < p_h_given_v, dtype=tf.float32)

    # draw a sample from p(v | h)
    # note: we don't have to actually do the softmax
    logits = dot2(H, self.W) + self.b
    cdist = tf.compat.v1.distributions.Categorical(logits=logits)
    X_sample = cdist.sample() # shape is (N, D)
    X_sample = tf.one_hot(X_sample, depth=K) # turn it into (N, D, K)
    X_sample = X_sample * self.mask # missing ratings shouldn't contribute to objective

    # build the objective
    objective = tf.reduce_mean(input_tensor=self.free_energy(self.X_in)) - tf.reduce_mean(input_tensor=self.free_energy(X_sample))
    self.train_op = tf.compat.v1.train.AdamOptimizer(1e-2).minimize(objective)
    # self.train_op = tf.train.GradientDescentOptimizer(1e-3).minimize(objective)
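    # X_sample above is the reconstruction after a single Gibbs step (v -> h -> v'),
    # so minimizing mean(free_energy(data)) - mean(free_energy(sample)) is
    # essentially a CD-1 (contrastive divergence) style update for this categorical RBM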

    # build the cost
    # we won't use this to optimize the model parameters
    # just to observe what happens during training
    logits = self.forward_logits(self.X_in)
    self.cost = tf.reduce_mean(
      input_tensor=tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.stop_gradient(self.X_in),
        logits=logits,
      )
    )

    # to get the output
    self.output_visible = self.forward_output(self.X_in)

    initop = tf.compat.v1.global_variables_initializer()
    self.session = tf.compat.v1.Session()
    self.session.run(initop)

  def fit(self, X, mask, X_test, mask_test, epochs=10, batch_sz=256, show_fig=True):
    N, D = X.shape
    n_batches = N // batch_sz

    costs = []
    test_costs = []
    for i in range(epochs):
      t0 = datetime.now()
      print("epoch:", i)
      X, mask, X_test, mask_test = shuffle(X, mask, X_test, mask_test) # everything has to be shuffled accordingly
      for j in range(n_batches):
        x = X[j*batch_sz:(j*batch_sz + batch_sz)].toarray()
        m = mask[j*batch_sz:(j*batch_sz + batch_sz)].toarray()

        # both visible units and mask have to be in one-hot form
        # N x D --> N x D x K
        batch_one_hot = one_hot_encode(x, self.K)
        m = one_hot_mask(m, self.K)

        _, c = self.session.run(
          (self.train_op, self.cost),
          feed_dict={self.X_in: batch_one_hot, self.mask: m}
        )

        if j % 100 == 0:
          print("j / n_batches:", j, "/", n_batches, "cost:", c)
      print("duration:", datetime.now() - t0)

      # calculate the true train and test cost
      t0 = datetime.now()
      sse = 0
      test_sse = 0
      n = 0
      test_n = 0
      for j in range(n_batches):
        x = X[j*batch_sz:(j*batch_sz + batch_sz)].toarray()
        m = mask[j*batch_sz:(j*batch_sz + batch_sz)].toarray()

        # only visible input has to be in one-hot form
        xoh = one_hot_encode(x, self.K)
        probs = self.get_visible(xoh)
        xhat = convert_probs_to_ratings(probs)

        sse += (m * (xhat - x)*(xhat - x)).sum()
        n += m.sum()

        # the test PREDICTIONS come from the train data!
        # X_test and mask_test are only used for targets
        xt = X_test[j*batch_sz:(j*batch_sz + batch_sz)].toarray()
        mt = mask_test[j*batch_sz:(j*batch_sz + batch_sz)].toarray()
        test_sse += (mt * (xhat - xt) * (xhat - xt)).sum()
        test_n += mt.sum()
      c = sse/n
      ct = test_sse/test_n
      print("train mse:", c)
      print("test mse:", ct)
      print("calculate cost duration:", datetime.now() - t0)
      costs.append(c)
      test_costs.append(ct)

    if show_fig:
      plt.plot(costs, label='train mse')
      plt.plot(test_costs, label='test mse')
      plt.legend()
      plt.show()

  def free_energy(self, V):
    first_term = -tf.reduce_sum(input_tensor=dot1(V, self.b))
    second_term = -tf.reduce_sum(
      # tf.log(1 + tf.exp(tf.matmul(V, self.W) + self.c)),
      input_tensor=tf.nn.softplus(dot1(V, self.W) + self.c),
      axis=1
    )
    return first_term + second_term
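
  # the free energy computed above, written out (v is a one-hot encoded user vector):
  #   F(v) = - sum_{d,k} b[d,k] * v[d,k] - sum_m softplus(c[m] + sum_{d,k} W[d,k,m] * v[d,k])
  # note: first_term is summed over the whole batch while second_term is per-sample;
  # they are combined by broadcasting before reduce_mean is applied in build()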

  def forward_hidden(self, X):
    return tf.nn.sigmoid(dot1(X, self.W) + self.c)

  def forward_logits(self, X):
    Z = self.forward_hidden(X)
    return dot2(Z, self.W) + self.b

  def forward_output(self, X):
    return tf.nn.softmax(self.forward_logits(X))

  def transform(self, X):
    # accepts and returns a real numpy array
    # unlike forward_hidden and forward_output
    # which deal with tensorflow variables
    return self.session.run(self.p_h_given_v, feed_dict={self.X_in: X})

  def get_visible(self, X):
    return self.session.run(self.output_visible, feed_dict={self.X_in: X})
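
# Atrain.npz and Atest.npz (loaded in main() below) are assumed to be user-by-movie
# scipy sparse rating matrices (0 = missing rating), saved earlier with
# scipy.sparse.save_npz; a hypothetical sketch of how such a file could be built
# (the variable names here are made up for illustration):
#
#   A = lil_matrix((n_users, n_movies))
#   for user_id, movie_id, rating in rating_triples:
#     A[user_id, movie_id] = rating
#   save_npz("Atrain.npz", csr_matrix(A))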

def main():
  A = load_npz("Atrain.npz")
  A_test = load_npz("Atest.npz")

  mask = (A > 0) * 1.0
  mask_test = (A_test > 0) * 1.0

  N, M = A.shape
  rbm = RBM(M, 50, 10)
  rbm.fit(A, mask, A_test, mask_test)


if __name__ == '__main__':
  main()