-
Notifications
You must be signed in to change notification settings - Fork 2
/
svd_new.py
155 lines (147 loc) · 5.45 KB
/
svd_new.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import pandas as pd
import numpy as np
from numpy import linalg as LA
import time
import math
# Dimensions of the user x movie rating matrix.  Each is one larger than the
# count it is built from; the loops in this file start at index 1, so row 0
# and column 0 are presumably padding for 1-based IDs (confirm against data).
num_of_movies = 3952 + 1
num_of_users = 6040 + 1
# Number of rating rows expected in the ratings file.
num_of_ratings = 1000209
# How many of the highest predictions are inspected for "precision at top k".
precision_k = 5000
def preprocess():
    '''
    Load the ratings file into a dense user x movie matrix.

    Reads 'ml-1m/ratings.dat' ("::"-separated columns user_id, movie_id,
    rating, unix_timestamp) and writes every rating into the cell
    [user_id][movie_id] of a zero-initialised matrix.

    returns : user_movie_matrix, a float array of shape
              (num_of_users, num_of_movies); cells without a rating stay 0.
    raises  : IndexError if an ID in the file does not fit the matrix.
    '''
    # Reading ratings file:
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv('ml-1m/ratings.dat', sep="::", names=r_cols,
                          encoding='latin-1', engine='python')

    # No shuffling is done: each rating lands in its own (user, movie) cell,
    # so the order in which rows are processed cannot change the result.
    user_ids = ratings['user_id'].to_numpy()
    movie_ids = ratings['movie_id'].to_numpy()

    # Making the utility matrix.  Every row of the file is used rather than
    # a fixed row count, so shorter or longer files are handled as well.
    user_movie_matrix = np.zeros((num_of_users, num_of_movies))
    user_movie_matrix[user_ids, movie_ids] = ratings['rating'].to_numpy()
    return user_movie_matrix
def center(user_movie_matrix, holdout_size=1000):
    '''
    Centre every user's ratings around that user's mean and build the
    training matrix with a hold-out block.

    Row 0 and column 0 are left exactly as they are in the input.  For every
    other row the mean of the non-zero entries (columns 1 onwards) is
    computed; rated cells become `rating - mean` and unrated (zero) cells
    are filled with the mean.  A user without any rating gets a mean of 0,
    so that row stays all zero instead of raising ZeroDivisionError.

    parameters : user_movie_matrix - 2-D array of ratings, 0 meaning
                                     "not rated"; it is not modified
                 holdout_size      - side length of the square block
                                     (rows/columns 1..holdout_size) whose
                                     non-zero cells are marked as missing
    returns : matrix_centered_zero, test
              `test` is a copy of `matrix_centered_zero` in which every
              non-zero cell of the hold-out block is replaced by -1.

    NOTE(review): -1 is also a legal centred value (rating one below the
    user's mean), so such cells cannot be told apart from hidden ones later.
    '''
    matrix_centered_zero = np.array(user_movie_matrix, dtype=float)

    # View without the padding row/column; writes go into the copy above.
    body = matrix_centered_zero[1:, 1:]
    rated = body != 0
    rating_counts = rated.sum(axis=1)
    # Zero cells add nothing, so the plain row sum is the sum of the ratings.
    rating_sums = body.sum(axis=1)
    user_means = np.divide(rating_sums, rating_counts,
                           out=np.zeros_like(rating_sums),
                           where=rating_counts > 0)

    # centering data
    means_column = user_means[:, None]
    body[...] = np.where(rated, body - means_column, means_column)

    # making the training data set with the first holdout_size * holdout_size
    # ratings as missing by assigning as -1 (slicing clamps to the matrix size)
    test = matrix_centered_zero.copy()
    stop = 1 + max(int(holdout_size), 0)
    holdout_block = test[1:stop, 1:stop]
    holdout_block[holdout_block != 0] = -1
    return matrix_centered_zero, test
def _center_training_rows(test):
    '''Re-centre `test` in place around each row's mean (row/column 0 untouched).'''
    body = test[1:, 1:]
    hidden = body == -1
    positive = body > 0
    # Hidden cells count towards the mean with value 0, positive cells with
    # their value; every other cell is ignored.
    row_sums = np.where(positive, body, 0.0).sum(axis=1)
    row_counts = (hidden | positive).sum(axis=1)
    # Rows without any counted cell get mean 0 instead of dividing by zero.
    row_means = np.divide(row_sums, row_counts,
                          out=np.zeros_like(row_sums),
                          where=row_counts > 0)
    means_column = row_means[:, None]
    fill_with_mean = hidden | (body == 0)
    body[...] = np.where(fill_with_mean, means_column, body - means_column)


def _reconstruct_from_svd(matrix):
    '''Return U * S * V^T of `matrix`, with V and S taken from the eigen-decomposition of A^T A.'''
    # computing A(transpose) * A
    gram = matrix.T @ matrix
    # A^T A is symmetric, so eigh applies: it returns real eigenvalues and
    # orthonormal eigenvectors, no complex parts have to be discarded.
    eigen_values, right_vectors = LA.eigh(gram)
    # sorting the eigen values in descending order, vectors accordingly
    order = np.argsort(-eigen_values)
    eigen_values = eigen_values[order]
    right_vectors = right_vectors[:, order]
    # Slightly negative eigenvalues are rounding noise; treat them as zero.
    sigma = np.sqrt(np.clip(eigen_values, 0.0, None))
    # Pseudo-inverse of the diagonal sigma matrix: reciprocal of every value
    # above the cutoff, zero otherwise (same 1e-15 relative cutoff as the
    # default of np.linalg.pinv).
    sigma_inv = np.zeros_like(sigma)
    if sigma.size:
        keep = sigma > 1e-15 * sigma.max()
        sigma_inv[keep] = 1.0 / sigma[keep]
    # computing U; scaling the columns equals multiplying by the diagonal
    # matrix without building it.
    left_vectors = (matrix @ right_vectors) * sigma_inv
    # computing the original matrix from the SVD decomposition
    return (left_vectors * sigma) @ right_vectors.T


def _spearman(actual, predicted):
    '''Spearman rank correlation: Pearson correlation of the (tie-averaged) ranks.'''
    if actual.size < 2:
        return float('nan')
    actual_ranks = pd.Series(actual).rank().to_numpy()
    predicted_ranks = pd.Series(predicted).rank().to_numpy()
    # With constant ranks on either side the correlation is undefined.
    if np.ptp(actual_ranks) == 0 or np.ptp(predicted_ranks) == 0:
        return float('nan')
    return float(np.corrcoef(actual_ranks, predicted_ranks)[0, 1])


def main(matrix_centered_zero, test, holdout_size=1000, k=None):
    '''
    Calculating S,V,D and then predicting values and calculating errors

    The training matrix is re-centred row by row, decomposed through the
    eigen-decomposition of A^T A, multiplied back together, and the result
    is compared with `matrix_centered_zero` on the hold-out block.

    parameters : matrix_centered_zero - reference ratings; its non-zero
                                        cells inside the hold-out block are
                                        the ones that get evaluated
                 test                 - training matrix with hidden cells
                                        marked as -1.  A float ndarray is
                                        MODIFIED IN PLACE by the centring.
                 holdout_size         - side length of the evaluated block
                                        (rows/columns 1..holdout_size)
                 k                    - number of top predictions used for
                                        the precision; defaults to the
                                        module constant precision_k
    Finally prints RMSE , top k precision ,Spearman

    NOTE(review): no singular values are dropped, so the product U*S*V^T
    reproduces the centred training matrix up to rounding.
    NOTE(review): the precision check compares predictions with 3, which
    looks like a raw-rating threshold while `test` is re-centred here;
    confirm the intended scale.
    '''
    if k is None:
        k = precision_k
    # asarray keeps a float ndarray as is, so the caller's array is centred
    # in place exactly as before; other inputs are converted to a copy.
    test = np.asarray(test, dtype=float)

    # centering the training data set
    _center_training_rows(test)

    # Time only the decomposition and reconstruction, as the label printed
    # at the end says, not the evaluation and its output.
    start = time.time()
    answer = _reconstruct_from_svd(test)
    svd_seconds = time.time() - start

    # Cells of the hold-out block that carry a reference rating, collected
    # row by row (boolean indexing keeps row-major order).
    stop = 1 + max(int(holdout_size), 0)
    actual_block = np.asarray(matrix_centered_zero)[1:stop, 1:stop]
    rated = actual_block != 0
    actual = actual_block[rated]
    predicted = answer[1:stop, 1:stop][rated]

    for actual_rating, predicted_rating in zip(actual, predicted):
        print("Actual rating")
        print(actual_rating)
        print("Predicted rating")
        print(predicted_rating)
        print("")

    # computing rmse
    evaluated = actual.size
    squares_sum = float(np.sum((predicted - actual) ** 2))
    print("Root mean squared error")
    print(math.sqrt(squares_sum / evaluated) if evaluated else float('nan'))

    print("Spearman's correlation")
    print(_spearman(actual, predicted))

    # calculation of the precision at top k; when fewer than k predictions
    # exist, all of them are used instead of indexing past the end.
    top_predictions = np.sort(predicted)[::-1][:max(int(k), 0)]
    if top_predictions.size:
        precision_at_topk = np.count_nonzero(top_predictions >= 3) / top_predictions.size
    else:
        precision_at_topk = 0.0
    print("Precision at top k")
    print(precision_at_topk)

    print("Time required for SVD ")
    print("--- %s seconds ---" % svd_seconds)
if __name__ == "__main__":
    # Pipeline: load the ratings, derive the centred and training matrices,
    # then decompose and evaluate.
    user_movie_matrix = preprocess()
    centered_pair = center(user_movie_matrix)
    matrix_centered_zero, test = centered_pair
    # NOTE(review): main() is given the raw rating matrix for its
    # `matrix_centered_zero` parameter; the centred matrix returned by
    # center() is not used afterwards. Confirm this is intended.
    main(matrix_centered_zero=user_movie_matrix, test=test)