# preprocess2sparse.py
# https://udemy.com/recommender-systems
# https://deeplearningcourses.com/recommender-systems
from __future__ import print_function, division
from builtins import range, input
# Note: you may need to update your version of future
# sudo pip install -U future
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from scipy.sparse import lil_matrix, csr_matrix, save_npz, load_npz
# Load the ratings table; the userId, movie_idx and rating columns are read below.
df = pd.read_csv('../large_files/movielens-20m-dataset/edited_rating.csv')
# df = pd.read_csv('../large_files/movielens-20m-dataset/small_rating.csv')

# The IDs are used directly as matrix indices, so each dimension is max ID + 1.
N = df["userId"].max() + 1  # number of users
M = df["movie_idx"].max() + 1  # number of movies

# Shuffle the rows, then keep the first 80% for training and the rest for testing.
df = shuffle(df)
cutoff = int(0.8 * len(df))
df_train, df_test = df.iloc[:cutoff], df.iloc[cutoff:]

# Training ratings are written one entry at a time into an (N, M) LIL matrix.
A = lil_matrix((N, M))
print("Calling: update_train")
count = 0  # rows handled so far; incremented inside update_train
def update_train(row):
    """Write one training rating into the module-level matrix ``A``.

    Used as the per-row callback of ``df_train.apply(..., axis=1)``.

    Args:
        row: object exposing ``userId``, ``movie_idx`` and ``rating``
            attributes. The two IDs are converted with ``int`` and used as
            the row and column index of ``A``.

    Side effects:
        Increments the global ``count`` and, on every 100000th call, prints
        ``count`` as a fraction of ``cutoff`` (the number of training rows).
    """
    global count
    count += 1
    if not count % 100000:
        fraction_done = float(count) / cutoff
        print("processed: %.3f" % fraction_done)
    A[int(row.userId), int(row.movie_idx)] = row.rating
# Populate A row by row; update_train stores each rating in A as a side effect.
df_train.apply(update_train, axis=1)

# Convert the incrementally filled LIL matrix to CSR before writing it to disk.
A = A.tocsr()
# mask marks which entries exist (stored rating > 0) and which do not.
mask = A > 0
save_npz("Atrain.npz", A)

# Held-out ratings go into their own sparse matrix with the same (N, M) shape.
A_test = lil_matrix((N, M))
print("Calling: update_test")
count = 0  # restart the progress counter for the test pass
def update_test(row):
    """Write one held-out rating into the module-level matrix ``A_test``.

    Used as the per-row callback of ``df_test.apply(..., axis=1)``.

    Args:
        row: object exposing ``userId``, ``movie_idx`` and ``rating``
            attributes. The two IDs are converted with ``int`` and used as
            the row and column index of ``A_test``.

    Side effects:
        Increments the global ``count`` and, on every 100000th call, prints
        ``count`` as a fraction of ``len(df_test)``.
    """
    global count
    count += 1
    if not count % 100000:
        fraction_done = float(count) / len(df_test)
        print("processed: %.3f" % fraction_done)
    A_test[int(row.userId), int(row.movie_idx)] = row.rating
# Populate A_test row by row; update_test stores each rating as a side effect.
df_test.apply(update_test, axis=1)

# Convert to CSR before saving, mirroring the training matrix.
A_test = A_test.tocsr()
# mask_test marks which held-out entries exist (stored rating > 0).
mask_test = A_test > 0
save_npz("Atest.npz", A_test)