-
Notifications
You must be signed in to change notification settings - Fork 6.4k
/
Copy pathpreprocess2dict.py
82 lines (68 loc) · 2.21 KB
/
preprocess2dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# https://udemy.com/recommender-systems
# https://deeplearningcourses.com/recommender-systems
from __future__ import print_function, division
from builtins import range, input
# Note: you may need to update your version of future
# sudo pip install -U future
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
# Load the ratings table.
# Dataset: https://www.kaggle.com/grouplens/movielens-20m-dataset
df = pd.read_csv('../large_files/movielens-20m-dataset/very_small_rating.csv')

# Largest id plus one in each column, i.e. the user / movie counts when ids
# are contiguous and start at zero (TODO confirm against the preprocessing step).
N = df['userId'].max() + 1  # number of users
M = df['movie_idx'].max() + 1  # number of movies

# Shuffle the rows, then keep the first 80% for training and the rest for testing.
df = shuffle(df)
cutoff = int(0.8 * len(df))
df_train, df_test = df.iloc[:cutoff], df.iloc[cutoff:]

# Lookup tables filled in from the training rows:
user2movie = {}        # user id -> list of movie ids rated by that user
movie2user = {}        # movie id -> list of user ids that rated that movie
usermovie2rating = {}  # (user id, movie id) -> rating

print("Calling: update_user2movie_and_movie2user")
count = 0  # rows seen so far; drives the progress messages
def update_user2movie_and_movie2user(row):
    """Fold one training rating into the module-level lookup tables.

    Intended to be used as the callback of ``df_train.apply(..., axis=1)``;
    it works entirely through side effects on the globals ``count``,
    ``user2movie``, ``movie2user`` and ``usermovie2rating``.

    Args:
        row: an object exposing ``userId``, ``movie_idx`` and ``rating``
            attributes (one row of the ratings table).

    Returns:
        None.
    """
    global count
    count += 1
    # Progress report: fraction of the training split seen so far.
    if count % 100000 == 0:
        print("processed: %.3f" % (float(count) / cutoff))

    # Cast to int so the dictionary keys are plain integers regardless of
    # the numeric type the row carries them in.
    i = int(row.userId)
    j = int(row.movie_idx)

    # Only register the pair the first time it is seen. The callback can be
    # invoked more than once for the same (user, movie): duplicate rows in
    # the data, or older pandas releases that were documented to call the
    # function twice on the first row. Without this guard such repeats would
    # leave duplicate ids in the lists.
    if (i, j) not in usermovie2rating:
        user2movie.setdefault(i, []).append(j)
        movie2user.setdefault(j, []).append(i)

    # The most recent rating for a pair wins, as before.
    usermovie2rating[(i, j)] = row.rating
# Run the training callback over df_train. apply() is used only for the
# callback's side effects on the lookup dictionaries; its return value is
# discarded.
df_train.apply(update_user2movie_and_movie2user, axis=1)
# test ratings dictionary: (user id, movie id) -> rating for the held-out rows
usermovie2rating_test = {}
print("Calling: update_usermovie2rating_test")
# Reset the shared row counter so the progress messages of the test pass
# start again from zero.
count = 0
def update_usermovie2rating_test(row):
    """Record one held-out rating in ``usermovie2rating_test``.

    Used as the callback of ``df_test.apply(..., axis=1)``; it bumps the
    global ``count`` and stores ``row.rating`` under the key
    ``(int(row.userId), int(row.movie_idx))``.

    Args:
        row: an object exposing ``userId``, ``movie_idx`` and ``rating``
            attributes (one row of the ratings table).

    Returns:
        None.
    """
    global count
    count = count + 1
    # Every 100000 rows, report the fraction of the test split handled so far.
    if not count % 100000:
        print("processed: %.3f" % (float(count) / len(df_test)))

    key = (int(row.userId), int(row.movie_idx))
    usermovie2rating_test[key] = row.rating
# Fill the held-out lookup; apply() is only used to drive the callback over
# the rows of df_test, so its return value is ignored.
df_test.apply(update_usermovie2rating_test, axis=1)

# note: these are not really JSONs -- each file is a binary pickle of the
# corresponding dictionary, written in the same order as before.
for filename, lookup in (
    ('user2movie.json', user2movie),
    ('movie2user.json', movie2user),
    ('usermovie2rating.json', usermovie2rating),
    ('usermovie2rating_test.json', usermovie2rating_test),
):
    with open(filename, 'wb') as f:
        pickle.dump(lookup, f)