# preprocess_shrink.py
# https://udemy.com/recommender-systems
# https://deeplearningcourses.com/recommender-systems
from __future__ import print_function, division
from builtins import range, input
# Note: you may need to update your version of future
# sudo pip install -U future
import pickle
import numpy as np
import pandas as pd
from collections import Counter
# Load the edited ratings table and cut it down to the most active users and
# the most frequently rated movies.
# https://www.kaggle.com/grouplens/movielens-20m-dataset
df = pd.read_csv('../large_files/movielens-20m-dataset/edited_rating.csv')
print("original dataframe size:", len(df))

# Largest id + 1 in each column (the id-space size if ids start at 0).
N = df.userId.max() + 1 # number of users
M = df.movie_idx.max() + 1 # number of movies

# Number of rating rows per user id and per movie index.
user_ids_count = Counter(df.userId)
movie_ids_count = Counter(df.movie_idx)

# number of users and movies we would like to keep
n = 10000
m = 2000

# Ids of the n users / m movies with the most ratings, most frequent first.
# The loop variables deliberately do not reuse the names `n` / `m`: under
# Python 2 a list-comprehension variable leaks into the enclosing scope and
# would overwrite the keep-count with an id.
user_ids = [user_id for user_id, _ in user_ids_count.most_common(n)]
movie_ids = [movie_id for movie_id, _ in movie_ids_count.most_common(m)]

# Keep only rows whose user AND movie both survived the cut. Take an explicit
# copy so that later column assignments write to an independent frame instead
# of a slice of `df`.
keep_rows = df.userId.isin(user_ids) & df.movie_idx.isin(movie_ids)
df_small = df[keep_rows].copy()
# The kept user ids and movie ids are no longer sequential, so assign each one
# a new id equal to its position (0, 1, 2, ...) in the corresponding list.
new_user_id_map = {old: new for new, old in enumerate(user_ids)}
i = len(user_ids)  # number of new user ids handed out
print("i:", i)

new_movie_id_map = {old: new for new, old in enumerate(movie_ids)}
j = len(movie_ids)  # number of new movie ids handed out
print("j:", j)
print("Setting new ids")
# Translate old ids to new ids one column at a time. Series.map does the
# dictionary lookup in vectorized form, which avoids calling a Python lambda
# once per row as DataFrame.apply(axis=1) would.
# NOTE: map() yields NaN for a value missing from the dict instead of raising
# KeyError; df_small is expected to contain only ids present in the maps.
df_small['userId'] = df_small['userId'].map(new_user_id_map)
df_small['movie_idx'] = df_small['movie_idx'].map(new_movie_id_map)

# Sanity output: the largest remapped ids and the number of remaining rows.
print("max user id:", df_small.userId.max())
print("max movie id:", df_small.movie_idx.max())
print("small dataframe size:", len(df_small))

# Write the shrunken ratings table without the DataFrame index column.
df_small.to_csv('../large_files/movielens-20m-dataset/small_rating.csv', index=False)