-
Notifications
You must be signed in to change notification settings - Fork 0
/
read_movielens_rating.py
129 lines (106 loc) · 4.39 KB
/
read_movielens_rating.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import io
import os
import math
import copy
import pickle
import zipfile
from textwrap import wrap
from pathlib import Path
from itertools import zip_longest
from collections import defaultdict
from urllib.error import URLError
from urllib.request import urlopen
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# =============================================================================
# def try_download(url, download_path):
# archive_name = url.split('/')[-1]
# folder_name, _ = os.path.splitext(archive_name)
#
# try:
# r = urlopen(url)
# except URLError as e:
# print('Cannot download the data. Error: %s' % e)
# return
#
# assert r.status == 200
# data = r.read()
#
# with zipfile.ZipFile(io.BytesIO(data)) as arch:
# arch.extractall(download_path)
#
# print('The archive is extracted into folder: %s' % download_path)
# =============================================================================
def read_data(path):
    """Load the MovieLens ratings, movies and users tables found in ``path``.

    Every ``*.csv`` file in the directory is read with its own header row.
    The header-less ``ratings.dat``, ``users.dat`` and ``movies.dat`` files
    use ``::`` as field separator and are given the column names listed in
    ``dat_columns``.  Any other ``*.dat`` file is ignored.

    Args:
        path: ``pathlib.Path`` of the directory that holds the data files.

    Returns:
        Tuple ``(ratings, movies, users)`` of ``pandas.DataFrame`` objects.

    Raises:
        KeyError: if the directory does not provide all three tables.
    """
    dat_columns = {
        'ratings': ['userId', 'movieId', 'rating', 'timestamp'],
        'users': ['userId', 'gender', 'age', 'occupation', 'zip'],
        'movies': ['movieId', 'title', 'genres'],
    }
    files = {}
    for filename in path.glob('*'):
        if filename.suffix == '.csv':
            files[filename.stem] = pd.read_csv(filename)
        elif filename.suffix == '.dat':
            columns = dat_columns.get(filename.stem)
            if columns is None:
                # Not one of the three known tables; only those are returned.
                continue
            # A multi-character separator needs the python parsing engine.
            try:
                data = pd.read_csv(
                    filename, sep='::', names=columns, engine='python')
            except UnicodeDecodeError:
                # ml-1m's movies.dat holds Latin-1 bytes in some titles, which
                # the default UTF-8 decoding rejects.  Retrying only on
                # failure leaves valid UTF-8 files decoded as before.
                data = pd.read_csv(
                    filename, sep='::', names=columns, engine='python',
                    encoding='latin-1')
            files[filename.stem] = data
    return files['ratings'], files['movies'], files['users']
# The ml-1m archive is expected to be extracted already under
# ~/data/movielens/ml-1m; the download step below stays disabled.
#archive_url = f'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
download_path = Path.home() / 'data' / 'movielens'
#try_download(archive_url, download_path)
ratings, movies, users = read_data(download_path / 'ml-1m')
#ratings=ratings.rename(columns={"userId": "user_id", "movieId": "like_id"})

# Keep only the three columns used downstream and shift every user id down
# by one (presumably to make the 1-based ids 0-based - verify against data).
ratings = (
    ratings.loc[:, ['userId', 'movieId', 'rating']]
    .assign(userId=lambda frame: frame['userId'] - 1)
    .reset_index(drop=True)
)

# Discard all ratings of movies that were rated fewer than 5 times.
counts = ratings['movieId'].value_counts()
rare_movie_ids = counts.loc[counts < 5].index
keep_row = ~ratings['movieId'].isin(rare_movie_ids)
ratings = ratings.loc[keep_row].reset_index(drop=True)
#%% data statistics
# Write the summary into the report file directly.  Rebinding sys.stdout to
# an open file would leave that file unclosed and swallow every later print
# (including the console of an interactive session).
os.makedirs("data", exist_ok=True)  # later output files also live in data/
with open("data/data_statistics.txt", "w") as stats_file:
    print("Data Statistics for Movielens data set", file=stats_file)
    # The ': .2f' spec is kept so the report text stays exactly as before
    # (counts are rendered with a leading space and two decimals).
    print(f"Number of movies: {len(np.unique(ratings['movieId'])): .2f}",
          file=stats_file)
    print(f"Number of users: {len(np.unique(ratings['userId'])): .2f}",
          file=stats_file)
    print(f"Number of user-item interactions: {len(ratings): .2f}",
          file=stats_file)
#%% user-pages: train-dev-test split
# Replace the raw movie ids by the label-encoder codes; this is done on the
# full table so train, validation and test share one id space.
le = LabelEncoder()
le.fit(ratings['movieId'].unique())
ratings['movieId'] = le.transform(ratings['movieId'].to_numpy())

# Fixed seed so the random split is reproducible.  The two draws must stay
# in this order: first over all ratings, then over the remaining train rows.
np.random.seed(7)
in_first_train = np.random.rand(len(ratings)) < 0.99
test_ratings = ratings.loc[~in_first_train].copy().reset_index(drop=True)
train_pool = ratings.loc[in_first_train].copy().reset_index(drop=True)

# Second draw: roughly 1% of the remaining train rows become validation.
in_final_train = np.random.rand(len(train_pool)) < 0.99
val_ratings = train_pool.loc[~in_final_train].copy().reset_index(drop=True)
train_ratings = train_pool.loc[in_final_train].copy().reset_index(drop=True)
# =============================================================================
# col_values=list(test_ratings['movieId'].values)
# col_valuesUser=le.transform(col_values)
# test_ratings['movieId'] = col_valuesUser
#
# col_values=list(val_ratings['movieId'].values)
# col_valuesUser=le.transform(col_values)
# val_ratings['movieId'] = col_valuesUser
# =============================================================================
# Persist the three splits as CSV files without the index column.
for split_frame, csv_target in (
    (train_ratings, 'data/train_ratings.csv'),
    (val_ratings, 'data/val_ratings.csv'),
    (test_ratings, 'data/test_ratings.csv'),
):
    split_frame.to_csv(csv_target, index=False)
#%% user-item interactions formation
# Line i of data/interactions.txt lists the (encoded) movie ids rated by
# user id i, each followed by a single space.
numUsers = len(np.unique(ratings['userId']))

# Group the ratings once instead of rescanning the whole table per user;
# groupby keeps the original row order inside each group.
movies_by_user = ratings.groupby('userId')['movieId'].apply(list).to_dict()

# NOTE(review): ids are looked up as 0..numUsers-1, so if the user ids are
# not contiguous any id >= numUsers is left out - same as before; confirm
# this is what the consumer of the file expects.
interactions = [movies_by_user.get(i, []) for i in range(numUsers)]

# The context manager closes the file even if a write fails.
with open('data/interactions.txt', 'w') as f:
    for user_movies in interactions:
        f.write(''.join('%d ' % movie for movie in user_movies) + '\n')