forked from federicoparroni/recsys2019
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_data.py
76 lines (62 loc) · 2.63 KB
/
clean_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import pandas as pd
import data
import numpy as np
from tqdm import tqdm
import utils.check_folder as cf
import matplotlib.pyplot as plt
import math
from sklearn.preprocessing import MultiLabelBinarizer
import scipy.sparse as sps
import utils.check_folder as cf
def merge_consecutive_equal_actions():
tqdm.pandas()
test = data.test_df('full')
test_grouped_by_session_id = test.groupby('session_id')
merged = test_grouped_by_session_id.progress_apply(_merge_consecutive_equal_actions)
cf.check_folder('dataset/cleaned_csv')
merged.to_csv('dataset/cleaned_csv/test.csv')
def _merge_consecutive_equal_actions(df):
df_cleaned = pd.DataFrame(columns=['user_id', 'session_id', 'timestamp', 'step', 'action_type',
'occurences', 'reference', 'platform', 'city', 'device', 'current_filters',
'impressions', 'prices'])
for i in range(df.shape[0]):
row = df.iloc[i]
if i == 0:
row['occurences'] = 1
df_cleaned = df_cleaned.append(row, ignore_index=True)
else:
a_r = np.array(row[['action_type', 'reference']])
last = np.array(df_cleaned.tail(1)[['action_type', 'reference']])
if (a_r == last).all():
df_cleaned.at[df_cleaned.tail(1).index[0], 'occurences'] += 1
df_cleaned.tail(1)[['timestamp', 'step']] = row[['timestamp', 'step']].values
else:
row['occurences'] = 1
df_cleaned = df_cleaned.append(row)
return df_cleaned
attribute_dict = {}
def list_item_metadata():
tqdm.pandas()
data.accomodations_df()['properties'].progress_apply(_list_item)
attribute_frequency_tuples = sorted(attribute_dict.items(), key=lambda x: x[1])
attribute_df = pd.DataFrame(attribute_frequency_tuples, columns=['attribute', 'frequency'])
#plot the attribute - frequency graphic
plot_df(attribute_df, 3, 'bar', 'attribute')
attribute_df.to_csv('attributes_apartament', index=False)
def _list_item(r):
global attribute_dict
if isinstance(r, str):
temp = r.split('|')
for attribute in temp:
if attribute in attribute_dict.keys():
attribute_dict[attribute] += 1
else:
attribute_dict[attribute] = 1
def plot_df(df, blocks_number, kind, x):
block_dimension = math.ceil(df.shape[0]/blocks_number)
for i in range(0, df.shape[0], block_dimension):
if i+block_dimension < df.shape[0]:
df[i:i+block_dimension].plot(kind=kind, x=x)
else:
df[i:df.shape[0]].plot(kind=kind, x=x)
plt.show()