forked from mageirakos/elasticsearch-movies
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathqueryIndexQ4.py
132 lines (104 loc) · 7.12 KB
/
queryIndexQ4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
'''
Requirements :
You need to have run q4PreProcessing_notebook.ipynb before running this code.
(in anaconda prompt)
Jupyter notebook -> conda install -c conda-forge notebook
In this iteration of the code we take into account:
1) user_Id
2) ranking results now take into account the user's rating of a movie (if available)
3) we cluster users together based on how and which movies they rate. Then we avg out the rating per cluster, thus avoiding some NaNs in user_rating
4) We train a Ranfom Forest Classifier on each user trained on tf-idf data from the title in movies they have rated and the on hot encoding of that movie's genre.
Then we used said classifier to predict the rating on the rest of the movies. ( This gives a very bad result and should not be followed, it is only done for demonstrative
purposes, to get some exercise on the pipeline of ML models in an Information Retrival setting)
'''
from elasticsearch import Elasticsearch
import pandas as pd
import numpy as np
import pickle
def queryInput(user_input):
''' returns the movie_Id, title, BM25 score '''
# ps = per_search since BM25 _score changes depending on user_input
result_params_list_df_ps = pd.DataFrame(columns=['movieId', 'title', 'genres', 'BM25_score'])
res = es.search(index="movies", body={"query": {"match": {"title":"{}".format(user_input)}}}, size = 1000)
hits = res['hits']['total']['value']
print("Got {} Hits:".format(hits))
# I set the maximum number of results to 1000. The default was 10. So we need to take that into account
try :
for i in range(hits):
temp_df = pd.DataFrame([ [ int(res['hits']['hits'][i]['_source']['movieId']), res['hits']['hits'][i]['_source']['title'], res['hits']['hits'][i]['_source']['genres'], res['hits']['hits'][i]['_score'] ] ], columns=['movieId', 'title', 'genres', 'BM25_score'])
result_params_list_df_ps = result_params_list_df_ps.append(temp_df, ignore_index = True)
except:
for i in range(1000):
temp_df = pd.DataFrame([ [ int(res['hits']['hits'][i]['_source']['movieId']), res['hits']['hits'][i]['_source']['title'], res['hits']['hits'][i]['_source']['genres'], res['hits']['hits'][i]['_score'] ] ], columns=['movieId', 'title', 'genres', 'BM25_score'])
result_params_list_df_ps = result_params_list_df_ps.append(temp_df, ignore_index = True)
return result_params_list_df_ps
def preProcessing():
'''Returns the AVG rating of each movie (movie_avg_rating_df)
and user rating per movie (movie_rating_df_pu)
Also, loads the data from queryIndexQ3 where we clustered the users to fill NaN'''
rating_df = pd.read_csv('./data/ratings.csv')
#dataframe containing user ratings after filling most NaN base on user clustering (look at queryIndexQ3.py)
user_ratings_after_clustering = pickle.load( open("./data/user_ratings_after_clustering.p", "rb"))
#pu = per user
movie_rating_df_pu = rating_df[['userId','movieId','rating']]
movie_avg_rating_df = movie_rating_df_pu.groupby(by='movieId').mean()
movie_avg_rating_df = movie_avg_rating_df.drop('userId', axis=1).reset_index()
return movie_avg_rating_df, user_ratings_after_clustering
def startLoop(movie_avg_rating_df, user_ratings_after_clustering):
''' Basically the main function of the program.
Takes any pre-processed data as input '''
print('type "//exit" if you want to exit the search')
user_input_movie = input("Which movie do you want? (by title): \n")
while( user_input_movie != '//exit' ) :
user_input_user = input("For which user do you want to search? (int): \n")
while ( ( user_input_user.isdigit() == False) ):
user_input_user = input("For which user do you want to search? (int): \n")
#get ranking from Elasticsearch
query_result_params_df = queryInput(user_input_movie)
#compute final ranking based on additional data ( movie_avg_rating_df, movie_rating_df_pu )
final_df = finalRanking(query_result_params_df, movie_avg_rating_df, user_ratings_after_clustering, user_input_user)
#print results
print(final_df)
print('type "//exit" if you want to exit the search')
user_input_movie = input("Which movie do you want? (by title): \n")
return
def finalRanking(query_result_params_df, movie_avg_rating_df, movie_rating_df_pu, user_input_user):
''' computes the final_df which holds the final ranking '''
final_df = query_result_params_df.copy(deep=True)
#add avg_rating
final_df = final_df.merge(movie_avg_rating_df, on = 'movieId', how = 'left')
final_df.rename(columns = {'rating':'avg_rating'}, inplace=True)
#create a temp DataFrame so no changes are made to the original 'movie_rating_df_pu'
temp = movie_rating_df_pu.copy(deep=True)
#drop all rows from the user ratings where the user is not the one we want
temp.drop(temp[temp['userId'] != int(user_input_user)].index, inplace=True)
#add user_rating ( after clustering this time )
final_df = final_df.merge(temp, on = 'movieId', how = 'left')
final_df.rename(columns = {'rating':'user_rating_after_clustering'}, inplace=True)
#now add the data from the jupyter notebook where we fille user NaN ratings based on tf-idf vactors and a RandomForestClassifier
user_predicted_data = pickle.load( open("./data/predicted_result/user_{}_predicted_data.p".format(user_input_user), "rb"))
temp = user_predicted_data.copy(deep=True)
final_df = final_df.merge(temp, on = 'movieId', how = 'left')
final_df.rename(columns = {'rating':'user_rating_after_RFC'}, inplace=True)
#for the sake of simplicity our similarity algorithm will be a linear combinationi of the above 4 scores
final_df['final_score'] = np.nan
# Note that user_rating_after_RFC is in all passes because it has no Null Predictions. A prediction was made for all users for every movie
#first pass
final_df['final_score'] = final_df['BM25_score'] + final_df['avg_rating'] + final_df['user_rating_after_clustering'] + final_df['user_rating_after_RFC']
#second pass for the cases where the user has not added a rating for the movie ( as far as the clustering is concerned )
final_df['final_score'].fillna(final_df['BM25_score'] + final_df['avg_rating'] + final_df['user_rating_after_RFC'], inplace=True)
#third pass where there is no avg_rating for a movie so we only keep the BM25 score
final_df['final_score'].fillna(final_df['BM25_score'] + final_df['user_rating_after_RFC'], inplace=True)
#Sort the dataframe based on the final_score
final_df = final_df.sort_values(by = 'final_score', ascending=False).reset_index()
final_df.drop(['index','genres','userId'], axis=1, inplace=True)
final_df.drop_duplicates(inplace=True)
return final_df
#######################################################
if __name__ == "__main__":
# start Elasticsearch server
es = Elasticsearch()
# compute avg rating for all movies as a pre processing step
movie_avg_rating_df, user_ratings_after_clustering = preProcessing()
# get to main loop
startLoop(movie_avg_rating_df, user_ratings_after_clustering)