# -*- coding: utf-8 -*-
"""SmokersCollabFilter.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1p5rlr1LEb8oRIqGtHSFDf_kbYVekGMgV
# Item-Item Collaborative Filtering Recommender System for Smokers
## Libraries
"""
# Import Libraries
import numpy as np
import pandas as pd
from scipy import sparse
from numpy import count_nonzero
from sklearn.metrics.pairwise import cosine_similarity
"""## Default N variable"""
'''
Item-Item Collaborative Filtering (|N|=2)
|N| = 2 represents the size of the set N.
'''
# The set N refers to the neighborhood or set of nearest
# neighbors of an item in an item-item collaborative filtering
# approach. In other words, for a particular item, the collaborative
# filtering algorithm will select the two most similar items to
# form the neighborhood. These two items will then be used to
# make recommendations or provide insights.
N = 2
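# For example (matching the neighborhood printed further below): if item 0's
# two most similar items are items 5 and 2, then N(0) = {5, 2} and only those
# two items' ratings are used when predicting ratings for item 0.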
# # Initialize toy matrix with ratings for testing purposes
# A = np.array(
# [[ 1, 0, 3, 0, 0, 5, 0, 0, 5, 0, 4, 0],
# [ 0, 0, 5, 4, 0, 0, 4, 0, 0, 2, 1, 3],
# [ 2, 4, 0, 1, 2, 0, 3, 0, 4, 3, 5, 0],
# [ 0, 2, 4, 0, 5, 0, 0, 4, 0, 0, 2, 0],
# [ 0, 0, 4, 3, 4, 2, 0, 0, 0, 0, 2, 5],
# [ 1, 0, 3, 0, 3, 0, 0, 2, 0, 0, 4, 0]])
# rows = len(A)
# cols = len(A[1])
"""## Import Dataset & Conversion to the suitable format for processing
### Import Dataset
"""
dataset_filename = 'Smokers_Answers _For_Python.xlsx'
brands_filename = 'tobacco_names.txt'
# Read the Excel file
data = pd.read_excel(dataset_filename)
# Define the range of columns from 'B' to 'K'
start_col = 'B'
end_col = 'K'
# Get the column indices based on the column labels
start_col_idx = data.columns.get_loc(start_col)
end_col_idx = data.columns.get_loc(end_col)
# Extract the relevant columns from the DataFrame
user_ratings = data.iloc[:, start_col_idx:end_col_idx + 1]
# Convert the DataFrame to a 2D array
user_ratings_array = user_ratings.values
"""### Create Dataframe and Import Tobacco & User Labels"""
# Read the brand names from the text file
with open(brands_filename, 'r') as file:
    tobacco_brands = [line.strip() for line in file]
# Generate user labels based on the number of respondents in the dataset
users_num = len(user_ratings_array)
users = [f'User{i}' for i in range(users_num)]
# Create an empty DataFrame with the specified row and column labels
df = pd.DataFrame(index = tobacco_brands, columns = users)
"""### Transfer Ratings from Dataset to Dataframe"""
# Iterate through the ratings array: even-indexed columns hold tobacco brand
# names and the following odd-indexed columns hold the corresponding rating,
# so match each brand to its DataFrame row label and store the user's rating
for i in range(len(user_ratings_array)):
    for j in range(len(user_ratings_array[0])):
        if j % 2 == 0:  # even-indexed columns contain the tobacco brand names
            brand = user_ratings_array[i][j]
            for k in range(len(df)):
                if brand == df.index[k]:
                    df.iat[k, i] = user_ratings_array[i][j + 1]
# Drop rows that are entirely NA (brands nobody rated), then replace the remaining NAs with zeros
df.dropna(axis = 0, how = 'all', inplace = True)
df = df.fillna(0)
"""### Convert Completed Dataframe to 2D Matrix for Easier Calculations"""
A = df.to_numpy()
rows, cols = A.shape
# Sparsity Calculation
sparsity = 1.0 - ( count_nonzero(A) / float(A.size) )
print("The Matrix Sparsity is:", sparsity)
"""#### Calculate Average Ratings for each Item (Tobacco Brand)"""
# Initialize a list with one slot per item so that we can
# calculate the average rating of each item
item_Avg = [0] * A.shape[0]
# Iterate through the ratings matrix and calculate
# the average rating for each item available
for i in range(rows):
    item_Avg[i] = 0
    positive_Counter = 0
    for j in range(cols):
        if A[i][j] != 0:
            item_Avg[i] += A[i][j]
            positive_Counter += 1
    item_Avg[i] = round(item_Avg[i] / positive_Counter, 3)
    # # Print each item's average
    # print(item_Avg[i])
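
# Illustrative alternative (our addition, not part of the original notebook):
# the same per-item averages can be computed with vectorized NumPy calls,
# under the same assumption that a zero entry means "no rating".
A_float = A.astype(float)                          # A may be object-typed after df.to_numpy()
rated_counts = np.count_nonzero(A_float, axis=1)   # number of actual ratings per item
item_avg_vec = A_float.sum(axis=1) / np.maximum(rated_counts, 1)
# item_avg_vec should agree with item_Avg up to the rounding applied above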
"""#### 2D Array with substractions of users ratings and items averages"""
# Initialize an empty two-dimensional array that will
# contain the substractions of users ratings and items averages
minus_Array = [[0 for j in range(cols)] for i in range(rows)]
# Subtract the items average from each users rating
for i in range(len(minus_Array)):
for j in range(len(minus_Array[0])):
if A[i][j] != 0:
minus_Array[i][j] = round(A[i][j] - item_Avg[i], 3)
# # Print the updated array that's ready for cosine similarity
# for row in minus_Array:
# print(row)
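
# Illustrative alternative (our addition): the same mean-centering can be done
# with NumPy broadcasting, again treating zero entries as "no rating"
# (A_float is defined in the sketch above).
centered = np.where(A_float != 0, A_float - np.array(item_Avg)[:, None], 0.0)
# centered should match minus_Array up to the rounding applied above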
# Convert the 2D matrix filled with many zeros into
# a sparse matrix representation in Compressed Sparse
# Row (CSR) format for storage efficiency and easier
# computation on sparse matrices
A_sparse = sparse.csr_matrix(minus_Array)
# Calculate all the pairwise cosine similarities
# between the rows of the 2D matrix
similarities_sparse = cosine_similarity(A_sparse, dense_output = False)
# Round the stored similarity values to 4 decimal places
# (the built-in round() does not work on a sparse matrix)
similarities_sparse.data = np.round(similarities_sparse.data, 4)
# # Prints For Testing Purposes
# # Cosine Similarities Output
# print('pairwise sparse output:\n {}\n'.format(similarities_sparse))
# # Extract the similarity value from similarities_sparse
# similarity_ij = similarities_sparse[0, 1]
# # Print the similarity value
# print(f"The similarity between row {i} and row {j} is: {similarity_ij}")
"""## Calculate Cosine Similarities Between Pairwise Rows and Keep the N-Top Ones"""
max_similarities = {}
# Iterate through the cosine similarities
for i in range(similarities_sparse.shape[0]):
    row_similarities = []
    for j in range(similarities_sparse.shape[1]):
        if i != j:
            similarity_value = similarities_sparse[i, j]
            row_similarities.append((j, similarity_value))
    # Sort the row similarities by similarity value in descending order
    row_similarities.sort(key=lambda x: x[1], reverse=True)
    # Keep only the top N (here N = 2) similarities for the row
    top_2_similarities = row_similarities[:N]
    # Extract the row indices (j) and similarity values for the row
    top_2_indices = [pair[0] for pair in top_2_similarities]
    top_2_values = [pair[1] for pair in top_2_similarities]
    # Store the top two maximum similarities (with row indices) for the current row
    max_similarities[i] = list(zip(top_2_indices, top_2_values))
    # # Print the top two maximum similarities for the row
    # print(f"Top two maximum similarities for row {i}:")
    # for j, similarity in zip(top_2_indices, top_2_values):
    #     print(f"Row {j}: {similarity}")
    # print()
# Print the dictionary of top two maximum similarities for each row
print("Dictionary of top two maximum similarities for each row:")
for row, similarities in max_similarities.items():
    print(f"Row {row}: {similarities}")
"""## Calculate New Ratings (Still-In-Progress)"""
# Let's say we want to predict the rating of item 1 (row index 0) for user 5 (column index 4)
user = 4
item = 0
# Get the list of max similarities for row 0
item_0_similarities = max_similarities[item]
# Unpack the first element (column index, similarity value)
column_index_1, similarity_value_1 = item_0_similarities[0]
# Unpack the second element (column index, similarity value)
column_index_2, similarity_value_2 = item_0_similarities[1]
print(column_index_1)      # Output: 5
print(similarity_value_1)  # Output: 0.587
print(column_index_2)      # Output: 2
print(similarity_value_2)  # Output: 0.414
# Prediction: similarity-weighted average of the user's ratings
# for the two most similar items
prediction = round(
    (similarity_value_1 * A[column_index_1][user]
     + similarity_value_2 * A[column_index_2][user])
    / (similarity_value_1 + similarity_value_2),
    3)
print(prediction)
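
# A small helper (our addition, illustrative only) that generalizes the
# weighted-average prediction above to any (item, user) pair and skips
# neighbors the user has not rated; the names here are ours, not the notebook's.
def predict_rating(item_row, user_col):
    """Predict A[item_row][user_col] from the item's stored top-N neighbors."""
    num, den = 0.0, 0.0
    for j, sim in max_similarities[item_row]:
        if A[j][user_col] != 0:   # only use neighbors the user actually rated
            num += sim * A[j][user_col]
            den += sim
    # Fall back to the item's own average if no rated neighbor is available
    return round(num / den, 3) if den != 0 else item_Avg[item_row]

# # Example: same target as above, but ignoring unrated neighbors
# print(predict_rating(item, user))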