# -*- coding: utf-8 -*-
"""SmokersCollabFilter.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1p5rlr1LEb8oRIqGtHSFDf_kbYVekGMgV
# Item-Item Collaborative Filtering Recommender System for Smokers
## Libraries
"""
# Import Libraries
import numpy as np
import pandas as pd
from scipy import sparse
from numpy import count_nonzero
from sklearn.metrics.pairwise import cosine_similarity
"""## Default N variable"""
'''
Item-Item Collaborative Filtering (|N|=2)
|N| = 2 represents the size of the set N.
'''
# The set N refers to the neighborhood or set of nearest
# neighbors of an item in an item-item collaborative filtering
# approach. In other words, for a particular item, the collaborative
# filtering algorithm will select the two most similar items to
# form the neighborhood. These two items will then be used to
# make recommendations or provide insights.
N = 2
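# For example (matching the neighborhood printed further below): if item 0's
# two most similar items are items 5 and 2, then N(0) = {5, 2} and only those
# two items' ratings are used when predicting ratings for item 0.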
# # Initialize toy matrix with ratings for testing purposes
# A = np.array(
# [[ 1, 0, 3, 0, 0, 5, 0, 0, 5, 0, 4, 0],
# [ 0, 0, 5, 4, 0, 0, 4, 0, 0, 2, 1, 3],
# [ 2, 4, 0, 1, 2, 0, 3, 0, 4, 3, 5, 0],
# [ 0, 2, 4, 0, 5, 0, 0, 4, 0, 0, 2, 0],
# [ 0, 0, 4, 3, 4, 2, 0, 0, 0, 0, 2, 5],
# [ 1, 0, 3, 0, 3, 0, 0, 2, 0, 0, 4, 0]])
# rows = len(A)
# cols = len(A[1])
"""## Import Dataset & Conversion to the suitable format for processing
### Import Dataset
"""
dataset_filename = 'Smokers_Answers _For_Python.xlsx'
brands_filename = 'tobacco_names.txt'
# Read the Excel file
data = pd.read_excel(dataset_filename)
# Define the range of columns from 'B' to 'K'
start_col = 'B'
end_col = 'K'
# Get the column indices based on the column labels
start_col_idx = data.columns.get_loc(start_col)
end_col_idx = data.columns.get_loc(end_col)
# Extract the relevant columns from the DataFrame
user_ratings = data.iloc[:, start_col_idx:end_col_idx + 1]
# Convert the DataFrame to a 2D array
user_ratings_array = user_ratings.values
"""### Create Dataframe and Import Tobacco & User Labels"""
# Read the brand names from the text file
with open(brands_filename, 'r') as file:
    tobacco_brands = [line.strip() for line in file]
# Generate user labels based on the number of respondents in the dataset
users_num = len(user_ratings_array)
users = [f'User{i}' for i in range(users_num)]
# Create an empty DataFrame with the specified row and column labels
df = pd.DataFrame(index = tobacco_brands, columns = users)
"""### Transfer Ratings from Dataset to Dataframe"""
# Iterate through the ratings array: even-indexed columns hold tobacco brand
# names and the following odd-indexed columns hold the corresponding rating,
# so match each brand to its DataFrame row label and store the user's rating
for i in range(len(user_ratings_array)):
    for j in range(len(user_ratings_array[0])):
        if j % 2 == 0:  # even-indexed columns contain the tobacco brand names
            brand = user_ratings_array[i][j]
            for k in range(len(df)):
                if brand == df.index[k]:
                    df.iat[k, i] = user_ratings_array[i][j + 1]
# Drop rows that are entirely NA (brands nobody rated), then replace the remaining NAs with zeros
df.dropna(axis = 0, how = 'all', inplace = True)
df = df.fillna(0)
"""### Convert Completed Dataframe to 2D Matrix for Easier Calculations"""
A = df.to_numpy()
rows, cols = A.shape
# Sparsity Calculation
sparsity = 1.0 - ( count_nonzero(A) / float(A.size) )
print("The Matrix Sparsity is:", sparsity)
"""#### Calculate Average Ratings for each Item (Tobacco Brand)"""
# Initialize a list with one slot per item so that we can
# calculate the average rating of each item
item_Avg = [0] * A.shape[0]
# Iterate through the ratings matrix and calculate
# the average rating for each item available
for i in range(rows):
    item_Avg[i] = 0
    positive_Counter = 0
    for j in range(cols):
        if A[i][j] != 0:
            item_Avg[i] += A[i][j]
            positive_Counter += 1
    item_Avg[i] = round(item_Avg[i] / positive_Counter, 3)
    # # Print each item's average
    # print(item_Avg[i])
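
# Illustrative alternative (our addition, not part of the original notebook):
# the same per-item averages can be computed with vectorized NumPy calls,
# under the same assumption that a zero entry means "no rating".
A_float = A.astype(float)                          # A may be object-typed after df.to_numpy()
rated_counts = np.count_nonzero(A_float, axis=1)   # number of actual ratings per item
item_avg_vec = A_float.sum(axis=1) / np.maximum(rated_counts, 1)
# item_avg_vec should agree with item_Avg up to the rounding applied above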
"""#### 2D Array with substractions of users ratings and items averages"""
# Initialize an empty two-dimensional array that will
# contain the substractions of users ratings and items averages
minus_Array = [[0 for j in range(cols)] for i in range(rows)]
# Subtract the items average from each users rating
for i in range(len(minus_Array)):
for j in range(len(minus_Array[0])):
if A[i][j] != 0:
minus_Array[i][j] = round(A[i][j] - item_Avg[i], 3)
# # Print the updated array that's ready for cosine similarity
# for row in minus_Array:
# print(row)
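
# Illustrative alternative (our addition): the same mean-centering can be done
# with NumPy broadcasting, again treating zero entries as "no rating"
# (A_float is defined in the sketch above).
centered = np.where(A_float != 0, A_float - np.array(item_Avg)[:, None], 0.0)
# centered should match minus_Array up to the rounding applied above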
# Convert the 2D matrix filled with many zeros into
# a sparse matrix representation in Compressed Sparse
# Row (CSR) format for storage efficiency and easier
# computation on sparse matrices
A_sparse = sparse.csr_matrix(minus_Array)
# Calculate all the pairwise cosine similarities
# between the rows of the 2D matrix
similarities_sparse = cosine_similarity(A_sparse, dense_output = False)
# Round the stored similarity values to 4 decimal places
# (the built-in round() does not work on a sparse matrix)
similarities_sparse.data = np.round(similarities_sparse.data, 4)
# # Prints For Testing Purposes
# # Cosine Similarities Output
# print('pairwise sparse output:\n {}\n'.format(similarities_sparse))
# # Extract the similarity value from similarities_sparse
# similarity_ij = similarities_sparse[0, 1]
# # Print the similarity value
# print(f"The similarity between row {i} and row {j} is: {similarity_ij}")
"""## Calculate Cosine Similarities Between Pairwise Rows and Keep the N-Top Ones"""
max_similarities = {}
# Iterate through the cosine similarities
for i in range(similarities_sparse.shape[0]):
    row_similarities = []
    for j in range(similarities_sparse.shape[1]):
        if i != j:
            similarity_value = similarities_sparse[i, j]
            row_similarities.append((j, similarity_value))
    # Sort the row similarities by similarity value in descending order
    row_similarities.sort(key=lambda x: x[1], reverse=True)
    # Keep only the top N (here N = 2) similarities for the row
    top_2_similarities = row_similarities[:N]
    # Extract the row indices (j) and similarity values for the row
    top_2_indices = [pair[0] for pair in top_2_similarities]
    top_2_values = [pair[1] for pair in top_2_similarities]
    # Store the top two maximum similarities (with row indices) for the current row
    max_similarities[i] = list(zip(top_2_indices, top_2_values))
    # # Print the top two maximum similarities for the row
    # print(f"Top two maximum similarities for row {i}:")
    # for j, similarity in zip(top_2_indices, top_2_values):
    #     print(f"Row {j}: {similarity}")
    # print()
# Print the dictionary of top two maximum similarities for each row
print("Dictionary of top two maximum similarities for each row:")
for row, similarities in max_similarities.items():
    print(f"Row {row}: {similarities}")
"""## Calculate New Ratings (Still-In-Progress)"""
# Let's say we want to predict the rating of item 1 (row index 0) for user 5 (column index 4)
user = 4
item = 0
# Get the list of max similarities for row 0
item_0_similarities = max_similarities[item]
# Unpack the first element (column index, similarity value)
column_index_1, similarity_value_1 = item_0_similarities[0]
# Unpack the second element (column index, similarity value)
column_index_2, similarity_value_2 = item_0_similarities[1]
print(column_index_1)      # Output: 5
print(similarity_value_1)  # Output: 0.587
print(column_index_2)      # Output: 2
print(similarity_value_2)  # Output: 0.414
# Prediction: similarity-weighted average of the user's ratings
# for the two most similar items
prediction = round(
    (similarity_value_1 * A[column_index_1][user]
     + similarity_value_2 * A[column_index_2][user])
    / (similarity_value_1 + similarity_value_2),
    3)
print(prediction)
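
# A small helper (our addition, illustrative only) that generalizes the
# weighted-average prediction above to any (item, user) pair and skips
# neighbors the user has not rated; the names here are ours, not the notebook's.
def predict_rating(item_row, user_col):
    """Predict A[item_row][user_col] from the item's stored top-N neighbors."""
    num, den = 0.0, 0.0
    for j, sim in max_similarities[item_row]:
        if A[j][user_col] != 0:   # only use neighbors the user actually rated
            num += sim * A[j][user_col]
            den += sim
    # Fall back to the item's own average if no rated neighbor is available
    return round(num / den, 3) if den != 0 else item_Avg[item_row]

# # Example: same target as above, but ignoring unrated neighbors
# print(predict_rating(item, user))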