# epl_similar_players_app.py
# Streamlit app: find the most similar EPL players (2020/2021 season).
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
@st.cache_data
def load_data():
    """Read the player stats CSV and build a player-by-player similarity table.

    Reads 'pl_20-21.csv' from the current working directory, cleans it,
    standardizes the numeric columns and computes the pairwise cosine
    similarity between the resulting per-player feature vectors.

    Returns:
        A tuple ``(df, similarity_df)``:
        ``df`` is the cleaned frame (numeric columns are standardized, 'Name'
        is whitespace-stripped); ``similarity_df`` is a square frame of cosine
        similarities whose index and columns are both the player names.
    """
    # Missing values become 0 right away so later steps never see NaN.
    df = pd.read_csv('pl_20-21.csv').fillna(0)

    # Columns holding values such as "85%" are stored as text: drop the sign
    # and convert to float. Any cell that is not a string (e.g. a 0 written by
    # the fill above) comes out as NaN here and is zeroed again below.
    for column in list(df.columns):
        if df[column].astype(str).str.contains('%').any():
            df[column] = df[column].str.replace('%', '').astype(float)

    # Neutralize infinities and whatever NaN the conversion above produced.
    df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Feature columns: every float64/int64 column except 'Unnamed: 0'
    # (presumably a leftover row index from the CSV export - TODO confirm).
    feature_cols = [
        column
        for column in df.select_dtypes(include=['float64', 'int64']).columns
        if column != 'Unnamed: 0'
    ]

    # Standardize the features so no single statistic dominates the
    # similarity purely because of its scale.
    df[feature_cols] = StandardScaler().fit_transform(df[feature_cols])

    # One row vector per player -> square matrix of pairwise similarities.
    embeddings = df[feature_cols].to_numpy()
    pairwise = cosine_similarity(embeddings)

    # Clean the names before using them as labels on both axes.
    df['Name'] = df['Name'].str.strip()
    similarity_df = pd.DataFrame(pairwise, index=df['Name'], columns=df['Name'])

    return df, similarity_df
def get_similar_players(df, similarity_df, player_name, top_n=10):
    """Return the ``top_n`` players most similar to ``player_name``.

    Args:
        df: Player frame with at least the columns 'Name' and 'Position'.
        similarity_df: Square similarity frame whose index and columns are
            player names.
        player_name: Name of the reference player; must be a column of
            ``similarity_df``.
        top_n: Maximum number of similar players to return.

    Returns:
        A DataFrame indexed by player name with the columns 'Similarity' and
        'Position', sorted by descending similarity. The reference player is
        never included and at most ``top_n`` rows are returned.

    Raises:
        KeyError: If ``player_name`` is not a column of ``similarity_df``.
    """
    # Similarity of every player to the selected one.
    similarity_scores = similarity_df[player_name]

    # Remove the selected player BEFORE truncating. Taking top_n + 1 rows and
    # dropping the player afterwards only works when the player's own score is
    # guaranteed to rank inside that window; it is not when self-similarity is
    # 0 (all-zero feature vector) or when others tie with it, and the result
    # would then contain top_n + 1 rows.
    other_players = similarity_scores[similarity_scores.index != player_name]
    most_similar_players = other_players.sort_values(ascending=False).head(top_n)

    # Attach each player's position by joining on the player name.
    similar_players_df = pd.DataFrame(most_similar_players).join(
        df.set_index('Name')[['Position']]
    )
    similar_players_df.columns = ['Similarity', 'Position']
    return similar_players_df
# Page header.
st.title("EPL Player Similarity Finder (2020/2021 Season)")

# Cleaned player table plus the player-by-player similarity table.
df, similarity_df = load_data()

# Inputs: the reference player and how many neighbours to list.
player_name = st.selectbox('Select a player', df['Name'].unique())
top_n = st.slider(
    'Select number of similar players to display',
    min_value=1,
    max_value=50,
    value=10,
)

# Only compute and render the table once the user asks for it.
if st.button('Get Similar Players'):
    st.write(get_similar_players(df, similarity_df, player_name, top_n))