-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcoding_using_python.py
103 lines (75 loc) · 2.92 KB
/
coding_using_python.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# -*- coding: utf-8 -*-
"""coding using python
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1tvjb7dZTvvsOsdpGwW5vidR7TL6PQ2zQ
"""
# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
eps = np.finfo(float).eps
from numpy import log2 as log
from pprint import pprint
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import matplotlib.style
# %matplotlib inline
import seaborn as sns; sns.set() # for plot styling
from scipy import stats
# Mount Google Drive (Colab-only) so the dataset archive can be read from the
# runtime's file system.
from google.colab import drive
drive.mount("/content/drive")
import pandas as pd
# read_csv infers zip compression from the ".zip" suffix; this relies on the
# archive containing exactly one data file.
df = pd.read_csv('/content/drive/MyDrive/health care analysis/archive (1).zip')
# First look at the data. A bare expression such as `df.head()` is only
# rendered when it is the last line of a notebook cell, so print the results
# to keep them visible when this file runs as a plain script.
print(df.head())
print(df.tail())
print(df.shape)
# DataFrame.info() writes its summary to stdout itself.
df.info()
# Split the column names by dtype: 64-bit integer/float columns on one side,
# object (string-like) columns on the other.
numerical_vars = df.select_dtypes(
    include=['int64', 'float64'],
).columns.to_list()
categorical_vars = df.select_dtypes(
    include=['object'],
).columns.to_list()
print('Numerical variables:', numerical_vars)
print('Categorical variables:', categorical_vars)
# Count the number of categorical and numerical variables.
# Both counts are taken from the column lists computed earlier in the script
# so the totals always agree with the names that were printed. Counting
# "every column that is not object" as numerical would also include bool or
# datetime columns that the int64/float64 list leaves out.
categorical_count = len(categorical_vars)
numerical_count = len(numerical_vars)
print(f"Number of categorical variables: {categorical_count}")
print(f"Number of numerical variables: {numerical_count}")
# Unique values for categorical features
print(df.select_dtypes(include=['object']).nunique())
# Summary statistics, transposed to one row per column and rounded to two
# decimals. Printed explicitly: as a bare expression the result is discarded
# when the file runs outside a notebook cell.
print(df.describe().T.round(2))
# Missing values per column: absolute count and percentage of all rows.
# The percentage is the mean of the boolean null mask, i.e. nulls divided by
# the actual number of rows, instead of a hard-coded 768 that is only correct
# for one specific dataset size.
missing_data = pd.DataFrame({
    'total_missing': df.isnull().sum(),
    'perc_missing': df.isnull().mean() * 100,
})
# Printed explicitly so the table is visible when run as a script.
print(missing_data)
# Correlation matrix
# Select only the numeric columns from the DataFrame before correlating:
# since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns by
# default and raises when it meets string data. The earlier bare `df.corr()`
# on the full frame was redundant with this computation, so the matrix is
# computed once and printed.
numeric_columns = df.select_dtypes(include=['number'])
correlation_matrix = numeric_columns.corr()
print(correlation_matrix)
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='rainbow', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix')
# Show the heatmap on its own, like every other figure in this script,
# instead of leaving it open until the next plt.show() call.
plt.show()
# Draw one box per column on a single set of axes, in green, with the tick
# labels rotated by 45 degrees.
df.plot.box(rot=45, color='green')
plt.show()
# Keep only the 64-bit integer/float columns, then draw a separate box plot
# for each of them on its own 10x6 figure.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
for column in numeric_cols:
    plt.figure(figsize=(10, 6))
    # sns.boxplot returns the Axes it drew on, so the title is set on it directly.
    sns.boxplot(x=df[column], palette='rainbow').set_title(f'Boxplot of {column}')
    plt.show()
# Balance the two classes by downsampling the Outcome == 0 rows.
df_d0 = df[df['Outcome'] == 0]
df_d1 = df[df['Outcome'] == 1]
# Size the sample from the data instead of a hard-coded 268: draw as many
# class-0 rows as there are class-1 rows. The size is capped at len(df_d0)
# because sampling without replacement cannot draw more rows than exist.
# NOTE(review): no random_state is set, so the sample differs between runs.
df_d0_samp = df_d0.sample(n=min(len(df_d0), len(df_d1)), replace=False)
df_bal = pd.concat([df_d1, df_d0_samp])
def look_at_distr_hist(*args, df_num=None, df_cat=None, class_feature="Outcome"):
    """Plot a histogram with a KDE curve for every column of ``df_num``.

    Each column gets its own subplot on one 20x15 figure, with the bars
    split ("dodged") by the values of ``class_feature``.

    Args:
        *args: Accepted for backward compatibility; ignored.
        df_num: DataFrame whose columns are plotted. It must contain the
            ``class_feature`` column. Nothing is drawn when it is ``None``.
        df_cat: Accepted for backward compatibility; currently unused.
        class_feature: Name of the column passed as ``hue``. The fixed
            palette colours class 0 blue and class 1 red.

    Returns:
        None. The subplots are drawn on a new matplotlib figure.

    Raises:
        KeyError: If ``class_feature`` is not a column of ``df_num``.
    """
    if df_num is None:
        return

    columns = list(df_num.columns)
    n_cols = 3
    # Keep the original 3x3 layout for up to nine columns and add rows beyond
    # that, so wider frames no longer overflow the subplot grid.
    n_rows = max(3, (len(columns) + n_cols - 1) // n_cols)

    plt.figure(figsize=[20, 15])
    for position, column in enumerate(columns, start=1):
        plt.subplot(n_rows, n_cols, position)
        # Plot the frame that was passed in. Reading the module-level `df`
        # here silently ignored the `df_num` argument.
        sns.histplot(
            data=df_num,
            x=df_num[column],
            hue=df_num[class_feature],
            multiple='dodge',
            palette={0: 'blue', 1: 'red'},
            kde=True,
        )
        plt.title(f'--- "{column}" per count ---')
# Pass the class-balanced frame to the histogram helper.
look_at_distr_hist(df_num=df_bal, class_feature="Outcome")
# Render the figure explicitly: outside an inline notebook backend nothing is
# displayed until plt.show() is called.
plt.show()