# coding_using_python.py

# -*- coding: utf-8 -*-
"""coding using python

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1tvjb7dZTvvsOsdpGwW5vidR7TL6PQ2zQ
"""

# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
eps = np.finfo(float).eps
from numpy import log2 as log
from pprint import pprint
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import matplotlib.style
# %matplotlib inline
import seaborn as sns; sns.set() # for plot styling
from scipy import stats

# Mount Google Drive so the dataset stored under MyDrive can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

import pandas as pd

# The dataset is read straight from the zip archive on the mounted drive.
DATASET_PATH = '/content/drive/MyDrive/health care analysis/archive (1).zip'
df = pd.read_csv(DATASET_PATH)

# First look at the data: leading rows, trailing rows, dimensions, and the
# per-column dtype / non-null summary. The bare expressions are only
# displayed when run as notebook cells.
df.head()

df.tail()

df.shape

df.info()

# Split the columns by dtype once; every summary below is derived from this
# single split so the printed names and the printed counts cannot disagree.
numerical_vars = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_df = df.select_dtypes(include=['object'])
categorical_vars = categorical_df.columns.tolist()
print('Numerical variables:', numerical_vars)
print('Categorical variables:', categorical_vars)

# Count the number of categorical and numerical variables.
# Previously the numerical count used exclude='object', which also counts
# bool/datetime/other-width columns that are absent from numerical_vars.
categorical_count = len(categorical_vars)
numerical_count = len(numerical_vars)

print(f"Number of categorical variables: {categorical_count}")
print(f"Number of numerical variables: {numerical_count}")

# Unique values for categorical features
print(categorical_df.nunique())

# Descriptive statistics, one row per column, rounded to two decimals.
df.describe().T.round(2)

# Missing values per column: absolute count and percentage of rows.
# The percentage uses the mean of the null mask (count / number of rows)
# instead of a hard-coded row count, so it stays correct for any dataset size.
missing_mask = df.isnull()
missing_data = pd.DataFrame({
    'total_missing': missing_mask.sum(),
    'perc_missing': missing_mask.mean() * 100,
})
missing_data

# Pairwise correlations. Restricted to numeric columns because
# DataFrame.corr() raises on non-numeric data in recent pandas versions.
df.select_dtypes(include=['number']).corr()

# Correlation heatmap, computed over the numeric columns only.
numeric_columns = df.select_dtypes(include=['number'])
correlation_matrix = numeric_columns.corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='rainbow',
    fmt=".2f",
    linewidths=0.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix')

# Box plot of every column on one set of axes, with the tick labels
# rotated 45 degrees; this opens its own figure.
df.plot.box(rot=45, color='green')

# Render both figures.
plt.show()

# Draw one standalone box plot per int64/float64 column.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

for column in numeric_cols:
    box_fig, box_ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x=df[column], palette='rainbow', ax=box_ax)
    box_ax.set_title(f'Boxplot of {column}')
    plt.show()

# Split the rows by class label.
df_d0 = df[df['Outcome'] == 0]
df_d1 = df[df['Outcome'] == 1]

# Undersample class 0 down to the size of class 1 so both classes are equally
# represented. The sample size is taken from the data rather than hard-coded,
# so the result stays balanced if the dataset changes. Sampling is without
# replacement, so this raises ValueError if class 0 has fewer rows than class 1.
df_d0_samp = df_d0.sample(n=len(df_d1), replace=False)
df_bal = pd.concat([df_d1, df_d0_samp])

def look_at_distr_hist(*args, df_num=None, df_cat=None, class_feature="Outcome"):
    """Plot a histogram with KDE for every column of ``df_num``, split by class.

    Each column gets its own subplot in a three-column grid; bars are dodged
    and coloured by the value of ``class_feature`` (0 -> blue, 1 -> red).

    Args:
        *args: Accepted for backward compatibility; not used.
        df_num: DataFrame whose columns are plotted. It must contain the
            ``class_feature`` column. If ``None`` or without columns, nothing
            is drawn.
        df_cat: Accepted for backward compatibility; not used.
        class_feature: Name of the column in ``df_num`` used as the hue.

    Raises:
        KeyError: If ``class_feature`` is not a column of ``df_num``.
    """
    if df_num is None or len(df_num.columns) == 0:
        return

    # Size the grid from the number of columns instead of assuming 3x3, so
    # frames with more than nine columns do not overflow the subplot grid.
    n_cols = 3
    n_rows = (len(df_num.columns) + n_cols - 1) // n_cols
    plt.figure(figsize=[20, 5 * n_rows])

    for position, column in enumerate(df_num.columns, start=1):
        plt.subplot(n_rows, n_cols, position)
        # Plot the frame that was passed in, not the module-level ``df``;
        # otherwise a balanced/filtered frame given by the caller is ignored.
        sns.histplot(
            data=df_num,
            x=df_num[column],
            hue=df_num[class_feature],
            multiple='dodge',
            palette={0: 'blue', 1: 'red'},
            kde=True,
        )
        plt.title(f'--- "{column}" per count ---')

    # Render the figure (a bare plt.plot() draws nothing).
    plt.show()
# Draw the class-split histograms, passing the balanced sample as df_num.
look_at_distr_hist(
    df_num=df_bal,
    class_feature="Outcome",
)