# .gitignore

# Road Traffic Accidents

# import library
import pandas as pd
import numpy as np

# Load the accident records from the CSV file; the relative path is resolved
# against the current working directory.
dataset_path = "RTA Dataset.csv"
df = pd.read_csv(dataset_path)
df

# Preview the first five rows.
df.head(5)

# Preview the last five rows.
df.tail(5)

# Remove the two casualty columns, one at a time and in place, so the same
# DataFrame object keeps being used by the rest of the script.
for unused_column in ('Sex_of_casualty', 'Casualty_class'):
    df.drop(columns=unused_column, inplace=True)

# Column dtypes before the conversion below.
df.dtypes

# Parse the 'Time' strings (HH:MM:SS) so the .dt accessor can be used on them
# later. The result is a datetime64 column, not a pure time-of-day dtype.
time_format = '%H:%M:%S'
df['Time'] = pd.to_datetime(df['Time'], format=time_format)

# Column dtypes after the conversion.
df.dtypes

# Is there at least one missing value anywhere in the frame?
# (reduce per column first, then across the columns)
df.isnull().any().any()

# Number of missing values in each column.
df.isnull().sum()

# Discard every row that has a missing value; done in place so `df` stays the
# same object.
df.dropna(axis=0, inplace=True)

# Does any row exactly repeat an earlier one? (checked only, nothing is removed)
df.duplicated().any()

# Dimensions of the dataset after the cleaning steps above.
rows = len(df.index)
columns = len(df.columns)
print(f"Number of rows are : {rows}")
print(f"Number of columns are : {columns}")

# Summary statistics for all columns, numeric and non-numeric alike.
df.describe(include='all')

# 1. What is the most common day of the week for accidents?
# Tally accidents per weekday once, then read both the top day and its tally
# from that single result.
day_counts = df['Day_of_week'].value_counts()
most_common_day = day_counts.idxmax()
most_common_day_count = day_counts.max()
print(f"The most common day of the week for accidents is {most_common_day} and count of accidents are {most_common_day_count}.")

# 2. Which age band of drivers is involved in the most accidents?
# Same approach: one tally, two look-ups.
age_band_counts = df['Age_band_of_driver'].value_counts()
most_common_age_band = age_band_counts.idxmax()
most_common_age_band_count = age_band_counts.max()
print(f"The most common age band of drivers is {most_common_age_band} with count {most_common_age_band_count}.")

# 3. What is the distribution of accidents by the type of vehicle involved?
vehicle_types = df['Type_of_vehicle']
vehicle_distribution = vehicle_types.value_counts()
print(f"The distribution of accidents by type of vehicle involved : \n {vehicle_distribution}.")

# 4. How do accidents vary by the gender of the driver?
# Group sizes are listed in key order, not by frequency.
gender = df.groupby('Sex_of_driver').size()
print(f"The distribution of accidents by the gender of the driver is : \n {gender}")

# 5. What are the top 5 most common causes of accidents?
# value_counts() is already ordered from most to least frequent, so the first
# five entries are the five most common causes.
cause_counts = df['Cause_of_accident'].value_counts()
most_common_causes = cause_counts.head(5)
print(f"The most common causes of accidents are : \n\n {most_common_causes}")

# 6. Which educational level of drivers is associated with the most accidents?
education_level = df['Educational_level'].value_counts()
print("The distribution of accidents by the educational level of drivers is :\n", education_level, sep="\n")

# 7. What are the peak hours for accidents?
# Pull the hour component out of the parsed timestamps (the .dt accessor
# exposes datetime properties) and keep it as its own column for later steps.
accident_hours = df['Time'].dt.hour
df['Hour'] = accident_hours
# Tally accidents per hour, then order the tally by hour instead of by
# frequency so the output reads chronologically.
hourly_counts = df['Hour'].value_counts()
peak_hours = hourly_counts.sort_index()
print("The distribution of accidents by hour of the day is :\n", peak_hours, sep="\n")

"""
This means that:
At 0:00 (midnight), there were 66 accidents.
At 1:00 AM, there were 46 accidents.
And so on for the remaining hours.
The hour with the highest number of accidents was 5:00 PM (17:00).
"""

# 8. Is there a correlation between the time of day and the number of vehicles involved in an accident?
# NOTE: this reports the mean number of vehicles for each hour (rounded to two
# decimals); it does not compute a correlation coefficient.
vehicles_grouped_by_hour = df.groupby('Hour')['Number_of_vehicles_involved']
vehicles_involved_by_hour = vehicles_grouped_by_hour.mean().round(2)
print("The average number of vehicles involved in accidents by hour of the day is :\n", vehicles_involved_by_hour, sep="\n")

# 9. How do weather conditions affect the frequency of accidents?
weather_column = df['Weather_conditions']
weather_conditions = weather_column.value_counts()
print("The distribution of accidents by weather conditions is :\n", weather_conditions, sep="\n")

# 10. What is the distribution of accidents based on the driving experience of the drivers?
# The tally is evaluated but neither stored nor printed, so it is only visible
# when this runs in an interactive session.
experience_column = df['Driving_experience']
experience_column.value_counts()

# 11. Group the data by the type of vehicle and analyze the average driving experience of drivers involved in accidents.

# Show which experience labels actually occur in the data.
unique_val = df['Driving_experience'].unique()
print(unique_val)

# Representative number of years for each experience band, so the bands can
# be averaged. Labels missing from this table become NaN after .map().
Driving_experience_mapping = {
    'Below 1yr': 0.5,
    '1-2yr': 1.5,
    '2-5yr': 3.5,
    '5-10yr': 7.5,
    'Above 10yr': 12,  # open-ended band; 12 is the value chosen to stand in for it
}
experience_in_years = df['Driving_experience'].map(Driving_experience_mapping)
df['Driving_experience_numeric'] = experience_in_years
df.head()

# Mean experience per vehicle type, rounded to whole years.
experience_by_vehicle = df.groupby("Type_of_vehicle")["Driving_experience_numeric"]
avg_experience_by_vehicle = experience_by_vehicle.mean().round(0)
print(avg_experience_by_vehicle)

# 12. Group accidents by road surface type and determine which type of road surface has the highest frequency of accidents.
# value_counts() lists the surface types from most to least frequent, so the
# first entry is the one with the most accidents.
road_surface_column = df['Road_surface_type']
accidents_by_road_surface = road_surface_column.value_counts()
print("\nFrequency of accidents by road surface type:", accidents_by_road_surface, sep="\n")

# 13. Create a cross-tabulation of accidents by the day of the week and light conditions.
# Rows are weekdays, columns are light conditions, cells are accident counts.
cross_tab_day_light = pd.crosstab(
    index=df['Day_of_week'],
    columns=df['Light_conditions'],
)
print("Cross-tabulation of accidents by day of the week and light conditions:", cross_tab_day_light, sep="\n")

# 14. Correlation between the age band of the driver and the number of vehicles involved in accidents

# Distinct age-band labels present in the data.
df['Age_band_of_driver'].unique()

# One representative age per band so the band can take part in a numeric
# correlation; 'Unknown' is deliberately left without a value.
age_band_mapping = {
    'Under 18': 17,
    '18-30': 24,
    '31-50': 40,
    'Over 51': 55,
    'Unknown': None,
}
numeric_age_band = df['Age_band_of_driver'].map(age_band_mapping)
df['Age_band_numeric'] = numeric_age_band
df.head()

# Pairwise correlation matrix of the two numeric columns.
correlation_columns = ['Age_band_numeric', 'Number_of_vehicles_involved']
age_vehicles_correlation = df[correlation_columns].corr()
print("Correlation between age band of driver and number of vehicles involved :\n", age_vehicles_correlation, sep="\n")

# 15. Correlation between driving experience and the cause of accidents

# Print every distinct accident cause, one per line.
accident_types = df['Cause_of_accident'].unique()
for cause_label in accident_types:
    print(cause_label)

# Coarse category -> the individual causes that belong to it.
_causes_by_category = {
    'Human mistake': (
        'Overtaking',
        'No priority to vehicle',
        'Changing lane to the right',
        'Moving Backward',
        'Changing lane to the left',
        'No distancing',
        'No priority to pedestrian',
        'Overturning',
        'Driving carelessly',
        'Turnover',
        'Driving to the left',
        'Driving at high speed',
        'Driving under the influence of drugs',
        'Getting off the vehicle improperly',
        'Overspeed',
        'Drunk driving',
        'Improper parking',
    ),
    'Other': (
        'Other',
        'Unknown',
    ),
    'Vehicle-related': (
        'Overloading',
    ),
    'Environmental': (
        'Rainy weather',
    ),
}

# Invert the table into the cause -> category lookup that .map() needs.
accident_Types_cause_mapping = {
    cause: category
    for category, causes in _causes_by_category.items()
    for cause in causes
}
# Causes absent from the lookup end up as NaN in the new column.
cause_category = df['Cause_of_accident'].map(accident_Types_cause_mapping)
df['accident_Cause_category'] = cause_category
df.head()

# Correlation between driving experience and the cause of accidents
# (expressed as a table of counts: experience band x cause category)
experience_cause_correlation = pd.crosstab(
    index=df['Driving_experience'],
    columns=df['accident_Cause_category'],
)
print("Correlation between driving experience and cause of accidents :\n", experience_cause_correlation, sep="\n")

# 16. Explore the trend of accidents involving male vs. female drivers over different age bands.
# Rows are driver age bands, columns are driver sex, cells are accident counts.
accidents_by_gender_age = pd.crosstab(
    index=df['Age_band_of_driver'],
    columns=df['Sex_of_driver'],
)
print("Trend of accidents involving male vs. female drivers over different age bands :\n", accidents_by_gender_age, sep="\n")

# 17. Pivot Table
# Total number of vehicles involved, with driver sex and weather conditions
# on the rows and educational level on the columns.
#
# The aggregation is named by the string 'sum' rather than passed as the
# NumPy callable np.sum: pandas 2.1+ deprecates NumPy callables here and
# recommends the string form to keep the current behaviour. Keeping it inside
# a list preserves the extra 'sum' level in the column labels.
# margins=True appends the 'All' row and column holding the totals.
vehicles_pivot = pd.pivot_table(
    df,
    index=['Sex_of_driver', 'Weather_conditions'],
    columns=['Educational_level'],
    values=['Number_of_vehicles_involved'],
    aggfunc=['sum'],
    margins=True,
)
# Bare expression kept so an interactive session still displays the table.
vehicles_pivot