-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path.gitignore
197 lines (156 loc) · 7.06 KB
/
.gitignore
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# Road Traffic Accidents
# import library
import pandas as pd
import numpy as np
# Load the road-traffic-accident records from the CSV file.
df = pd.read_csv("RTA Dataset.csv")
df

# Preview the first five and the last five rows.
df.head()
df.tail()

# Remove the two casualty columns from the frame, one at a time.
for _column in ('Sex_of_casualty', 'Casualty_class'):
    del df[_column]

# Column dtypes before the conversion below.
df.dtypes

# Parse the 'Time' strings (HH:MM:SS) into datetime values so that the
# .dt accessor can be used on the column later on.
_parsed_time = pd.to_datetime(df['Time'], format='%H:%M:%S')
df['Time'] = _parsed_time
df.dtypes
# Is there at least one missing value anywhere in the frame?
_null_mask = df.isnull()
_null_mask.values.any()

# Number of missing values in each column.
_null_mask.sum()

# Discard every row that contains a missing value (modifies df in place).
df.dropna(inplace=True)

# Is any row an exact duplicate of an earlier row?
df.duplicated().any()

# Report the dimensions of the dataset after the rows were dropped.
_shape = df.shape
rows = _shape[0]
columns = _shape[1]
print(f"Number of rows are : {rows}")
print(f"Number of columns are : {columns}")

# Summary statistics for every column, numeric or not.
df.describe(include='all')
# 1. What is the most common day of the week for accidents?
# Count the accidents per weekday once, then read both the label and the
# size of the largest group from that single tally.
_day_counts = df['Day_of_week'].value_counts()
most_common_day = _day_counts.idxmax()
most_common_day_count = _day_counts.max()
print(f"The most common day of the week for accidents is {most_common_day} and count of accidents are {most_common_day_count}.")

# 2. Which age band of drivers is involved in the most accidents?
# Same approach: one tally per age band, then its top label and count.
_age_band_counts = df['Age_band_of_driver'].value_counts()
most_common_age_band = _age_band_counts.idxmax()
most_common_age_band_count = _age_band_counts.max()
print(f"The most common age band of drivers is {most_common_age_band} with count {most_common_age_band_count}.")
# 3. What is the distribution of accidents by the type of vehicle involved?
# Number of accidents recorded for each vehicle type.
_vehicle_types = df['Type_of_vehicle']
vehicle_distribution = _vehicle_types.value_counts()
print(f"The distribution of accidents by type of vehicle involved : \n {vehicle_distribution}.")

# 4. How do accidents vary by the gender of the driver?
# Group sizes, indexed by the driver's recorded sex.
_by_driver_sex = df.groupby(['Sex_of_driver'])
gender = _by_driver_sex.size()
print(f"The distribution of accidents by the gender of the driver is : \n {gender}")

# 5. What are the top 5 most common causes of accidents?
# value_counts() sorts by descending frequency, so the first five rows are
# the five most frequent causes.
_cause_counts = df['Cause_of_accident'].value_counts()
most_common_causes = _cause_counts.iloc[:5]
print(f"The most common causes of accidents are : \n\n {most_common_causes}")

# 6. Which educational level of drivers is associated with the most accidents?
# Number of accidents recorded for each educational level.
_education = df['Educational_level']
education_level = _education.value_counts()
print("The distribution of accidents by the educational level of drivers is :\n")
print(education_level)
# 7. What are the peak hours for accidents?
# Extract the hour component of each timestamp through the .dt accessor and
# store it as a new 'Hour' column.
_hour_of_day = df['Time'].dt.hour
df['Hour'] = _hour_of_day

# Tally accidents per hour, then order the tally by hour rather than by count.
_hour_counts = df['Hour'].value_counts()
peak_hours = _hour_counts.sort_index()
print("The distribution of accidents by hour of the day is :\n")
print(peak_hours)
"""
This means that:
At 0:00 (midnight), there were 66 accidents.
At 1:00 AM, there were 46 accidents.
And so on.......
most of the accidents happened at 5:00 PM
"""
# 8. Is there a correlation between the time of day and the number of vehicles involved in an accident?
# Mean number of vehicles per accident for each hour, rounded to 2 decimals.
_vehicles_grouped_by_hour = df.groupby('Hour')['Number_of_vehicles_involved']
_mean_vehicles = _vehicles_grouped_by_hour.mean()
vehicles_involved_by_hour = _mean_vehicles.round(2)
print("The average number of vehicles involved in accidents by hour of the day is :\n")
print(vehicles_involved_by_hour)

# 9. How do weather conditions affect the frequency of accidents?
# Number of accidents recorded under each weather condition.
_weather = df['Weather_conditions']
weather_conditions = _weather.value_counts()
print("The distribution of accidents by weather conditions is :\n")
print(weather_conditions)

# 10. What is the distribution of accidents based on the driving experience of the drivers?
# Number of accidents per driving-experience band (result is not stored).
_experience = df['Driving_experience']
_experience.value_counts()
# 11. Group the data by the type of vehicle and analyze the average driving experience of drivers involved in accidents.
# Show the distinct labels in the 'Driving_experience' column.
_experience_column = df['Driving_experience']
unique_val = _experience_column.unique()
print(unique_val)
# Numeric stand-in (in years) for each driving-experience band, so that the
# bands can be averaged. Bounded bands use their midpoint; 'Above 10yr' is
# assigned 12.
Driving_experience_mapping = dict(
    zip(
        ('Below 1yr', '1-2yr', '2-5yr', '5-10yr', 'Above 10yr'),
        (0.5, 1.5, 3.5, 7.5, 12),
    )
)
# Translate each experience band into its numeric value and store the result
# in a new column.
_experience_years = df['Driving_experience'].map(Driving_experience_mapping)
df['Driving_experience_numeric'] = _experience_years
df.head()

# Mean numeric experience per vehicle type, rounded to whole years.
_experience_by_vehicle = df.groupby("Type_of_vehicle")["Driving_experience_numeric"]
avg_experience_by_vehicle = _experience_by_vehicle.mean().round(0)
print(avg_experience_by_vehicle)
# 12. Group accidents by road surface type and determine which type of road surface has the highest frequency of accidents.
# value_counts() lists the surface types from most to least frequent.
_road_surface = df['Road_surface_type']
accidents_by_road_surface = _road_surface.value_counts()
print("\nFrequency of accidents by road surface type:")
print(accidents_by_road_surface)

# 13. Create a cross-tabulation of accidents by the day of the week and light conditions.
# Rows are weekdays, columns are light conditions, cells are accident counts.
cross_tab_day_light = pd.crosstab(
    index=df['Day_of_week'],
    columns=df['Light_conditions'],
)
print("Cross-tabulation of accidents by day of the week and light conditions:")
print(cross_tab_day_light)
# 14. Correlation between the age band of the driver and the number of vehicles involved in accidents
# Inspect the distinct age-band labels present in the data (result is not stored).
_age_band_column = df['Age_band_of_driver']
_age_band_column.unique()
# One representative age per age band, so that the band can take part in a
# numeric correlation. 'Unknown' maps to None, i.e. it gets no numeric value.
age_band_mapping = dict(
    zip(
        ('Under 18', '18-30', '31-50', 'Over 51', 'Unknown'),
        (17, 24, 40, 55, None),
    )
)
# Translate each age band into its numeric value and store it in a new column.
_age_numeric = df['Age_band_of_driver'].map(age_band_mapping)
df['Age_band_numeric'] = _age_numeric
df.head()

# Pairwise correlation matrix of the numeric age and the vehicle count.
_correlation_columns = ['Age_band_numeric', 'Number_of_vehicles_involved']
age_vehicles_correlation = df[_correlation_columns].corr()
print("Correlation between age band of driver and number of vehicles involved :\n")
print(age_vehicles_correlation)
# 15. Correlation between driving experience and the cause of accidents
# Collect the distinct values of 'Cause_of_accident' and print them one per
# line, so they can be checked against the keys of the category mapping.
accident_types = df['Cause_of_accident'].unique()
for x in accident_types:
    # The loop body must be indented; without it Python raises an
    # IndentationError before anything in the file runs.
    print(x)
# Coarse category assigned to each raw accident-cause label. The four
# category names are spelled once here and reused in the mapping below.
_HUMAN_MISTAKE = 'Human mistake'
_OTHER = 'Other'
_VEHICLE_RELATED = 'Vehicle-related'
_ENVIRONMENTAL = 'Environmental'

# Raw cause label -> coarse category.
accident_Types_cause_mapping = {
    'Overtaking': _HUMAN_MISTAKE,
    'No priority to vehicle': _HUMAN_MISTAKE,
    'Changing lane to the right': _HUMAN_MISTAKE,
    'Moving Backward': _HUMAN_MISTAKE,
    'Changing lane to the left': _HUMAN_MISTAKE,
    'No distancing': _HUMAN_MISTAKE,
    'Other': _OTHER,
    'No priority to pedestrian': _HUMAN_MISTAKE,
    'Unknown': _OTHER,
    'Overturning': _HUMAN_MISTAKE,
    'Driving carelessly': _HUMAN_MISTAKE,
    'Turnover': _HUMAN_MISTAKE,
    'Driving to the left': _HUMAN_MISTAKE,
    'Driving at high speed': _HUMAN_MISTAKE,
    'Driving under the influence of drugs': _HUMAN_MISTAKE,
    'Getting off the vehicle improperly': _HUMAN_MISTAKE,
    'Overspeed': _HUMAN_MISTAKE,
    'Drunk driving': _HUMAN_MISTAKE,
    'Overloading': _VEHICLE_RELATED,
    'Improper parking': _HUMAN_MISTAKE,
    'Rainy weather': _ENVIRONMENTAL,
}
# Label every accident with the coarse category of its cause and store the
# result in a new column.
_cause_category = df['Cause_of_accident'].map(accident_Types_cause_mapping)
df['accident_Cause_category'] = _cause_category
df.head()

# Correlation between driving experience and the cause of accidents
# Contingency table: experience bands as rows, cause categories as columns,
# accident counts in the cells.
experience_cause_correlation = pd.crosstab(
    index=df['Driving_experience'],
    columns=df['accident_Cause_category'],
)
print("Correlation between driving experience and cause of accidents :\n")
print(experience_cause_correlation)
# 16. Explore the trend of accidents involving male vs. female drivers over different age bands.
# Contingency table: age bands as rows, driver sex as columns, accident
# counts in the cells.
accidents_by_gender_age = pd.crosstab(
    index=df['Age_band_of_driver'],
    columns=df['Sex_of_driver'],
)
print("Trend of accidents involving male vs. female drivers over different age bands :\n")
print(accidents_by_gender_age)
# 17. Pivot Table
# Total number of vehicles involved, with driver sex and weather conditions
# as the row index and educational level as the columns; margins=True adds
# the "All" totals row and column.
pd.pivot_table(
    df,
    index=['Sex_of_driver', 'Weather_conditions'],
    columns=['Educational_level'],
    values=['Number_of_vehicles_involved'],
    # Use the string alias instead of the np.sum callable: passing the NumPy
    # function is deprecated in recent pandas (FutureWarning), while 'sum'
    # selects the same aggregation and yields the same 'sum' column label.
    aggfunc=['sum'],
    margins=True,
)