-
Notifications
You must be signed in to change notification settings - Fork 0
/
assignment1.py
119 lines (95 loc) · 3.38 KB
/
assignment1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#
# TODO: Import whatever needs to be imported to make this work
#
# .. your code here ..
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from sklearn.cluster import KMeans
matplotlib.style.use('ggplot') # Select the 'ggplot' style sheet for the figures drawn below
#
# TODO: To procure the dataset, follow these steps:
# 1. Navigate to: https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-present/ijzp-q8t2
# 2. In the 'Primary Type' column, click on the 'Menu' button next to the info button,
# and select 'Filter This Column'. It might take a second for the filter option to
# show up, since it has to load the entire list first.
# 3. Scroll down to 'GAMBLING'
# 4. Click the light blue 'Export' button next to the 'Filter' button, and select 'Download As CSV'
# Load the exported crime records. The path is a raw string because it is a
# Windows path: its backslashes must be kept literally rather than being
# parsed as (invalid) escape sequences such as '\l' or '\D'.
df = pd.read_csv(r'D:\learning\DAT210x-master\Module5\Crimes_-_2001_to_present.csv')
def doKMeans(df):
    """Plot the incident locations in ``df`` and overlay seven K-Means centroids.

    The raw points are drawn first (Longitude on x, Latitude on y), then
    K-Means is fitted on the Latitude/Longitude columns and the resulting
    cluster centers are printed and drawn on the same axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame exposing ``Longitude`` and ``Latitude`` columns. Any other
        columns are ignored.
    """
    #
    # INFO: Plot your data with a '.' marker, with 0.3 alpha at the Longitude,
    # and Latitude locations in your dataset. Longitude = x, Latitude = y
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(df.Longitude, df.Latitude, marker='.', alpha=0.3)

    #
    # Filter df so that only Latitude and Longitude remain, since the other
    # columns are not applicable for this purpose. Rows with a missing
    # coordinate are dropped before fitting.
    coords = df[['Latitude', 'Longitude']].dropna()

    #
    # Use K-Means to find seven cluster centers in the coordinates.
    kmeans = KMeans(n_clusters=7)
    kmeans.fit(coords)

    #
    # INFO: Print and plot the centroids...
    # Each centroid row follows the column order used for fitting, i.e.
    # (Latitude, Longitude), so column 1 is x and column 0 is y.
    centroids = kmeans.cluster_centers_
    print(centroids)
    ax.scatter(centroids[:, 1], centroids[:, 0], marker='o', c='r',
               alpha=0.5, linewidths=3, s=169)
    plt.show()
#
# TODO: Load your dataset after importing Pandas
#
# The dataset is read into `df` by the pd.read_csv call earlier in this file.

#
# TODO: Drop any ROWs with nans in them
#
# An explicit copy is taken so that the Date assignment below modifies a
# frame of its own.
cd = df.dropna().copy()

#
# TODO: Print out the dtypes of your dset
#
print(cd.dtypes)

#
# Coerce the 'Date' feature (which is currently a string object) into real date,
# and confirm by re-printing the dtypes. NOTE: This is a slow process...
#
# errors='coerce' is kept from the original call.
cd['Date'] = pd.to_datetime(cd['Date'], errors='coerce')
print(cd.dtypes)

# INFO: Print & Plot your data
doKMeans(cd)

#
# TODO: Filter out the data so that it only contains samples that have
# a Date > '2011-01-01', using indexing. Then, in a new figure, plot the
# crime incidents, as well as a new K-Means run's centroids.
#
recent = cd[cd.Date > '2011-01-01']

# INFO: Print & Plot your data
# doKMeans opens a new figure, fits K-Means on the Latitude/Longitude columns
# and plots the centroids itself, so no separate fit is needed here.
doKMeans(recent)
plt.show()