-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
121 lines (106 loc) · 4.18 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import numpy as np
import random as rand
from math import sqrt
from datetime import datetime
def euclideanDistance(list1, list2):
    """Return the Euclidean distance between two points.

    list1 -- coordinates of the first point (sequence of numbers)
    list2 -- coordinates of the second point; it is indexed by the positions
             of list1, so it must be at least as long as list1 (a shorter
             list2 raises IndexError, extra trailing entries are ignored)

    Returns a float; two empty points are at distance 0.0.
    """
    squaredTotal = 0  # Named so the builtin sum() is not shadowed
    for index, coord in enumerate(list1):
        squaredTotal += (coord - list2[index]) ** 2
    return sqrt(squaredTotal)
# Debugging
def printClusSet(set):
    """Print every cluster on its own line as "Cluster # <index> :  <members>".

    set -- sequence of clusters, each cluster being a list of data row indices.
           (The parameter name shadows the builtin; it is kept so existing
           callers are unaffected.)

    A single pre-formatted string is passed to print() so the output is the
    same whether print is the Python 2 statement or the Python 3 function.
    """
    for clus, members in enumerate(set):
        print("Cluster # {} :  {}".format(clus, members))
# Update centroids based on the average of all the points in that cluster
def updateCentroids(dataSet, centSet, clusSet, k):
    """Move each centroid to the mean of the points assigned to its cluster.

    dataSet -- list of data points, each a list of numeric attributes
    centSet -- list of centroids; entry `clus` is replaced by a new list
    clusSet -- list of clusters, each a list of row indices into dataSet
    k       -- number of clusters; indices 0 .. k-1 are processed

    Returns None. centSet is modified in place; a cluster with no members
    keeps its previous centroid.
    """
    for clus in range(k):
        members = clusSet[clus]
        if not members:
            continue  # Nothing to average: leave this centroid where it is

        # Per-attribute running totals, started as floats so the division
        # below is never an integer division.
        totals = [0.0] * len(dataSet[0])
        for dataRow in members:
            totals = [attr + total for attr, total in zip(dataSet[dataRow], totals)]

        memberCount = len(members)
        centSet[clus] = [total / memberCount for total in totals]
# Module-level state shared by the loading and clustering steps below
dataSet = []    # Normalized data points, one list of floats per row
centSet = []    # Current centroid of each cluster
clusSet = []    # For each cluster, the indices of its rows in dataSet
maxValues = []  # Largest absolute value seen in each attribute column

# Seed from the operating system / clock to get different random numbers each
# time. A datetime object is not an accepted seed type on newer Python
# versions (it raises TypeError), so no explicit seed value is passed.
rand.seed()
# Read the data file once, keep every complete row, then normalize each
# attribute column by the largest absolute value found in that column.
rawRows = []
with open("dow_jones_index.data", "r") as dataFile:
    dataFile.readline()  # Ignore first row (header)
    for line in dataFile:
        # Strip the actual line terminator (\n or \r\n) rather than slicing a
        # fixed two characters off the last column, which would eat a digit
        # whenever the line does not end in exactly \r\n.
        attrs = line.rstrip("\r\n").split(",")[3:]  # First three columns are not used
        if not attrs or '' in attrs:  # Ignore blank lines and rows with missing values
            continue
        rawRows.append([float(attr) for attr in attrs])

# Largest magnitude per column; signs are dropped only for this comparison
maxValues = []
for row in rawRows:
    magnitudes = [abs(dataVal) for dataVal in row]
    if maxValues:
        maxValues = [max(maxVal, dataVal) for maxVal, dataVal in zip(maxValues, magnitudes)]
    else:
        maxValues = magnitudes

# Normalize by dividing by the column maximum. A maximum of zero means every
# value in that column is zero, so the normalized value is 0.0 as well
# (dividing would raise ZeroDivisionError).
dataSet = []
for row in rawRows:
    dataSet.append([dataVal / maxVal if maxVal else 0.0
                    for dataVal, maxVal in zip(row, maxValues)])
def _nearestCentroid(point, centroids, current):
    """Return the index of the centroid closest to point.

    point     -- list of attributes of one data point
    centroids -- list of centroids to compare against
    current   -- index used as the starting candidate; it is kept on ties, so
                 a point only moves when another centroid is strictly closer
    """
    nearest = current
    nearestDist = euclideanDistance(point, centroids[current])
    for cent, centroid in enumerate(centroids):
        currDist = euclideanDistance(point, centroid)
        if currDist < nearestDist:
            # Remember the new best distance as well as its index; comparing
            # against a stale distance would select the wrong centroid.
            nearest, nearestDist = cent, currDist
    return nearest


# Start iterations for clustering with dynamic k
# NOTE(review): block nesting below was reconstructed from a source whose
# indentation was lost -- confirm against the original file.
for k in range(1, len(dataSet)):
    # Use k randomly chosen data rows as the initial centroids
    centSet = [rand.choice(dataSet) for _ in range(k)]
    clusSet = [[] for _ in range(k)]

    # Initialize first set of clusters: every data point goes to its nearest centroid
    for dataRow, point in enumerate(dataSet):
        clusSet[_nearestCentroid(point, centSet, 0)].append(dataRow)

    # Update Centroids
    updateCentroids(dataSet, centSet, clusSet, k)

    # Start assignment process. Stop when all the points belong in the appropriate cluster.
    needsUpdate = True
    while needsUpdate:
        needsUpdate = False
        for clus in range(k):  # For every cluster
            # Walk a snapshot of the members: points are moved between
            # clusters inside the loop, and removing from the list being
            # iterated would skip entries.
            for dataRow in list(clusSet[clus]):
                assignClus = _nearestCentroid(dataSet[dataRow], centSet, clus)
                if assignClus != clus:
                    clusSet[clus].remove(dataRow)
                    clusSet[assignClus].append(dataRow)
                    needsUpdate = True
        # Debugging: uncomment the next line to print the clusters after each
        # update. Warning: this produces far too many numbers to read.
        # printClusSet(clusSet)
        updateCentroids(dataSet, centSet, clusSet, k)

    # Calculate IV: total distance from every point to its own centroid
    IV = 0
    for clus in range(k):
        for dataRow in clusSet[clus]:
            IV += euclideanDistance(dataSet[dataRow], centSet[clus])

    # Calculate EV: total distance between points of different clusters
    # (each pair of clusters counted once), divided by the number of points
    EV = 0
    for clus1 in range(k):
        for dataRow1 in clusSet[clus1]:
            for clus2 in range(clus1 + 1, k):
                for dataRow2 in clusSet[clus2]:
                    EV += euclideanDistance(dataSet[dataRow1], dataSet[dataRow2])
    EV /= len(dataSet)

    # EV is zero when all points share one cluster (always the case for
    # k = 1); skip the report then, since IV/EV would divide by zero.
    # NOTE(review): the separator line is assumed to belong to this branch.
    if EV != 0:
        # Single pre-formatted strings keep the output identical under the
        # Python 2 print statement and the Python 3 print function.
        print("K =  {}".format(k))
        printClusSet(clusSet)
        print("IV/EV = {}".format(IV / EV))
        print("=======================================================================")