# util.py
from scipy import stats
import numpy as np
# This function computes the entropy of a list of class labels, used for information gain.
def entropy(class_y):
    # Input:
    #   class_y : list of class labels (0's and 1's)
    #
    # Example:
    #   entropy([0,0,0,1,1,1,1,1,1]) = 0.92
    result = 0
    # Count how many times each class label occurs.
    freq = {}
    for label in class_y:
        if label in freq:
            freq[label] += 1
        else:
            freq[label] = 1
    # Convert counts to probabilities, then accumulate -p * log2(p).
    probs = []
    for count in freq.values():
        probs.append(count / len(class_y))
    for p in probs:
        result -= p * np.log2(p)
    return result
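# A minimal cross-check (hypothetical helper, not part of the original template):
# the same entropy computed with vectorized numpy, assuming class_y is non-empty.
def _entropy_check(class_y):
    _, counts = np.unique(class_y, return_counts=True)
    p = counts / len(class_y)
    return -np.sum(p * np.log2(p))
# e.g. _entropy_check([0,0,0,1,1,1,1,1,1]) -> 0.9183, matching entropy() above.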
def partition_classes(X, y, split_attribute, split_val):
    # Inputs:
    #   X               : data containing all attributes
    #   y               : labels
    #   split_attribute : column index of the attribute to split on
    #   split_val       : either a numerical or categorical value to divide the split_attribute
    #
    # Partition the data (X) and labels (y) based on the split value - BINARY SPLIT.
    #
    # First check whether the split attribute is numerical or categorical.
    # If the split attribute is numeric, split_val should be a numerical value;
    # for example, split_val could be the mean of the values of split_attribute.
    # If the split attribute is categorical, split_val should be one of the categories.
    #
    # Numeric split attribute:
    #   Split the data X into two lists (X_left and X_right) where the first list has all
    #   the rows where the split attribute is less than or equal to the split value, and the
    #   second list has all the rows where the split attribute is greater than the split
    #   value. Also create two lists (y_left and y_right) with the corresponding y labels.
    #
    # Categorical split attribute:
    #   Split the data X into two lists (X_left and X_right) where the first list has all
    #   the rows where the split attribute is equal to the split value, and the second list
    #   has all the rows where the split attribute is not equal to the split value.
    #   Also create two lists (y_left and y_right) with the corresponding y labels.
    '''
    Example:

    X = [[3, 'aa', 10],                 y = [1,
         [1, 'bb', 22],                      1,
         [2, 'cc', 28],                      0,
         [5, 'bb', 32],                      0,
         [4, 'cc', 32]]                      1]

    Here, columns 0 and 2 represent numeric attributes, while column 1 is a categorical attribute.

    Consider the case where we call the function with split_attribute = 0 and split_val = 3 (mean of column 0).
    Then we divide X into two lists - X_left, where column 0 is <= 3, and X_right, where column 0 is > 3.

    X_left = [[3, 'aa', 10],            y_left = [1,
              [1, 'bb', 22],                      1,
              [2, 'cc', 28]]                      0]

    X_right = [[5, 'bb', 32],           y_right = [0,
               [4, 'cc', 32]]                     1]

    Consider another case where we call the function with split_attribute = 1 and split_val = 'bb'.
    Then we divide X into two lists, one where column 1 is 'bb', and the other where it is not 'bb'.

    X_left = [[1, 'bb', 22],            y_left = [1,
              [5, 'bb', 32]]                      0]

    X_right = [[3, 'aa', 10],           y_right = [1,
               [2, 'cc', 28],                     0,
               [4, 'cc', 32]]                     1]
    '''
    # Detect whether the split value is numeric; include numpy scalar types so that
    # values produced by e.g. np.mean(...) are handled correctly.
    numeric = isinstance(split_val, (int, float, np.integer, np.floating))
    X_left = []
    X_right = []
    y_left = []
    y_right = []
    if numeric:
        # Numeric split: rows with attribute <= split_val go left, the rest go right.
        for i in range(len(X)):
            if X[i][split_attribute] <= split_val:
                X_left.append(X[i])
                y_left.append(y[i])
            else:
                X_right.append(X[i])
                y_right.append(y[i])
    else:
        # Categorical split: rows with attribute == split_val go left, the rest go right.
        for i in range(len(X)):
            if X[i][split_attribute] == split_val:
                X_left.append(X[i])
                y_left.append(y[i])
            else:
                X_right.append(X[i])
                y_right.append(y[i])
    return (X_left, X_right, y_left, y_right)
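# Illustrative only (a sketch we added, not in the original template): reproduces
# the categorical example from the docstring above so it can be run on demand.
def _partition_demo():
    X = [[3, 'aa', 10], [1, 'bb', 22], [2, 'cc', 28], [5, 'bb', 32], [4, 'cc', 32]]
    y = [1, 1, 0, 0, 1]
    # Split on column 1 == 'bb': the two 'bb' rows go left, the rest go right.
    X_left, X_right, y_left, y_right = partition_classes(X, y, 1, 'bb')
    assert X_left == [[1, 'bb', 22], [5, 'bb', 32]] and y_left == [1, 0]
    assert y_right == [1, 0, 1]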
def information_gain(previous_y, current_y):
    # Inputs:
    #   previous_y : the distribution of original labels (0's and 1's)
    #   current_y  : the distribution of labels after splitting based on a particular
    #                split attribute and split value
    #
    # Compute and return the information gain from partitioning the previous_y labels
    # into the current_y labels, using the entropy function above.
    # Reference: http://www.cs.cmu.edu/afs/cs.cmu.edu/academic/class/15381-s06/www/DTs.pdf
    """
    Example:

    previous_y = [0,0,0,1,1,1]
    current_y  = [[0,0], [1,1,1,0]]

    info_gain = 0.45915
    """
    # Information gain = H(parent) - sum over children of (|child| / |parent|) * H(child).
    H = entropy(previous_y)
    weighted = 0
    for labels in current_y:
        weighted += len(labels) * entropy(labels) / len(previous_y)
    info_gain = H - weighted
    return info_gain
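# A quick self-check (assumption: this __main__ guard is ours, not part of the
# original file) reproducing the documented examples for entropy and information_gain.
if __name__ == "__main__":
    print(round(entropy([0, 0, 0, 1, 1, 1, 1, 1, 1]), 2))                           # 0.92
    print(round(information_gain([0, 0, 0, 1, 1, 1], [[0, 0], [1, 1, 1, 0]]), 5))   # 0.45915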