forked from suyong-choi/ABCDnn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathonehotencoder.py
154 lines (128 loc) · 5.99 KB
/
onehotencoder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import numpy as np
class OneHotEncoder_int(object):
    """One hot encoder for integer inputs with overflows.

    Columns flagged as categorical are clipped to ``[lowerlimit, upperlimit]``
    (values outside the range "overflow" into the first/last category) and
    expanded into one-hot groups. The other columns are passed through as is.

    The limits are tightened to the range of the data seen up to and
    including the first ``encode``/``_encode`` call. That call fixes the
    number of categories per column, and from then on the limits are frozen
    so that every later batch is encoded (and decoded) with the same layout.

    Arguments:
        categorical_features {sequence of bool} -- one flag per input column,
            True where the column must be one-hot encoded
        lowerlimit {array-like or None} -- per-column lower bounds; None means
            take them from the data
        upperlimit {array-like or None} -- per-column upper bounds; None means
            take them from the data
    """

    def __init__(self, categorical_features, lowerlimit=None, upperlimit=None):
        self.iscategorical = categorical_features
        self.ncolumns = len(categorical_features)
        self.ncats = 0  # total number of one-hot columns produced
        self.categories_per_feature = []  # filled in by the first encode call
        self.ncatgroups = sum(1 for flag in categorical_features if flag)
        # Start from the user input; applylimit() reconciles it with the data.
        self.lowerlimit = lowerlimit
        self.upperlimit = upperlimit
        self.categories_fixed = False

    def applylimit(self, categoricalinputdata):
        """Clip the data column-wise to the encoder limits.

        While the categories are not fixed yet, the stored limits are first
        tightened to the range of ``categoricalinputdata``: a missing limit is
        taken from the data, a supplied limit is kept only where it is
        stricter than the data range. Once the categories are fixed the stored
        limits are used unchanged.

        Arguments:
            categoricalinputdata {array-like} -- 2D data, one row per sample

        Returns:
            numpy.ndarray -- the clipped data
        """
        if not self.categories_fixed:
            data_min = np.min(categoricalinputdata, axis=0)
            data_max = np.max(categoricalinputdata, axis=0)
            if self.lowerlimit is None:
                self.lowerlimit = data_min
            else:
                self.lowerlimit = np.maximum(self.lowerlimit, data_min)
            if self.upperlimit is None:
                self.upperlimit = data_max
            else:
                self.upperlimit = np.minimum(self.upperlimit, data_max)
        # After the categories are fixed the limits must not move any more:
        # shifting them would change the offset used to compute the one-hot
        # index and make batches inconsistent with each other and with decode.
        clipped_low = np.maximum(categoricalinputdata, self.lowerlimit)
        return np.minimum(clipped_low, self.upperlimit)

    def _encode(self, inputdata):
        """Older encoder: one-hot groups first, remaining columns appended.

        Only the categorical columns are passed to ``applylimit``, so in this
        code path the limits are indexed by categorical column, not by input
        column. The output layout differs from ``encode`` and is the one
        ``_decode`` expects.

        Arguments:
            inputdata {numpy.ndarray} -- 2D array, one row per sample

        Returns:
            numpy.ndarray -- float32 array of one-hot groups followed by the
                non-categorical columns
        """
        catmask = np.asarray(self.iscategorical, dtype=bool)
        categorical_columns = inputdata[:, catmask]
        float_columns = inputdata[:, ~catmask]
        cat_limited = self.applylimit(categorical_columns) - self.lowerlimit.astype(int)
        ncatcolumns = categorical_columns.shape[1]

        arraylist = []
        for cat in range(ncatcolumns):
            ncats = int(self.upperlimit[cat] - self.lowerlimit[cat] + 1)  # number of categories
            if not self.categories_fixed:
                self.categories_per_feature.append(ncats)
                self.ncats += ncats
            # Fancy indexing needs integers even when the input array is float.
            arraylist.append(np.eye(ncats)[cat_limited[:, cat].astype(int)])
        self.categories_fixed = True

        if float_columns.shape[1] > 0:
            arraylist.append(float_columns)
        return np.concatenate(tuple(arraylist), axis=1).astype(np.float32)

    def encode(self, inputdata):
        """One-hot encode the categorical columns, keeping the column order.

        The first call fixes the number of categories of every categorical
        column from the current limits; later calls reuse that layout.

        Arguments:
            inputdata {array-like} -- 2D data with ``ncolumns`` columns

        Returns:
            numpy.ndarray -- float32 array in which each categorical column is
                replaced by its one-hot group and every other column is copied
        """
        inputdata = np.asarray(inputdata)
        # Zero-based category index (meaningful for categorical columns only).
        cat_limited = self.applylimit(inputdata) - self.lowerlimit

        # one hot encoding information
        if not self.categories_fixed:
            for icol, iscat in enumerate(self.iscategorical):
                if iscat:
                    ncats = int(self.upperlimit[icol] - self.lowerlimit[icol] + 1)  # number of categories
                    self.categories_per_feature.append(ncats)
                    self.ncats += ncats
                else:
                    # 0 marks a pass-through (non-categorical) column
                    self.categories_per_feature.append(0)
            self.categories_fixed = True

        # the actual encoding part
        arraylist = []
        for icol, ncat_feat in enumerate(self.categories_per_feature):
            if ncat_feat > 0:
                arraylist.append(np.eye(ncat_feat)[cat_limited[:, icol].astype(int)])
            else:
                arraylist.append(inputdata[:, icol].reshape((inputdata.shape[0], 1)))
        return np.concatenate(tuple(arraylist), axis=1).astype(np.float32)

    def encodedcategories(self):
        """Return the total number of one-hot columns registered so far."""
        return self.ncats

    def transform(self, inputdata):
        """Alias of ``encode``."""
        return self.encode(inputdata)

    def _decode(self, onehotdata):
        """Inverse of ``_encode``: one-hot groups first, then the other columns.

        Arguments:
            onehotdata {numpy.ndarray} -- 2D array laid out as by ``_encode``

        Returns:
            numpy.ndarray -- one column per categorical group (argmax of the
                group plus its lower limit) followed by the remaining columns
        """
        colstart = 0
        arraylist = []
        for i in range(self.ncatgroups):
            ncats = int(self.upperlimit[i] - self.lowerlimit[i] + 1)  # number of categories
            group = onehotdata[:, colstart:colstart + ncats]
            converted = np.argmax(group, axis=1) + self.lowerlimit[i]
            arraylist.append(converted.reshape((converted.shape[0], 1)))
            colstart += ncats
        if colstart < onehotdata.shape[1]:
            arraylist.append(onehotdata[:, colstart:])
        return np.concatenate(tuple(arraylist), axis=1)

    def decode(self, onehotdata):
        """Inverse of ``encode``.

        Arguments:
            onehotdata {numpy.ndarray} -- 2D array laid out as by ``encode``

        Returns:
            numpy.ndarray -- one column per original input column; categorical
                columns hold the argmax of their one-hot group plus the lower
                limit, other columns are copied
        """
        current_col = 0  # start from column 0
        arraylist = []
        for ifeat, ncats in enumerate(self.categories_per_feature):
            if ncats > 0:
                group = onehotdata[:, current_col:current_col + ncats]
                converted = np.argmax(group, axis=1) + self.lowerlimit[ifeat]
                arraylist.append(converted.reshape((converted.shape[0], 1)))
                current_col += ncats
            else:
                arraylist.append(onehotdata[:, current_col].reshape((onehotdata.shape[0], 1)))
                current_col += 1
        return np.concatenate(tuple(arraylist), axis=1)
def test():
    """Smoke test: print clipped, encoded and decoded views of a small array."""
    sample = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]])

    # Encoder created with explicit lower/upper limits for every column.
    bounded = OneHotEncoder_int(
        categorical_features=[True, False, True],
        lowerlimit=[2, 0, 2],
        upperlimit=[8, 100, 8],
    )
    print(bounded.applylimit(sample))
    onehot = bounded.encode(sample)
    print(onehot)
    print(bounded.decode(onehot))
    print()

    # Encoder created without limits: round-trip through encode and decode.
    unbounded = OneHotEncoder_int(categorical_features=[True, False, True])
    print(unbounded.decode(unbounded.encode(sample)))
# Run the smoke test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test()