# adasyn_optimal.py
import numpy as np
import pandas as pd
from sklearn import neighbors
def adasyn(xtrain, ytrain, target_column, class_to_boost, complex_model, nominal, n_neighbors, boost_coef):
    """ADASYN-style oversampling for a binary 0/1 target.

    Generates synthetic samples for the class given by `class_to_boost` until
    its population is roughly `boost_coef` times its original size. More
    samples are placed in neighbourhoods dominated by the other class,
    nominal features are set to the rounded mean of the minority
    neighbourhood, and the synthetic samples are labelled by `complex_model`
    rather than being assigned the minority label directly.
    """
    # Sort so the boosted (minority) class occupies the first m rows; the
    # neighbour-index tests below rely on this ordering.
    train_dataset = pd.concat([xtrain, ytrain], axis=1, sort=False)
    if class_to_boost == 1:
        train_dataset = train_dataset.sort_values(by=target_column, ascending=False)
        m = int(sum(ytrain))
    else:
        train_dataset = train_dataset.sort_values(by=target_column, ascending=True)
        m = len(ytrain) - int(sum(ytrain))
    # Re-derive features and labels in sorted order so that row indices < m
    # are minority examples (the original sorted the frame but kept indexing
    # the unsorted xtrain), then fit the KNN on the sorted data.
    xtrain = train_dataset.drop(columns=[target_column]).reset_index(drop=True)
    ytrain = train_dataset[target_column].reset_index(drop=True)
    clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(xtrain, ytrain)
    # Step 2a: total number of synthetic examples to generate. boost_coef is
    # how many times larger the boosted class should become, so we add
    # (boost_coef - 1) * m samples.
    G = boost_coef * m - m
    # Step 2b: for each minority example, find its K nearest neighbours
    # (Euclidean distance) and the ratio ri = majority neighbours / K.
    Ri = []
    Minority_per_xi = []
    for i in range(m):
        # Query K + 1 neighbours and drop the first hit: a training point is
        # always its own nearest neighbour.
        neighbours = clf.kneighbors(xtrain.iloc[[i], :], n_neighbors=n_neighbors + 1,
                                    return_distance=False)[0][1:]
        # Count the neighbours that belong to the majority class (rows m and above).
        count = 0
        for value in neighbours:
            if value >= m:
                count += 1
        # Collect the minority neighbours (rows 0 .. m - 1).
        minority = [value for value in neighbours if value <= m - 1]
        if len(minority) >= 2:
            Ri.append(count / n_neighbors)
        elif len(minority) == 1:
            # Down-weight examples with a single minority neighbour.
            Ri.append(1 / n_neighbors)
        else:
            # No minority neighbour to interpolate with: generate nothing here.
            Ri.append(0)
        Minority_per_xi.append(minority)
    # Step 2c: normalize the ri so their sum equals 1 (guarding against the
    # degenerate case where every ri is zero).
    total_ri = sum(Ri)
    Rhat_i = []
    for ri in Ri:
        Rhat_i.append(ri / total_ri if total_ri > 0 else 0)
    # Step 2d: number of synthetic examples to generate per minority example.
    Gi = []
    for rhat_i in Rhat_i:
        Gi.append(int(round(rhat_i * G)))
    # Step 2e: generate the synthetic examples.
    syn_data = []
    for i in range(m):
        if not Minority_per_xi[i]:
            # No minority neighbours, so no interpolation partner: skip.
            continue
        xi = xtrain.iloc[i, :]
        # For nominal features, interpolation would produce meaningless
        # in-between codes; use the rounded mean of the minority
        # neighbourhood instead.
        most_common_nominal = {}
        for feature in nominal:
            sum_nominal = 0
            for sample in Minority_per_xi[i]:
                x_sample = xtrain.iloc[sample, :]
                sum_nominal += x_sample[feature]
            most_common_nominal[feature] = round(sum_nominal / len(Minority_per_xi[i]))
        for j in range(Gi[i]):
            # Interpolate between xi and a randomly chosen minority neighbour.
            index = np.random.choice(Minority_per_xi[i])
            xzi = xtrain.iloc[index, :]
            si = xi + (xzi - xi) * np.random.uniform(0, 1)
            for feature in nominal:
                si[feature] = most_common_nominal[feature]
            syn_data.append(si)
    # Label the synthetic samples with the complex model and assemble the
    # output frame.
    if not syn_data:
        return pd.DataFrame(columns=list(xtrain.columns) + [target_column])
    new_df = pd.DataFrame(syn_data)
    new_df.reset_index(drop=True, inplace=True)
    new_y = complex_model.predict(new_df).astype(int)
    new_y_df = pd.DataFrame({target_column: new_y})
    new_df = pd.concat([new_df, new_y_df], axis=1)
    return new_df
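

# ---------------------------------------------------------------------------
# A minimal usage sketch, not part of the original module. It assumes a
# binary 0/1 "target" column, an integer-coded nominal feature, and a
# RandomForestClassifier standing in for `complex_model`; every name below
# (df, "x1", "colour", model, ...) is an illustrative placeholder.
if __name__ == "__main__":
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    # A small imbalanced toy dataset: 90 majority / 10 minority rows.
    rng = np.random.default_rng(0)
    df = pd.DataFrame({
        "x1": rng.normal(size=100),
        "colour": rng.integers(0, 3, size=100),
        "target": [0] * 90 + [1] * 10,
    })
    X = df.drop(columns=["target"])
    y = df["target"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

    # The "complex model" that labels the synthetic points.
    model = RandomForestClassifier(random_state=0).fit(X_train, y_train)

    # Roughly triple the minority class (class 1), treating "colour" as nominal.
    synthetic = adasyn(X_train, y_train, target_column="target", class_to_boost=1,
                       complex_model=model, nominal=["colour"], n_neighbors=5,
                       boost_coef=3)
    print(synthetic.head())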