utils.py
import os
from typing import Dict, Optional
import numpy as np
import pandas as pd
import sklearn.datasets
import sklearn.preprocessing  # required for the LabelEncoder used in get_X_y
from tensorflow.keras.layers import (
    Dense, Conv2D, Flatten, Dropout, BatchNormalization, Input,
    Activation, MaxPooling2D, AveragePooling2D, Concatenate,
)
# Opt in to the future pandas behaviour so DataFrame.replace does not silently downcast dtypes
pd.set_option('future.no_silent_downcasting', True)
# Strings representing each dataset
HELOC_NAME = "HELOC"
CALIFORNIA_HOUSING_NAME = "California_Housing"
DENGUE_DATASET = "Dengue_Chikungunya"
COVERTYPE_NAME = "Covertype"
# Strings representing each method
IGTD_NAME = "IGTD"
REFINED_NAME = "REFINED"
BARGRAPH_NAME = "BARGRAPH"
DISTANCE_MATRIX_NAME = "DISTANCE-MATRIX"
COMBINATION_NAME = "COMBINATION"
TINTO_NAME = "TINTO"
SUPERTML_EF_NAME = "SuperTML-EF"
SUPERTML_VF_NAME = "SuperTML-VF"
DROPOUT_VALUE = 0.2
"""List with all the available datasets"""
ALL_DATASETS = [HELOC_NAME, CALIFORNIA_HOUSING_NAME, DENGUE_DATASET, COVERTYPE_NAME]
"""List with all the available image methods"""
ALL_IMAGE_METHODS = [IGTD_NAME, REFINED_NAME, BARGRAPH_NAME, DISTANCE_MATRIX_NAME, COMBINATION_NAME, TINTO_NAME, SUPERTML_EF_NAME, SUPERTML_VF_NAME]
"""Seed for reproducibility in shuffling and spliting"""
RANDOM_SEED = 1234
"""Size of test split (out of 1)"""
TEST_SPLIT_SIZE = 0.1
"""Size of validation split (out of 1)"""
VAL_SPLIT_SIZE = 0.1
"""List with the datasets that are binary classification"""
BINARY_CLASSIFICATION_DATASETS = [HELOC_NAME, DENGUE_DATASET]
assert all(c in ALL_DATASETS for c in BINARY_CLASSIFICATION_DATASETS)
"""Dictionary with pairs of dataset key and number of samples"""
N_SAMPLES_PER_DATASET: Dict[str, int] = {
HELOC_NAME: 9_871,
CALIFORNIA_HOUSING_NAME: 20_640,
DENGUE_DATASET: 11_448,
COVERTYPE_NAME: 581_012,
}
assert set(ALL_DATASETS) == set(N_SAMPLES_PER_DATASET.keys())
"""Dictionary with pairs of dataset key and 'problem' string for TINTOlib"""
DATASET_TYPES: Dict[str, str] = {
HELOC_NAME: "supervised",
CALIFORNIA_HOUSING_NAME: "regression",
DENGUE_DATASET: "supervised",
COVERTYPE_NAME: "supervised",
}
assert set(ALL_DATASETS) == set(DATASET_TYPES.keys())
"""Dictionary with pairs of dataset key and number of classes (None for regression)"""
DATASETS_N_CLASSES: Dict[str, Optional[int]] = {
HELOC_NAME: 2,
CALIFORNIA_HOUSING_NAME: None,
DENGUE_DATASET: 2,
COVERTYPE_NAME: 7,
}
assert set(ALL_DATASETS) == set(DATASETS_N_CLASSES.keys())
def get_X_y(dataset_name: str):
"""Given the string that represents a dataset, returns X and y"""
if dataset_name not in ALL_DATASETS:
raise ValueError("Cannot find a dataset with that name.")
elif dataset_name == HELOC_NAME:
path_to_dataset = os.path.join("datasets", "heloc.csv")
df = pd.read_csv(path_to_dataset)
label_col = "RiskPerformance"
X = df.drop(label_col, axis=1).to_numpy()
y = df[label_col].replace({"Good": 0, "Bad": 1}).astype(int).to_numpy()
elif dataset_name == COVERTYPE_NAME:
        X, y = sklearn.datasets.fetch_covtype(return_X_y=True)
        y = y - 1  # shift labels from 1..7 to 0..6
elif dataset_name == CALIFORNIA_HOUSING_NAME:
X, y = sklearn.datasets.fetch_california_housing(return_X_y=True)
elif dataset_name == DENGUE_DATASET:
path_to_dataset = os.path.join("datasets", "arboviruses_dataset.csv")
df = pd.read_csv(path_to_dataset, sep=";")
label_col = "CLASSI_FIN"
desired_targets = ['CHIKUNGUNYA', 'DENGUE']
df = df[df[label_col].isin(desired_targets)]
X = df.drop(label_col, axis=1).to_numpy()
y = sklearn.preprocessing.LabelEncoder().fit_transform(df[label_col])
else:
raise NotImplementedError()
assert X.shape[0] == N_SAMPLES_PER_DATASET[dataset_name]
    return X, y
def get_indices_train_eval(dataset_name: str):
    """Returns the (train, validation) index arrays, excluding the test indices"""
total_elems: int = N_SAMPLES_PER_DATASET.get(dataset_name, -1)
if total_elems == -1:
raise ValueError("Cannot find a dataset with that name.")
# Create the array with all the indices
indices = np.arange(total_elems)
    # Remove the test indices
indices_test = get_indices_test(dataset_name)
non_test_indices = np.setdiff1d(indices, indices_test)
assert indices_test.shape[0] + non_test_indices.shape[0] == total_elems
# Split the remaining
r = np.random.default_rng(seed=RANDOM_SEED)
r.shuffle(non_test_indices)
n_val = int(total_elems * VAL_SPLIT_SIZE)
train_indices, val_indices = non_test_indices[:-n_val], non_test_indices[-n_val:]
assert train_indices.shape[0] + val_indices.shape[0] == non_test_indices.shape[0]
return train_indices, val_indices
def get_indices_train1_eval1_train2_eval2(dataset_name: str):
    """Returns the train and validation indices, each split into two halves"""
total_elems: int = N_SAMPLES_PER_DATASET.get(dataset_name, -1)
if total_elems == -1:
raise ValueError("Cannot find a dataset with that name.")
# Get train_val indices
train_indices, val_indices = get_indices_train_eval(dataset_name)
# Split train_val indices in half
train_1, train_2 = train_indices[:train_indices.shape[0]//2], train_indices[train_indices.shape[0]//2:]
val_1, val_2 = val_indices[:val_indices.shape[0]//2], val_indices[val_indices.shape[0]//2:]
# Return
assert train_1.shape[0] + val_1.shape[0] + train_2.shape[0] + val_2.shape[0] == train_indices.shape[0] + val_indices.shape[0]
return train_1, val_1, train_2, val_2
def get_indices_test(dataset_name: str):
    """Returns the test index array (a TEST_SPLIT_SIZE fraction of the shuffled indices)"""
total_elems: int = N_SAMPLES_PER_DATASET.get(dataset_name, -1)
if total_elems == -1:
raise ValueError("Cannot find a dataset with that name.")
# Create the array with all the indices
indices = np.arange(total_elems)
# Shuffle the indices
r = np.random.default_rng(seed=RANDOM_SEED)
r.shuffle(indices)
    # Return the first n shuffled indices as the test split
n = int(total_elems * TEST_SPLIT_SIZE)
return indices[:n]
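# Illustrative sketch (an assumption, not used elsewhere in this module): shows
# how the index helpers above compose. The function name and its default
# argument are examples only.
def _example_split_partition_check(dataset_name: str = HELOC_NAME) -> None:
    """Illustrative only: train/val/test indices cover every sample exactly once"""
    train_idx, val_idx = get_indices_train_eval(dataset_name)
    test_idx = get_indices_test(dataset_name)
    # Concatenating and sorting the three splits should yield 0..N-1
    all_idx = np.sort(np.concatenate([train_idx, val_idx, test_idx]))
    assert np.array_equal(all_idx, np.arange(N_SAMPLES_PER_DATASET[dataset_name]))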
def get_results_path():
    p = "results"
    os.makedirs(p, exist_ok=True)
    return p
def get_images_base_path():
    p = "images"
    os.makedirs(p, exist_ok=True)
    return p
def get_images_path_for_dataset(dataset_name, image_method):
    if dataset_name not in ALL_DATASETS:
        raise ValueError("This dataset does not exist")
    if image_method not in ALL_IMAGE_METHODS:
        raise ValueError("This image method does not exist")
    p = os.path.join(get_images_base_path(), dataset_name, image_method)
    os.makedirs(p, exist_ok=True)
    return p
def _get_results_subdir_for_dataset(subdir: str, dataset_name: str):
    """Returns (creating it if needed) the directory results/<subdir>/<dataset_name>"""
    if dataset_name not in ALL_DATASETS:
        raise ValueError("This dataset does not exist")
    p = os.path.join(get_results_path(), subdir, dataset_name)
    os.makedirs(p, exist_ok=True)
    return p
def get_classicdescriptors_path(dataset_name):
    return _get_results_subdir_for_dataset("classic_descriptors", dataset_name)
def get_classicdescriptors_split1_path(dataset_name):
    return _get_results_subdir_for_dataset("classic_descriptors_split1", dataset_name)
def get_cnnmodels_path(dataset_name):
    return _get_results_subdir_for_dataset("cnn_models", dataset_name)
def get_cnnmlp_models_path(dataset_name):
    return _get_results_subdir_for_dataset("cnn_mlp_models", dataset_name)
def get_cnn_classicdescriptors_path(dataset_name):
    return _get_results_subdir_for_dataset("cnn_classic_models", dataset_name)
def get_dataset_type_str(dataset_name: str):
dataset_type_str = DATASET_TYPES.get(dataset_name, None)
if dataset_type_str is None:
raise ValueError("Cannot find a dataset with that name.")
return dataset_type_str
def is_dataset_classification(dataset_name: str):
return get_dataset_type_str(dataset_name) == "supervised"
def is_dataset_multiclass_classification(dataset_name: str):
return is_dataset_classification(dataset_name) and dataset_name not in BINARY_CLASSIFICATION_DATASETS
def is_dataset_binary_classification(dataset_name: str):
return is_dataset_classification(dataset_name) and dataset_name in BINARY_CLASSIFICATION_DATASETS
def is_dataset_regression(dataset_name: str):
return get_dataset_type_str(dataset_name) == "regression"
def get_number_of_classes(dataset_name: str):
    n = DATASETS_N_CLASSES.get(dataset_name)
    if not is_dataset_classification(dataset_name):
        assert n is None
        raise ValueError("This dataset is not a classification dataset")
    elif is_dataset_binary_classification(dataset_name):
        assert n == 2
        return n
    elif is_dataset_multiclass_classification(dataset_name):
        assert n > 2
        return n
    raise RuntimeError("This shouldn't have happened.")
def get_cnn_branch(input_shape):
    """Returns the CNN branch: two convolutional towers over a shared input"""
    # Input layer shared by both towers
    input_branch = Input(input_shape)
    # Branch 1: 3x3 convolutions with max pooling.
    # Each block is Conv -> BatchNorm -> ReLU -> Pool -> Dropout, so the
    # activation is applied exactly once, after normalization.
    tower_1 = input_branch
    for n_filters in (16, 32, 64, 64):
        tower_1 = Conv2D(n_filters, (3, 3), padding="same")(tower_1)
        tower_1 = BatchNormalization()(tower_1)
        tower_1 = Activation('relu')(tower_1)
        tower_1 = MaxPooling2D(2, 2)(tower_1)
        tower_1 = Dropout(DROPOUT_VALUE)(tower_1)
    # Branch 2: 5x5 convolutions with average pooling
    tower_2 = input_branch
    for n_filters in (16, 32, 64, 64):
        tower_2 = Conv2D(n_filters, (5, 5), padding="same")(tower_2)
        tower_2 = BatchNormalization()(tower_2)
        tower_2 = Activation('relu')(tower_2)
        tower_2 = AveragePooling2D(2, 2)(tower_2)
        tower_2 = Dropout(DROPOUT_VALUE)(tower_2)
    # Concatenate the 2 branches (both towers have identical output shapes)
    merged = Concatenate(axis=1)([tower_1, tower_2])
    # Dense layers
    out = Flatten()(merged)
    out = Dense(256, activation='relu')(out)
    out = Dense(128, activation='sigmoid')(out)
    out = Dense(64, activation='sigmoid')(out)
    out = Dense(32, activation='sigmoid')(out)
    return (input_branch, out)
def get_mlp_branch(input_shape):
"""Returns the MLP branch"""
input_ffnn = Input(shape=input_shape)
x = Dense(64, activation="relu")(input_ffnn)
x = BatchNormalization()(x)
x = Dropout(DROPOUT_VALUE)(x)
x = Dense(32, activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(DROPOUT_VALUE)(x)
x = Dense(16, activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(DROPOUT_VALUE)(x)
return (input_ffnn, x)
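# Hedged sketch (an illustration, not part of the original pipeline): one way
# to join the two branches above into a single two-input Keras model. The
# single-sigmoid head is an example choice for a binary dataset; the head used
# for each dataset may differ.
def build_combined_model_example(image_shape, n_tabular_features: int):
    """Illustrative only: CNN branch + MLP branch merged into one model"""
    from tensorflow.keras.models import Model
    cnn_input, cnn_out = get_cnn_branch(image_shape)
    mlp_input, mlp_out = get_mlp_branch((n_tabular_features,))
    # Concatenate the 32-dim CNN output with the 16-dim MLP output
    merged = Concatenate()([cnn_out, mlp_out])
    out = Dense(1, activation="sigmoid")(merged)
    return Model(inputs=[cnn_input, mlp_input], outputs=out)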
# Sanity check: every dataset is exactly one of classification or regression
for dataset_name in ALL_DATASETS:
assert is_dataset_classification(dataset_name) ^ is_dataset_regression(dataset_name)
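# Hedged demo, runnable when the module is executed directly. California
# Housing is used because scikit-learn fetches it automatically; the HELOC and
# Dengue datasets need local csv files under datasets/.
if __name__ == "__main__":
    X, y = get_X_y(CALIFORNIA_HOUSING_NAME)
    train_idx, val_idx = get_indices_train_eval(CALIFORNIA_HOUSING_NAME)
    test_idx = get_indices_test(CALIFORNIA_HOUSING_NAME)
    print(f"X: {X.shape}, y: {y.shape}")
    print(f"train={train_idx.shape[0]} val={val_idx.shape[0]} test={test_idx.shape[0]}")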