'''
Companion source code for the paper.
Citation: B. Wang, Y. Su, M. Zhang and J. Nie, "A Deep Hierarchical Network for Packet-Level Malicious Traffic Detection,"
in IEEE Access, vol. 8, pp. 201728-201740, 2020, doi: 10.1109/ACCESS.2020.3035967.
'''
import numpy as np
import pickle
import glob
import random
from keras import callbacks
from keras import Input, Model
from keras.layers import Embedding, Bidirectional, LSTM, Dense
from timeit import default_timer as timer
from keras.utils import np_utils
from sklearn.model_selection import StratifiedKFold
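# Environment note (assumption): np_utils and fit_generator below date this
# script to the Keras 2.x / pre-TF2 API. On a modern Keras install,
# keras.utils.to_categorical and model.fit would replace them; the original
# era's API is kept here.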
# num_sample = 2000  # number of samples per pkl dataset to train on
CHAR_DIMENSION_PER_PACKET = 256  # one dimension per possible byte value
DIVIDE_RATE = 1 / 9  # fraction of all samples held out for validation
PACKET_LEN = 54  # bytes kept per packet (54 = 14 Ethernet + 20 IPv4 + 20 TCP header bytes)
EPOCH = 20
MINI_BATCH = 100
CHECKPOINTS_DIR = 'checkpoints/'
MODEL_NAME = ''
STEP_PER_EPOCH = 0
VALIDATION_STEP = 0
def load_data(root_path):
    '''
    Glob all USTC*.pkl dataset files under root_path and load them.
    :return: (input_texts, target_texts) — packets and their labels
    '''
    global STEP_PER_EPOCH, VALIDATION_STEP
    start_time = timer()
    target_texts = []
    input_texts = []
    label_packets = []
    file_list = glob.glob(root_path + '/USTC*.pkl')
    for file in file_list:
        with open(file, 'rb') as f:
            label_packetList = pickle.load(f)
            # for label_packet in label_packetList[:min(num_sample, len(label_packetList) - 1)]:
            for label_packet in label_packetList:
                label = label_packet[0]
                packet = label_packet[1]
                label_packets.append([label, packet[:PACKET_LEN]])
    random.shuffle(label_packets)
    for i in label_packets:
        label = i[0]
        packet = i[1]
        target_texts.append(label)
        input_texts.append(packet)
    STEP_PER_EPOCH = int(len(target_texts) / MINI_BATCH * (1 - DIVIDE_RATE))
    VALIDATION_STEP = int(len(target_texts) / MINI_BATCH * DIVIDE_RATE)
    print('label_num:', len(target_texts))
    print('packet_num:', len(input_texts))
    end_time = timer()
    print(f'time cost of loading data: {end_time - start_time} seconds')
    return input_texts, target_texts
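# The loader above assumes (not verified here) that each USTC*.pkl holds a
# pickled list of [label, packet_bytes] pairs, with integer labels 0-19 for
# the 20 USTC-TFC2016 classes. A hypothetical file could be built like:
#
#   import pickle
#   samples = [[0, bytes(range(54))], [12, bytes(54)]]  # made-up packets
#   with open('USTC_example.pkl', 'wb') as f:
#       pickle.dump(samples, f)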
def dataset_generator(packets, labels, indices, batch_size):
    '''
    Batch generator with one-hot encoding (OHE) of each byte.
    :param packets: list of byte sequences
    :param labels: list of integer class labels (0-19)
    :param indices: sample indices this generator may draw from
    :param batch_size: samples per yielded batch
    :return: yields (Xbatch, Ybatch) tuples forever
    '''
    Xbatch = np.zeros((batch_size, PACKET_LEN, CHAR_DIMENSION_PER_PACKET), dtype=np.int64)
    Ybatch = np.zeros((batch_size, 20), dtype=np.int64)
    batch_idx = 0
    while True:
        for idx in indices:
            for i, byte in enumerate(packets[idx]):
                Xbatch[batch_idx, i, byte] = 1  # one-hot encode the byte
            Ybatch[batch_idx] = np_utils.to_categorical(labels[idx], num_classes=20)
            batch_idx += 1
            if batch_idx == batch_size:
                batch_idx = 0
                yield (Xbatch, Ybatch)
                # Allocate fresh buffers: without this, one-hot bits from the
                # previous batch would leak into the next one.
                Xbatch = np.zeros((batch_size, PACKET_LEN, CHAR_DIMENSION_PER_PACKET), dtype=np.int64)
                Ybatch = np.zeros((batch_size, 20), dtype=np.int64)
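# Quick smoke test for the OHE generator (a sketch; assumes load_data and
# get_indices have already produced input_texts/target_texts/train_indices):
#
#   gen = dataset_generator(input_texts, target_texts, train_indices, MINI_BATCH)
#   X, y = next(gen)
#   print(X.shape, y.shape)  # expected: (100, 54, 256) (100, 20)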
def dataset_generator2(packets, labels, indices, batch_size):
    '''
    Batch generator without OHE: raw byte values, suitable for an Embedding layer.
    :param packets: list of byte sequences
    :param labels: list of integer class labels (0-19)
    :param indices: sample indices this generator may draw from
    :param batch_size: samples per yielded batch
    :return: yields (Xbatch, Ybatch) tuples forever
    '''
    Xbatch = np.zeros((batch_size, PACKET_LEN), dtype=np.int64)
    Ybatch = np.zeros((batch_size, 20), dtype=np.int64)
    batch_idx = 0
    while True:
        for idx in indices:
            Xbatch[batch_idx, :] = 0  # clear the row so packets shorter than PACKET_LEN do not inherit stale bytes
            for i, byte in enumerate(packets[idx]):
                Xbatch[batch_idx, i] = byte  # no OHE, keep the raw byte value
            Ybatch[batch_idx] = np_utils.to_categorical(labels[idx], num_classes=20)
            batch_idx += 1
            if batch_idx == batch_size:
                batch_idx = 0
                yield (Xbatch, Ybatch)
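# The integer batches from dataset_generator2 are what an Embedding front end
# expects, e.g. Embedding(input_dim=CHAR_DIMENSION_PER_PACKET, output_dim=64)
# over inputs of shape (batch, PACKET_LEN). Smoke-test sketch:
#
#   gen2 = dataset_generator2(input_texts, target_texts, train_indices, MINI_BATCH)
#   X, y = next(gen2)
#   print(X.shape, X.max(), y.shape)  # expected: (100, 54), max <= 255, (100, 20)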
def get_indices(target_texts):
    target_texts = np.array(target_texts)  # convert to ndarray, otherwise np.where cannot be used
    normal_indices = [np.where(target_texts == i)[0] for i in range(0, 10)]  # 20 classes in total: 0-9 benign
    attack_indices = [np.where(target_texts == i)[0] for i in range(10, 20)]  # 10-19 malicious
    # replace=False keeps each per-class validation draw free of duplicates
    valid_normal_indices = np.concatenate(
        [np.random.choice(normal_indices[i], int(len(normal_indices[i]) * DIVIDE_RATE), replace=False) for i in range(10)])
    valid_attack_indices = np.concatenate(
        [np.random.choice(attack_indices[i], int(len(attack_indices[i]) * DIVIDE_RATE), replace=False) for i in range(10)])
    valid_indices = np.concatenate([valid_normal_indices, valid_attack_indices]).astype(int)
    np.random.shuffle(valid_indices)  # data was shuffled once at load time; shuffle again here
    train_indices = np.array(list(set(np.arange(len(target_texts))) - set(valid_indices)))
    np.random.shuffle(train_indices)
    return train_indices, valid_indices
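# Split sanity check (sketch): with unique validation draws, the two index
# sets should be disjoint and together cover every sample:
#
#   train_idx, valid_idx = get_indices(target_texts)
#   assert set(train_idx).isdisjoint(valid_idx)
#   assert len(train_idx) + len(valid_idx) == len(target_texts)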
def model_structure():
    global MODEL_NAME
    # from model.hybrid_model import simpleBoBoNet
    # model, model_name = simpleBoBoNet(conv1_filters=32, conv2_filters=64, gru1_units=128, gru2_units=64,
    #                                   kernel_size=3, model_name='simpleBoBo_v3_USTC').model()
    # model, model_name = simpleBoBoNet(conv1_filters=16, conv2_filters=32, gru1_units=32, gru2_units=32, dense_units=16,
    #                                   kernel_size=3, model_name='simpleBoBo_v2').model()
    from model.hybrid_model import BoBoNet
    model, model_name = BoBoNet(model_name='BoBoNet_USTC').model_USTC()
    MODEL_NAME = model_name
    # from model.LSTM_model import HwangLSTM
    # model, model_name = HwangLSTM()
    # MODEL_NAME = model_name
    return model
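# BoBoNet itself lives in this repo's model/hybrid_model module and is not
# shown here. As a minimal stand-in (an assumption, not the paper's
# architecture), the layers imported at the top of this file could form a
# byte-level classifier over dataset_generator2 batches:
#
#   def fallback_model():
#       inputs = Input(shape=(PACKET_LEN,))
#       x = Embedding(CHAR_DIMENSION_PER_PACKET, 64)(inputs)
#       x = Bidirectional(LSTM(64))(x)
#       outputs = Dense(20, activation='softmax')(x)
#       return Model(inputs, outputs)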
def callbacks_method(K):
    from tools.CallbacksMethod import CustomHistory
    import time
    customhistory_cb = CustomHistory(MODEL_NAME)
    if K is not None:
        weight_file = CHECKPOINTS_DIR + str(MODEL_NAME) + '_' + str(PACKET_LEN) + '_' + str(
            time.strftime("%b%d", time.localtime())) + f'_K{K}' + '_{epoch:02d}_{val_loss:.2f}.hdf5'
    else:
        weight_file = CHECKPOINTS_DIR + str(MODEL_NAME) + '_' + str(PACKET_LEN) + '_' + str(
            time.strftime("%b%d", time.localtime())) + '_{epoch:02d}_{val_loss:.2f}.hdf5'
    check_cb = callbacks.ModelCheckpoint(weight_file, monitor='val_loss', verbose=0, save_best_only=False,
                                         save_weights_only=True, mode='min')
    earlystop_cb = callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto')
    csvlogger_cb = callbacks.CSVLogger(filename='log/{}.csv'.format(MODEL_NAME), append=True)
    return check_cb, earlystop_cb, csvlogger_cb, customhistory_cb
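# Neither ModelCheckpoint nor CSVLogger creates missing directories, so the
# checkpoints/ and log/ folders must exist before training. A guard one could
# add (a sketch, not present in the original code):
#
#   import os
#   os.makedirs(CHECKPOINTS_DIR, exist_ok=True)
#   os.makedirs('log', exist_ok=True)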
def train_model(model, train_data_generator, validation_data_generator, K=None):
    model.summary()
    model.compile("adam", "categorical_crossentropy", metrics=["accuracy"])
    # model.compile("RMSProp", "categorical_crossentropy", metrics=["accuracy"])
    check_cb, earlystop_cb, csvlogger_cb, customhistory_cb = callbacks_method(K)
    model.fit_generator(generator=train_data_generator,
                        steps_per_epoch=STEP_PER_EPOCH,
                        epochs=EPOCH,
                        callbacks=[check_cb, earlystop_cb, csvlogger_cb, customhistory_cb],
                        validation_data=validation_data_generator,
                        validation_steps=VALIDATION_STEP)
def run(dataset_path):
    # input_texts, target_texts = load_data(dataset_path)
    # train_indices, test_indices = get_indices(target_texts)
    # model = model_structure()
    # train_data_generator = dataset_generator2(input_texts, target_texts, train_indices, MINI_BATCH)
    # validation_data_generator = dataset_generator2(input_texts, target_texts, test_indices, MINI_BATCH)
    # train_model(model, train_data_generator, validation_data_generator)
    input_texts, target_texts = load_data(dataset_path)
    # Note: STEP_PER_EPOCH/VALIDATION_STEP were sized in load_data from
    # DIVIDE_RATE = 1/9, while the 5-fold split below is 4/5 : 1/5. The
    # infinite generators still work; the step counts just approximate
    # each fold rather than tiling it exactly.
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
    K_start = 0
    for train_indices, test_indices in kfold.split(input_texts, target_texts):
        model = model_structure()
        train_data_generator = dataset_generator2(input_texts, target_texts, train_indices, MINI_BATCH)
        validation_data_generator = dataset_generator2(input_texts, target_texts, test_indices, MINI_BATCH)
        train_model(model, train_data_generator, validation_data_generator, K_start)
        K_start += 1
if __name__ == '__main__':
    run(r'K:\dataset\USTC-TFC2016\2_dataset\train')
    # input_texts, target_texts = load_data(r'K:\数据库\ISCX-IDS-2012\3_1sampleDataset\Dataset')
    # # print('labels_len:', len(target_texts))
    # train_indices, test_indices = get_indices(target_texts)
    # # for idx in test_indices:
    # #     print(idx, target_texts[idx])
    # # print(np_utils.to_categorical(4, num_classes=5))
    # train_data_generator = dataset_generator(input_texts, target_texts, train_indices, MINI_BATCH)