import json
import csv
import time
import argparse
import os, sys
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import warnings
from tqdm import tqdm
import pandas as pd
import numpy as np
import math
# TODO: include a confidence coefficient with each prediction
def read_files(files):
    train_df = pd.DataFrame()
    # the full path is necessary for these files
    for file in files:
        df = pd.read_csv(file)
        # DataFrame.append was removed in recent pandas; concat with a fresh index replaces it
        train_df = pd.concat([train_df, df], ignore_index=True)
    # train_df = train_df.loc[0:int(0.2*len(train_df)), :]
    # train_df.reset_index(inplace = True, drop = True)
    # shuffle all the data from the dataset
    # train_df = train_df.sample(frac=1).reset_index(drop=True)
    if 'Unnamed: 0' in train_df.columns:
        train_df.drop('Unnamed: 0', axis = 1, inplace = True)
    # drop all columns that are not necessary for training
    train_df.drop(['index', 'minutes', 'p1_ioc', 'p2_ioc', 'p1_name', 'p2_name',
                   'player_1_rank_points_avg_l3', 'player_1_rank_points_l', 'player_1_rank_points_perc_l7_p25',
                   'player_1_rank_points_perc_l7_p50', 'player_1_rank_points_perc_l7_p75', 'player_1_rank_points_std_l3',
                   'player_2_rank_points_avg_l3', 'player_2_rank_points_l', 'player_2_rank_points_perc_l7_p25',
                   'player_2_rank_points_perc_l7_p50', 'player_2_rank_points_perc_l7_p75', 'player_2_rank_points_std_l3',
                   'score', 'tourney_date', 'tourney_level', 'tourney_name'],
                  axis = 1, inplace = True)
    train_df.rename(columns = {'target_': 'target_0'}, inplace = True)
    train_df.insert(0, 'target_1', 0)
    train_df.insert(1, 'target_0', 0)
    # get the columns in the right order so that we can process them easily:
    # encode the binary 'target' column as two one-hot columns (target_1, target_0)
    for index, row in train_df.iterrows():
        train_df.at[index, 'target_1'] = train_df.at[index, 'target']
        if train_df.at[index, 'target'] == 1:
            train_df.at[index, 'target_0'] = 0
        else:
            train_df.at[index, 'target_0'] = 1
    train_df.drop('target', axis = 1, inplace = True)
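    # Note: an equivalent vectorised form of the loop above would be the following
    # (a sketch, assuming 'target' only ever holds 0 or 1 as the branch above does):
    # train_df['target_1'] = train_df['target']
    # train_df['target_0'] = 1 - train_df['target']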
    train_df.dropna(how = 'any', inplace = True)
    # normalize the dataset
    # => use a min-max scaler with positive values: the binary 0/1 targets cannot be learned
    #    once standardized, since standardization maps them close to 0
    #scaler = StandardScaler()
    #scaler.fit(train_df)
    #train_df = pd.DataFrame(scaler.transform(train_df), columns = train_df.columns)
    # shows the same odd behaviour => accuracy goes to one very quickly, linked to the binary 0/1 target
    #scaler = MinMaxScaler((0,10))
    #scaler.fit(train_df)
    #train_df = pd.DataFrame(scaler.transform(train_df), columns = train_df.columns)
    # quick patch to limit the damage done by not bounding the values returned by the stats
    # shifting the number of rows in the dataset seems to strongly influence the accuracy of the prediction
    # and also the change rate of this solution => it would be better to include only the valid data (the 0.2 slice above)
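    # A sketch of what the commented-out scaling could look like if applied to the
    # feature columns only, leaving the one-hot target columns untouched (this is an
    # illustration, not part of the original pipeline):
    # feature_cols = [c for c in train_df.columns if c not in ('target_0', 'target_1')]
    # scaler = MinMaxScaler((0, 10))
    # train_df[feature_cols] = scaler.fit_transform(train_df[feature_cols])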
    train, test = split(train_df)
    train_Y = train.loc[:, 'target_1':'target_0']
    train_X = train.loc[:, 'p1_age':'player_2_svpt_std_l7']
    test_Y = test.loc[:, 'target_1':'target_0']
    test_X = test.loc[:, 'p1_age':'player_2_svpt_std_l7']
    print(train_X.head())
    print(train_X.shape)
    print('\n -------------------------------- \n')
    print(train_Y.head())
    print(train_Y.shape)
    print('\n -------------------------------- \n')
    print(test_X.head())
    print(test_X.shape)
    print('\n -------------------------------- \n')
    print(test_Y.head())
    print(test_Y.shape)
    train_X = train_X.to_numpy()
    train_Y = train_Y.to_numpy()
    test_X = test_X.to_numpy()
    test_Y = test_Y.to_numpy()
    #time.sleep(5)
    return train_X, train_Y, test_X, test_Y
def split(df):
    # simple 80/20 chronological split, no shuffling
    train = df.iloc[:int(0.8*len(df)), :]
    test = df.iloc[int(0.8*len(df)):, :]
    return train, test
def init_parser():
    parser = argparse.ArgumentParser(description='input data')
    parser.add_argument(
        '-o', '--output',
        required=False,
        default='./model_v2.h5',
        type=str,
        help='Output path for model',
        dest='output_path')
    parser.add_argument(
        '-i', '--input',
        required=True,
        dest='input_list',
        metavar="CSV-FILE",
        nargs='+',
        type=str,
        help="CSV files")
    args = parser.parse_args()
    return args
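# Example invocation (the CSV file names below are illustrative; any per-match stats
# files with the expected columns can be passed):
#   python model_v2.py -i stats_2018.csv stats_2019.csv -o ./model_v2.h5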
def build_model():
    model = tf.keras.Sequential()
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(16, activation='sigmoid'))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(64, activation='tanh'))
    model.add(layers.Dense(2, activation='softmax'))
    # tf.train.RMSPropOptimizer is the TF1 API; the Keras optimizer is the TF2 equivalent
    model.compile(
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.0001),
        loss = 'categorical_crossentropy',
        metrics = ['accuracy'])
    return model
def main():
    # give the files through terminal arguments & read them
    args = init_parser()
    # read the input files and get numpy arrays with inputs and outputs
    train_X, train_Y, test_X, test_Y = read_files(args.input_list)
    # build the model using tensorflow's keras
    model = build_model()
    # get a summary of how the model is composed
    # fit the model; it looks like there is a bug in the model => the optimal batch_size seems arbitrary
    # it could also be tuned with a hyperparameter optimization algorithm => 1000 is not good,
    # it does not make good decisions => the batch size should stay close to the amount of data the model will see in the application
    model.fit(train_X, train_Y, epochs = 300, batch_size = 10)
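    # A minimal sketch of the empirical batch-size search suggested above (commented
    # out; the candidate values, epoch count and validation split are illustrative):
    # for candidate in (10, 32, 100, 1000):
    #     trial_model = build_model()
    #     history = trial_model.fit(train_X, train_Y, epochs = 50,
    #                               batch_size = candidate, validation_split = 0.2)
    #     print(candidate, history.history['val_accuracy'][-1])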
    model.summary()
    print('\n\n--------------------------------------------')
    # evaluate the model using the test samples
    model.evaluate(test_X, test_Y, batch_size=10)
    print('--------------------------------------------\n\n')
    # use a batch size that is realistic relative to the amount of data that will be predicted in the application
    X_predicted = model.predict(test_X[0:1000], batch_size = 10)
    Y_target = test_Y[0:1000]
    # custom estimators for observing the model's performance
    very_good = []
    good = []
    notbad = []
    # iterate over the predictions actually made (the test set may hold fewer than 1000 rows)
    for i in range(len(X_predicted)):
        #print('Prediction: {0:.2f} : {1:.2f} ---- Actual target: {2} : {3}.'.format(
        #    X_predicted[i][0], X_predicted[i][1], Y_target[i][0], Y_target[i][1]))
        # print('Result : {} \n'.format(abs(Y_target[i][0] - X_predicted[i][0])))
        if 0.05 > abs(Y_target[i][0] - X_predicted[i][0]):
            very_good.append(X_predicted[i])
        elif 0.10 > abs(Y_target[i][0] - X_predicted[i][0]):
            good.append(X_predicted[i])
        elif 0.20 > abs(Y_target[i][0] - X_predicted[i][0]):
            notbad.append(X_predicted[i])
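    # For reference, the plain classification accuracy over the same predictions
    # could be computed like this (a sketch, not part of the original script):
    # accuracy = np.mean(np.argmax(X_predicted, axis=1) == np.argmax(Y_target, axis=1))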
    print('\n\n--------------------------------------------')
    print('Over {} games we have {} not bad, {} good and {} very good results.'.format(len(X_predicted), len(notbad), len(good), len(very_good)))
    print('--------------------------------------------\n\n')
    # the model is right roughly 9 times out of 10, but when it is wrong it misses the target badly => extreme decisions
    # save the model for later use
    model.save(args.output_path)
    print('\nThe model was saved to {}.'.format(args.output_path))

if __name__ == '__main__':
    main()