-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_waveforms.py
115 lines (105 loc) · 7.01 KB
/
train_waveforms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import pandas as pd
from readWaveform.segment_reader import SegmentReader
import learning.logReg as lr
import learning.svm
import learning.util
import learning.random_forest as rf
import pipeline.feature_selection_wrapper as feat_sel
import sklearn.model_selection
from sklearn.preprocessing import MinMaxScaler
import time
# Number of stratified train/test splits (seeded with 0..num_splits-1) whose
# scores, feature weights and timings are averaged by the script below.
num_splits = 3
def appendAll(fullScores, featureWeights, times, scoresToAppend, weightsToAppend, timesToAppend):
    '''
    Extend the three running result collections with one more batch each.

    @param fullScores accumulated scores
    @param featureWeights accumulated feature weights
    @param times accumulated timings
    @param scoresToAppend scores to place after fullScores
    @param weightsToAppend weights to place after featureWeights
    @param timesToAppend timings to place after times
    @return tuple (scores, weights, times), each the pd.concat of the
            accumulated object followed by its new batch
    '''
    accumulated_and_new = (
        (fullScores, scoresToAppend),
        (featureWeights, weightsToAppend),
        (times, timesToAppend),
    )
    return tuple(pd.concat([current, extra]) for current, extra in accumulated_and_new)
def testTrain(X, Y, trainInd, testInd, model_name, test_train_valid_explicit):
    '''
    Fit and evaluate one model on a single train/test split, timing the whole run.

    @param X data instances (indexed with .loc; its columns label the feature weights)
    @param Y labels (indexed with .loc using the same split indices as X)
    @param trainInd training split indices
    @param testInd testing split indices
    @param model_name label used in the log output and as the row label of the returned weights/timing
    @param test_train_valid_explicit the function we use to generate results (see learning.logReg for example)
    @return tuple (fullScores, features_weights, times):
            fullScores is whatever learning.util.test returns for this model,
            features_weights is a one-row DataFrame (row = model_name, columns = X.columns),
            times is a one-entry Series mapping model_name to the elapsed pd.Timedelta
    '''
    Xtrain, Xtest = X.loc[trainInd], X.loc[testInd]
    Ytrain, Ytest = Y.loc[trainInd], Y.loc[testInd]
    print("beginning model:", model_name)
    startTime = time.time()
    result = test_train_valid_explicit(Xtrain=Xtrain.values,
                                       Xtest=Xtest.values,
                                       Ytrain=Ytrain.values,
                                       Ytest=Ytest.values,
                                       n_jobs=-1,
                                       validation_size=.2)
    print("Model: ", model_name, result.best_score)
    print("Model: ", model_name, "; Best Params: ", result.best_params)
    # NOTE(review): assigning a single row label assumes result.weights is
    # shaped (1, n_features) -- confirm against the learning.* modules.
    features_weights = pd.DataFrame(result.weights)
    features_weights.index = [model_name]
    features_weights.columns = X.columns
    fullScores = learning.util.test(trainTuple=result.trainTuple,
                                    testTuple=result.testTuple,
                                    predictor=result.predictor,
                                    name=model_name)
    # The timed span covers both the fit and the final scoring.
    elapsed = time.time() - startTime
    # Return a fresh series rather than writing into a variable owned by the
    # caller: the function then works when imported on its own, and callers
    # that concatenate the returned timings do not get repeated rows.
    times = pd.Series({model_name: pd.Timedelta(seconds=elapsed)})
    return fullScores, features_weights, times
if __name__ == "__main__":
    # Experiment driver: for each of num_splits stratified splits, train every
    # (model, feature window) combination, then average scores, feature weights
    # and timings over the splits and write them to CSV.
    mm = MinMaxScaler()
    segReader = SegmentReader()
    Y = segReader.labels()

    # (label suffix, keyword arguments passed to segReader.simpleStatsAll).
    # The run labelled "1 hour" calls simpleStatsAll() with no arguments.
    feature_windows = [
        ("1 hour", {}),
        ("40 minutes", {"unittime": pd.Timedelta("40 minutes")}),
        ("20 minutes", {"unittime": pd.Timedelta("20 minutes")}),
        ("10 minutes", {"unittime": pd.Timedelta("10 minutes")}),
        # ("5 minutes", {"unittime": pd.Timedelta("5 minutes")}),  # disabled
    ]
    # (label prefix, training function) for every model evaluated per window.
    model_runners = [
        ("LR", lr.test_train_valid_explicit),
        ("RF", rf.test_train_valid_explicit),
    ]

    allFeatureWeights = None
    allFullScores = None
    totalTimes = None
    for i in range(num_splits):
        # Reset the per-split timings. The name stays at module level because
        # testTrain may record into it; the explicit dtype avoids the pandas
        # warning about dtype-less empty Series.
        times = pd.Series(dtype="timedelta64[ns]")
        trainInd, testInd = sklearn.model_selection.train_test_split(
            pd.Series(Y.index), train_size=.8, stratify=Y, random_state=i)

        fullScores = None
        featureWeights = None
        for window_label, stats_kwargs in feature_windows:
            X = segReader.simpleStatsAll(**stats_kwargs)
            # The scaler is refit on the full feature table of each window.
            mm.fit(X.values)
            X = pd.DataFrame(mm.transform(X.values), index=X.index, columns=X.columns)
            for model_prefix, runner in model_runners:
                model_name = model_prefix + " " + window_label
                newScores, newWeights, newTimes = testTrain(
                    X, Y, trainInd, testInd, model_name, runner)
                if fullScores is None:
                    fullScores, featureWeights, times = newScores, newWeights, newTimes
                else:
                    fullScores, featureWeights, times = appendAll(
                        fullScores, featureWeights, times,
                        newScores, newWeights, newTimes)
                    # testTrain may hand back the very series accumulated so
                    # far, in which case the concat above repeats every earlier
                    # label; keep one (the latest) entry per model.
                    times = times[~times.index.duplicated(keep="last")]
        print(fullScores)

        if allFeatureWeights is None:
            allFeatureWeights = featureWeights
            allFullScores = fullScores
            totalTimes = times
        else:
            allFeatureWeights += featureWeights
            allFullScores += fullScores
            totalTimes += times

    # Turn the per-split sums into means.
    allFeatureWeights /= num_splits
    allFullScores /= num_splits
    totalTimes /= num_splits
    allFullScores.to_csv("data/rawdatafiles/numRecResults/final_scores.csv")
    allFeatureWeights.to_csv("data/rawdatafiles/numRecResults/features_weights.csv")
    totalTimes.to_csv("data/rawdatafiles/numRecResults/total_times.csv")
    print(allFullScores)