## devcon_tune_RF.R
## DEVCON RF TUNING SCRIPT
## 30/04/2020
setwd("/home/harpo/Dropbox/ongoing-work/git-repos/devcon/phase3/")
#load("./deconv_cgdata_cps_less_feat.RData")
# new features 20000
#load("deconv_cgdata_cps_new_feat.RData")
# newmix (25/05/2020) finegrain with Ben's samples
#load("deconv_fgdata_cps_new_feat_last3.RData")
# newmix (26/05/2020) coarse with Ben's samples
load("deconv_cgdata_cps_new_feat_last3.RData")
library(caret)
library(randomForest)
library(foreach)
library(doParallel)
#library(doMC)
library(dplyr)
#registerDoMC(cores=7)
# SETUP SNOW CLUSTER
primary <- '10.64.10.37' # SAMSON
machineAddresses <- list(
  list(host = primary, user = 'harpo',
       ncore = 7),
  list(host = '10.64.10.36', user = 'harpo', # KERRIGAN
       ncore = 8)
  # list(host = '10.64.10.39', user = 'harpo', # JOKER
  #      ncore = 16)
)
spec <- lapply(machineAddresses,
               function(machine) {
                 rep(list(list(host = machine$host,
                               user = machine$user)),
                     machine$ncore)
               })
spec <- unlist(spec, recursive = FALSE)
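# spec now holds one (host, user) entry per core, so makePSOCKcluster()
# below starts exactly ncore worker processes on each machine.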
mtry_range <- c(2,3,4,5,6,7,8,9,10,50,100,150,200)
ntree_range <- c(500)
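# Grid note: mtry mixes very small values with a few larger ones (up to 200),
# all far below the regression default of p/3 for ~20000 features -- assumed
# intentional, favouring more decorrelated trees. ntree stays at the
# randomForest default of 500.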
parallelCluster <- parallel::makePSOCKcluster(
  spec,
  master = primary,
  homogeneous = TRUE, manual = FALSE)
registerDoParallel(parallelCluster)
print(paste("Workers: ",getDoParWorkers()))
parms <- expand.grid(mtry = mtry_range,ntree=ntree_range)
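# expand.grid() yields one row per (mtry, ntree) pair: 13 x 1 = 13 model fits
# per cell-type proportion tuned below.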
results_final <- c()
results_final_models <- list()
# Starting training ---------
for (label_number in rownames(trainprop)) {
  # create datasets ----------
  labels <- trainprop[label_number, ]
  trainset <- t(train)
  #trainset <- scale(trainset,center=TRUE,scale=TRUE)
  #data_train <- cbind(label = labels, trainset)
  labels_test <- testprop[label_number, ]
  testset <- t(test)
  #data_test <- cbind(label = labels_test, testset)
  # start parallel fine tuning -----------
  results <- foreach(i = 1:nrow(parms), .combine = rbind) %dopar% {
    mtry  <- parms[i, ]$mtry
    ntree <- parms[i, ]$ntree
    model <- randomForest::randomForest(
      y = labels,
      x = trainset,
      mtry = mtry,
      ntree = ntree,
      na.action = na.omit
    )
    preds <- predict(model, testset)
    spear <- cor(x = preds, y = labels_test, method = "spearman")
    pears <- cor(x = preds, y = labels_test, method = "pearson")
    partial_results <- data.frame(label_number, parms[i, ],
                                  pearson = pears, spearman = spear)
    partial_results
  }
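  # results: one row per hyperparameter combination, with Pearson/Spearman
  # correlations between predicted and true proportions on the test set.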
  # select best model ---------
  best_model <- results %>% arrange(desc(pearson)) %>% filter(row_number() == 1)
  print(paste("selecting best model for ", best_model$label_number, " : ", best_model$mtry, ", ",
              best_model$ntree, " Pearson value : ", best_model$pearson, sep = ""))
  model <- randomForest::randomForest(
    y = labels,
    x = trainset,
    mtry = best_model$mtry,
    ntree = best_model$ntree,
    na.action = na.omit
  )
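  # Refit a single model on the training data using the best (mtry, ntree)
  # found above; this is the model stored and saved below.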
  results_final_models[[label_number]] <- model
  save(results_final_models, file = "results_rf_devcon_bestmodels_coarsegrain_data_cps_20000_newmix_last3.rdata", compress = "gzip")
  results_final <- rbind(results_final, results)
  readr::write_csv(results_final, file = "results_rf_devcon_coarsegrain_data_cps_20000_newmix_last3.csv")
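  # Both the model list and the cumulative results table are written out on
  # every iteration, so partial progress survives an interrupted run.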
}
# Shutdown cluster neatly
if (!is.null(parallelCluster)) {
  parallel::stopCluster(parallelCluster)
  parallelCluster <- NULL
}
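# Post-run sanity check (a sketch, assuming the CSV written above exists):
# res <- readr::read_csv("results_rf_devcon_coarsegrain_data_cps_20000_newmix_last3.csv")
# res %>% dplyr::group_by(label_number) %>% dplyr::slice_max(pearson, n = 1)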