-
Notifications
You must be signed in to change notification settings - Fork 0
/
Random Forests and Boosting.R
107 lines (74 loc) · 3.49 KB
/
Random Forests and Boosting.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
library(dplyr)
library(randomForest)
library(gbm)
library(caret)
library(ROCR)
setwd(dirname(rstudioapi :: getSourceEditorContext()$path))
# Read and pre-porocess the datasets
users_train <- read.csv("users_train.csv")
users_test <- read.csv("users_test.csv")
#converting the signup_flow into a factor variable.
users_train$signup_flow = factor(users_train$signup_flow)
users_test$signup_flow = factor(users_test$signup_flow)
### Random Forest
set.seed(657)
users_train_model <- users_train %>% select(-c("id","date_first_booking",
"date_account_created", "booking"))
control <- trainControl(method="repeatedcv", number=10, repeats=1, search="random")
train.rf.oob <- train(country_destination ~ .,
data = users_train_model,
method="rf",
tuneGrid=data.frame(mtry=1:sqrt(ncol(users_train_model))),
nodesize = 25,
ntree = 50,
trControl=control)
mtry_chosen <- train.rf.oob$bestTune[[1]]
print(mtry_chosen)
model_RF = randomForest(country_destination ~ . ,
data=users_train_model,
ntree=80,
mtry=mtry_chosen,
nodesize=25)
importance.rf <- data.frame(imp=importance(model_RF))
importance.rf
model_RF_LIKE_CART = randomForest(country_destination ~ . ,
data=users_train_model,
ntree=1,
mtry=16,
nodesize=25)
importance.rf <- data.frame(imp=importance(model_RF_LIKE_CART))
importance.rf
### Evaluate the performance
users_test_model = users_test %>% select(-c("id","date_first_booking",
"date_account_created", "booking"))
#this is needed to equalize the levels between training and testing sets so we can do predictions
levels(users_test_model$signup_flow) <- levels(users_train_model$signup_flow)
levels(users_test_model$first_browser) <- levels(users_train_model$first_browser)
levels(users_test_model$signup_flow) <- levels(users_train_model$signup_flow)
pred_RF = predict(model_RF, newdata = users_test_model)
pred_RF_train = predict(model_RF, newdata = users_train_model)
pred_RF_LIKE_CART = predict(model_RF_LIKE_CART, newdata = users_test_model)
#confusion matrix
table(pred_RF, users_test_model$country_destination)
#misclassification
#in-sample:
mean(pred_RF_train != users_train_model$country_destination)
#out-of-sample:
mean(pred_RF != users_test_model$country_destination)
mean(pred_RF_LIKE_CART != users_test_model$country_destination)
### Boosting
#simplest model
model_GBM_1 = gbm(country_destination ~ .,
data=users_train_model, distribution = "gaussian",
n.minobsinnode = 50, n.trees = 100, shrinkage = 0.001, interaction.depth = 3)
influence = summary(model_GBM_1)
#a bit more complex: more and deeper trees, but faster learning rate
model_GBM_2 = gbm(country_destination ~ .,
data=users_train_model, distribution = "gaussian",
n.minobsinnode = 25, n.trees = 500, shrinkage = 0.01, interaction.depth = 7)
influence = summary(model_GBM_2)
#the most complex model: small learning rate, lots of deep trees
model_GBM_3 = gbm(country_destination ~ .,
data=users_train_model, distribution = "gaussian",
n.minobsinnode = 25, n.trees = 1000, shrinkage = 0.001, interaction.depth = 10)
influence = summary(model_GBM_3)