@@ -42,36 +42,106 @@ fit_glm = function(mm, response, train){
42
42
coefs_big $ name = gsub(' \\ n' , ' ' , coefs_big $ name )
43
43
if (plot ) barplot(coefs_big $ coefs , names.arg = coefs_big $ name , horiz = TRUE )
44
44
coefs_big
45
- }
46
-
47
-
48
- fit_xgboost = function (mm , response , train ,plot_it = FALSE , ... ){
45
+ }
46
+
47
#' Fit an xgboost model, dispatching on the class of `response`
#'
#' @param response Outcome vector; its class selects the method
#'   (logical -> binary classification, numeric -> regression).
#' @param ... Further arguments passed on to the selected method.
#' @export
fit_xgboost <- function(response, ...) {
  # Dispatch on the first formal argument (the UseMethod default).
  UseMethod("fit_xgboost")
}
51
+
52
#' Fit a binary-classification xgboost model
#'
#' Cross-validates to choose the number of boosting rounds, refits on the
#' training rows, and reports out-of-bag AUC on the held-out rows.
#'
#' @param response Logical outcome vector, one element per row of `mm`.
#' @param mm Numeric model matrix of predictors.
#' @param train Logical vector marking the training rows of `mm`.
#' @param plot_it Plot variable importance via `plot_boost_vars()`?
#' @param ... Passed through to both `xgb.cv()` and `xgboost()`
#'   (e.g. `max_depth`, `eta`).
#' @return A list of class `'boostfit'` with elements `bst` (the fitted
#'   booster), `out_of_bag` (data.frame of predicted vs actual on the
#'   `!train` rows), and `auc`.
#' @export
fit_xgboost.logical = function(response, mm, train, plot_it = FALSE, ...){
  library(xgboost)

  # Validate every input up front, before any expensive work.
  # (Previously `response` was only checked after xgb.DMatrix had already
  # been built from it.)
  if (nrow(mm) != length(response)) stop('mm not equal to response length')
  if (nrow(mm) != length(train)) stop('mm not equal to train length')
  if (!is.logical(train)) stop('train is not logical')
  if (!is.logical(response)) stop('response is not logical')

  dtrain <- xgb.DMatrix(data = mm[train, ], label = response[train])

  # Cross-validate to find the best number of boosting rounds.
  cv <- xgb.cv(data = dtrain, nrounds = 2000, ...,
               nthread = 4, nfold = 5, metrics = list("logloss"),
               objective = "binary:logistic", early_stopping_rounds = 5)

  # Refit on the full training subset with the tuned round count.
  bst = xgboost(data = dtrain, ...,
                nrounds = cv$best_iteration, objective = "binary:logistic")

  boost_test_pred <- predict(bst, mm[!train, ])
  out = list(bst = bst)
  out$out_of_bag = data.frame(predicted = boost_test_pred,
                              actual = response[!train])

  # NOTE(review): auc() is assumed to be a helper defined elsewhere in
  # this package — confirm it takes (predicted, actual).
  out$auc = auc(boost_test_pred, response[!train])
  cat('auc =', out$auc, '\n')

  class(out) = 'boostfit'

  if (plot_it) plot_boost_vars(bst, mm, 40)
  out
}
83
+
84
#' Fit a regression xgboost model
#'
#' Cross-validates to choose the number of boosting rounds, refits on the
#' training rows, and reports out-of-bag RMSE on the held-out rows.
#'
#' @param response Numeric outcome vector, one element per row of `mm`.
#' @param mm Numeric model matrix of predictors.
#' @param train Logical vector marking the training rows of `mm`.
#' @param plot_it Plot variable importance via `plot_boost_vars()`?
#' @param ... Passed through to both `xgb.cv()` and `xgboost()`
#'   (e.g. `max_depth`, `eta`).
#' @return A list of class `'boostfit'` with elements `bst` (the fitted
#'   booster), `out_of_bag` (data.frame of predicted vs actual on the
#'   `!train` rows), and `rmse`.
#' @export
fit_xgboost.numeric = function(response, mm, train, plot_it = FALSE, ...){
  library(xgboost)

  # Validate every input up front, before any expensive work.
  # (Previously `response` was only checked after xgb.DMatrix had already
  # been built from it.)
  if (nrow(mm) != length(response)) stop('mm not equal to response length')
  if (nrow(mm) != length(train)) stop('mm not equal to train length')
  if (!is.logical(train)) stop('train is not logical')
  if (!is.numeric(response)) stop('response is not numeric')

  dtrain <- xgb.DMatrix(data = mm[train, ], label = response[train])

  # Cross-validate to find the best number of boosting rounds.
  # NOTE(review): "reg:linear" is deprecated in xgboost >= 1.0 in favour of
  # "reg:squarederror" (same objective); kept for older-version compatibility.
  cv <- xgb.cv(data = dtrain, nrounds = 2000, ...,
               nthread = 4, nfold = 5, metrics = list("rmse"),
               objective = "reg:linear", early_stopping_rounds = 5)

  # Refit on the full training subset with the tuned round count.
  bst = xgboost(data = dtrain, ...,
                nrounds = cv$best_iteration, objective = "reg:linear")

  boost_test_pred <- predict(bst, mm[!train, ])
  out = list(bst = bst)
  out$out_of_bag = data.frame(predicted = boost_test_pred,
                              actual = response[!train])

  # NOTE(review): rmse() is assumed to be a helper defined elsewhere in
  # this package — confirm it takes (predicted, actual).
  out$rmse = rmse(boost_test_pred, response[!train])
  cat('rmse =', out$rmse, '\n')

  class(out) = 'boostfit'

  if (plot_it) plot_boost_vars(bst, mm, 40)
  out
}
115
+
116
#' Plot out-of-bag diagnostics for a boostfit model
#'
#' Bins out-of-bag predictions into quantile groups and draws a 2x2 panel:
#' predicted-by-actual boxplot, count per group, and proportion positive per
#' group (with the overall base rate as a dashed line).
#'
#' @param boostfit A `'boostfit'` object as returned by `fit_xgboost.*()`.
#' @param probs Quantile probabilities used to form the prediction bins.
#' @param ... Ignored; present for compatibility with the `plot()` generic.
#' @return Invisibly, `boostfit` augmented with the quantile breaks `$q` and
#'   an out-of-bag `$group` column. Capture it (`bf <- plot(bf)`) if you want
#'   `predict.boostfit()` to use the `$q` grouping — R's copy semantics mean
#'   the modifications below do NOT persist otherwise.
plot.boostfit = function(boostfit, probs = seq(0, 1, 0.25), ...){
  # Quantile breaks over the out-of-bag predictions; widen the end bins to
  # [0, 1] so cut() cannot drop extreme predictions.
  boostfit$q = quantile(boostfit$out_of_bag$predicted, probs = probs)
  boostfit$q[1] = 0
  boostfit$q[length(boostfit$q)] = 1

  boostfit$out_of_bag$group = cut(boostfit$out_of_bag$predicted, boostfit$q,
                                  labels = 1:(length(boostfit$q) - 1))
  tt = table(boostfit$out_of_bag$group, boostfit$out_of_bag$actual)
  pt = prop.table(tt, 1)

  # NOTE(review): mp() looks like a project helper wrapping par() — confirm.
  mp(mfrow = c(2, 2))
  boxplot(predicted ~ actual, boostfit$out_of_bag)
  barplot(tt[, 2], main = 'Count')
  barplot(pt[, 2], main = 'Proportion')
  abline(h = mean(boostfit$out_of_bag$actual), lty = 2)

  invisible(boostfit)
}
130
+
131
#' Predict from a boostfit model on new data
#'
#' Rebuilds the hashed model matrix for `newdata` and scores it with the
#' stored booster. If quantile breaks `$q` are present on the model, each
#' prediction is also assigned to its quantile group.
#'
#' @param model A `'boostfit'` object carrying `$vars`, `$hash_size`, `$bst`,
#'   and optionally `$q` (quantile breaks).
#' @param newdata A data.frame of predictors for new observations.
#' @return A data.frame with column `p` (predictions) and, when `model$q`
#'   exists, an integer `group` column of quantile-bin assignments.
predict.boostfit = function(model, newdata){
  # Only hash the model's variables that are actually present in newdata.
  vars = intersect(model$vars, colnames(newdata))
  hmm_new = FeatureHashing::hashed.model.matrix(vars, newdata,
                                                hash.size = model$hash_size)

  # Public S3 dispatch instead of the unexported
  # xgboost:::predict.xgb.Booster — ::: reaches into internals and breaks
  # across package versions.
  p = predict(model$bst, hmm_new)

  if (is.null(model$q)) {
    return(data.frame(p))
  }

  group = cut(p, model$q, labels = 1:(length(model$q) - 1))
  group = tomr::unfactor(group)
  data.frame(p, group)
}
75
145
76
146
boost_variables = function (model , mm , n = 40 ){
77
147
ri = xgb.importance(colnames(mm ), model = model )
0 commit comments