Skip to content

Commit e891546

Browse files
author
tom.liptrot
committed
added diff dates and some eval functions
1 parent 816397e commit e891546

File tree

4 files changed

+175
-38
lines changed

4 files changed

+175
-38
lines changed

DESCRIPTION

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
Package: tlml
2-
Title: Tom Liptrot's Machine Learning package
3-
Version: 0.0.0.9000
4-
Authors@R: person("Tom", "Liptrot", email = "[email protected]", role = c("aut", "cre"))
5-
Description: What the package does (one paragraph).
6-
Depends: R (>= 3.2.2)
7-
License: What license is it under?
8-
LazyData: true
9-
Imports: data.table
1+
Package: tlml
2+
Title: Tom Liptrot's Machine Learning package
3+
Version: 0.0.0.9000
4+
Authors@R: person("Tom", "Liptrot", email = "[email protected]", role = c("aut", "cre"))
5+
Description: What the package does (one paragraph).
6+
Depends: R (>= 3.2.2)
7+
License: What license is it under?
8+
LazyData: true
9+
Imports: data.table
10+
RoxygenNote: 6.0.1

R/data prep.r

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,55 @@
1+
#' Test whether a vector looks like it contains dates.
#'
#' Tries to parse `x` with `lubridate::parse_date_time()`; the vector is
#' considered "date-ish" when at least one element parses successfully.
#'
#' @param x Vector to test.
#' @param orders Candidate date formats passed to `parse_date_time()`.
#' @param ... Further arguments forwarded to `parse_date_time()`.
#' @return `TRUE` if any element of `x` parses as a date, else `FALSE`.
is_date_ish <- function(x, orders = c("ymd", "ymd HMS"), ...) {
  # suppressWarnings() replaces the manual options(warn = -1) / restore
  # dance, which leaked the changed option if parse_date_time() errored
  # before the old value was put back.
  try_date <- suppressWarnings(parse_date_time(x, orders = orders, ...))
  !all(is.na(try_date))
}
10+
11+
#' Build all pairwise differences (in days) between date-like columns.
#'
#' Detects the columns of `df` that parse as dates (via `is_date_ish()`),
#' converts them to numeric day counts, and returns a data frame with one
#' column per ordered pair of date columns, named `<col_i>_diff_<col_j>`.
#' Note this includes each column differenced with itself (all zeros),
#' matching the original behavior.
#'
#' @param df A data frame possibly containing date-like columns.
#' @param ... Forwarded to `is_date_ish()` / `parse_date_time()`.
#' @return A data frame of integer day differences.
make_diff_dates <- function(df, ...) {
  library(lubridate)

  # Parse a vector to Date, silencing parse warnings (suppressWarnings()
  # replaces the leak-prone options(warn = ...) juggling).
  to_date <- function(x, orders = c("ymd", "ymd HMS"), ...) {
    as.Date(suppressWarnings(parse_date_time(x, orders = orders, ...)))
  }

  # NOTE(review): map_lgl/map_df come from purrr, which is not loaded
  # here — confirm purrr is attached by the package's Imports.
  is_date <- map_lgl(df, is_date_ish, ...)
  date_df <- map_df(df[is_date], to_date, ...)
  dates_matrix <- as.matrix(map_df(date_df, as.numeric))

  n_col <- ncol(dates_matrix)
  n_row <- nrow(dates_matrix)

  # One output column for every ordered (i, j) pair of date columns.
  diff_dates <- matrix(ncol = n_col^2, nrow = n_row)
  rownames(diff_dates) <- paste0("row", seq_len(n_row))
  new_names <- matrix("", nrow = n_col^2, ncol = 2)

  index <- 0
  for (i in seq_len(n_col)) {      # seq_len() is safe when n_col == 0
    for (j in seq_len(n_col)) {
      index <- index + 1
      diff_dates[, index] <- as.integer(dates_matrix[, i] - dates_matrix[, j])
      new_names[index, 1] <- colnames(dates_matrix)[i]
      new_names[index, 2] <- colnames(dates_matrix)[j]
    }
  }

  colnames(diff_dates) <- paste(new_names[, 1], new_names[, 2], sep = "_diff_")
  as.data.frame(diff_dates)
}
153

254
poly_df = function(x, degree = 3){
355

R/evaluation functions.r

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,29 @@
1-
auc = function (predicted_prob, actual_class) {
2-
if(!is.logical(actual_class)) actual_class = as.logical(as.factor(actual_class))
3-
if(length(actual_class) != length(predicted_prob)) stop('vector lengths do not match')
4-
rprob = rank(predicted_prob)
5-
n1 = sum(actual_class)
6-
n0 = length(actual_class) - n1
7-
u = sum(rprob[actual_class == 1]) - n1 * (n1 + 1)/2
8-
u / (n1 * n0)
9-
}
10-
11-
accuracy = function(predicted_class, actual_class){
12-
if(length(predicted_class) != length(predicted_class)) stop('vector lengths do not match')
13-
14-
sum(predicted_class == actual_class) / length(actual_class)
15-
}
1+
#' Area under the ROC curve via the rank-sum (Mann-Whitney U) statistic.
#'
#' @param predicted_prob Numeric vector of predicted scores/probabilities.
#' @param actual_class Observed classes: logical, numeric 0/1, or a
#'   factor/character encoding of either.
#' @return AUC in [0, 1].
auc = function (predicted_prob, actual_class) {
  if(!is.logical(actual_class)) {
    # BUG FIX: the original used as.logical(as.factor(actual_class)),
    # which returns all NA for the common numeric 0/1 encoding (factor
    # levels "0"/"1" are not valid logical strings), making the AUC NA.
    if(is.factor(actual_class)) actual_class = as.character(actual_class)
    if(is.character(actual_class)) {
      lg = as.logical(actual_class)                      # "TRUE"/"FALSE"
      if(anyNA(lg)) lg = as.numeric(actual_class) != 0   # "0"/"1"
      actual_class = lg
    } else {
      actual_class = actual_class != 0                   # numeric 0/1
    }
  }
  if(length(actual_class) != length(predicted_prob)) stop('vector lengths do not match')
  # U statistic: rank positives among all scores, subtract the minimum
  # possible rank sum; normalizing by n1*n0 gives the AUC.
  rprob = rank(predicted_prob)
  n1 = sum(actual_class)
  n0 = length(actual_class) - n1
  u = sum(rprob[actual_class]) - n1 * (n1 + 1)/2
  u / (n1 * n0)
}
10+
11+
#' Classification accuracy: proportion of predictions equal to the truth.
#'
#' @param predicted_class Vector of predicted classes.
#' @param actual_class Vector of observed classes (same length).
#' @return Proportion correct, in [0, 1].
accuracy = function(predicted_class, actual_class){
  # BUG FIX: the original compared length(predicted_class) with itself,
  # so the guard never fired and mismatched vectors silently recycled.
  if(length(predicted_class) != length(actual_class)) stop('vector lengths do not match')

  sum(predicted_class == actual_class) / length(actual_class)
}
16+
17+
#' Root mean squared error between predictions and observations.
#'
#' @param predicted Numeric vector of predictions.
#' @param actual Numeric vector of observed values.
#' @return The RMSE, a single non-negative number.
rmse <- function(predicted, actual) {
  residuals <- predicted - actual
  sqrt(sum(residuals^2) / length(residuals))
}
23+
24+
#' Mean absolute error between predictions and observations.
#'
#' @param predicted Numeric vector of predictions.
#' @param actual Numeric vector of observed values.
#' @return The MAE, a single non-negative number.
mae <- function(predicted, actual)
{
  error = predicted - actual
  # BUG FIX: the original returned median(abs(error)) — the MEDIAN
  # absolute error — despite the function being named mae.
  mean(abs(error))
}
29+

R/fitter_functions.r

Lines changed: 84 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -42,36 +42,106 @@ fit_glm = function(mm, response, train){
4242
coefs_big$name = gsub('\\n', ' ', coefs_big$name)
4343
if(plot) barplot(coefs_big$coefs, names.arg = coefs_big$name, horiz = TRUE)
4444
coefs_big
45-
}
46-
47-
48-
fit_xgboost = function(mm, response, train,plot_it = FALSE, ...){
45+
}
46+
47+
#' Fit an xgboost model, dispatching on the type of `response`
#' (logical -> binary classification, numeric -> regression).
#'
#' @param response Response vector used for S3 dispatch.
#' @param ... Passed on to the selected method.
#' @export
fit_xgboost <- function (response, ...) {
  # BUG FIX: the generic name passed to UseMethod contained a trailing
  # space ("fit_xgboost "), so lookup of fit_xgboost.logical /
  # fit_xgboost.numeric always failed with "no applicable method".
  UseMethod("fit_xgboost", response)
}
51+
52+
#' Binary-classification xgboost fit for a logical response.
#'
#' Cross-validates to pick the number of boosting rounds (early stopping
#' on logloss), fits on the training rows, and reports out-of-bag AUC on
#' the held-out rows.
#'
#' @param response Logical response vector.
#' @param mm Numeric model matrix; rows align with `response`.
#' @param train Logical vector marking the training rows.
#' @param plot_it Plot variable importance for the fitted model?
#' @param ... Passed on to xgb.cv() and xgboost().
#' @return A list of class "boostfit" with the booster (`bst`), an
#'   out-of-bag data frame of predicted vs actual, and `auc`.
#' @export
fit_xgboost.logical = function(response, mm, train, plot_it = FALSE, ...){
  library(xgboost)

  # Validate ALL inputs before any work — the original built the
  # xgb.DMatrix from `response` before checking that it was logical,
  # and its error message read 'train is no logical'.
  if(nrow(mm) != length(response)) stop('mm not equal to response length')
  if(nrow(mm) != length(train)) stop('mm not equal to train length')
  if(!is.logical(train)) stop('train is not logical')
  if(!is.logical(response)) stop('response is not logical')

  dtrain <- xgb.DMatrix(data = mm[train, ], label = response[train])

  # Cross-validate to find the best round count via early stopping.
  cv <- xgb.cv(data = dtrain, nrounds = 2000, ...,
               nthread = 4, nfold = 5, metrics = list("logloss"),
               objective = "binary:logistic", early_stopping_rounds = 5)

  bst = xgboost(data = dtrain, ...,
                nrounds = cv$best_iteration, objective = "binary:logistic")

  # Out-of-bag performance on the rows held out of training.
  boost_test_pred <- predict(bst, mm[!train, ])
  out = list(bst = bst)
  out$out_of_bag = data.frame(predicted = boost_test_pred, actual = response[!train])

  out$auc = auc(boost_test_pred, response[!train])
  cat('auc =', out$auc, '\n')

  class(out) = 'boostfit'

  if(plot_it) plot_boost_vars(bst, mm, 40)
  out
}
83+
84+
#' Regression xgboost fit for a numeric response.
#'
#' Cross-validates to pick the number of boosting rounds (early stopping
#' on rmse), fits on the training rows, and reports out-of-bag RMSE on
#' the held-out rows.
#'
#' @param response Numeric response vector.
#' @param mm Numeric model matrix; rows align with `response`.
#' @param train Logical vector marking the training rows.
#' @param plot_it Plot variable importance for the fitted model?
#' @param ... Passed on to xgb.cv() and xgboost().
#' @return A list of class "boostfit" with the booster (`bst`), an
#'   out-of-bag data frame of predicted vs actual, and `rmse`.
#' @export
fit_xgboost.numeric = function(response, mm, train, plot_it = FALSE, ...){
  library(xgboost)

  # Validate ALL inputs before any work (the original checked `response`
  # only after building dtrain from it, and its message had a typo:
  # 'response is not numericl'). The `train` check matches the sibling
  # fit_xgboost.logical method.
  if(nrow(mm) != length(response)) stop('mm not equal to response length')
  if(nrow(mm) != length(train)) stop('mm not equal to train length')
  if(!is.logical(train)) stop('train is not logical')
  if(!is.numeric(response)) stop('response is not numeric')

  dtrain <- xgb.DMatrix(data = mm[train, ], label = response[train])

  # Cross-validate to find the best round count via early stopping.
  cv <- xgb.cv(data = dtrain, nrounds = 2000, ...,
               nthread = 4, nfold = 5, metrics = list("rmse"),
               objective = "reg:linear", early_stopping_rounds = 5)

  bst = xgboost(data = dtrain, ...,
                nrounds = cv$best_iteration, objective = "reg:linear")

  # Out-of-bag performance on the rows held out of training.
  boost_test_pred <- predict(bst, mm[!train, ])
  out = list(bst = bst)
  out$out_of_bag = data.frame(predicted = boost_test_pred, actual = response[!train])

  out$rmse = rmse(boost_test_pred, response[!train])
  cat('rmse =', out$rmse, '\n')

  class(out) = 'boostfit'

  if(plot_it) plot_boost_vars(bst, mm, 40)
  out
}
115+
116+
#' Diagnostic plots for a "boostfit" object.
#'
#' Bins the out-of-bag predictions into quantile groups and draws a 2x2
#' panel: predicted-by-actual boxplot, event count per bin, and event
#' proportion per bin (dashed line = overall event rate).
#'
#' @param boostfit A "boostfit" object with an $out_of_bag data frame
#'   holding `predicted` and `actual` columns.
#' @param probs Quantile probabilities used to cut predictions into bins.
plot.boostfit = function(boostfit, probs = seq(0, 1, 0.25)){
  boostfit$q = quantile(boostfit$out_of_bag$predicted, probs = probs)
  # Widen the outer bin edges to [0, 1] so every prediction falls in a bin
  # (cut() excludes values at or below the lowest break).
  boostfit$q[1] = 0
  boostfit$q[length(boostfit$q)] = 1

  boostfit$out_of_bag$group = cut(boostfit$out_of_bag$predicted, boostfit$q,
                                  labels = 1:(length(boostfit$q) - 1))
  tt = table(boostfit$out_of_bag$group, boostfit$out_of_bag$actual)
  pt = prop.table(tt, 1)

  # BUG FIX: the original called mp(mfrow = c(2, 2)) — mp() does not
  # exist; par() is the graphics function that sets the panel layout.
  # Restore the caller's settings on exit.
  old_par = par(mfrow = c(2, 2))
  on.exit(par(old_par), add = TRUE)

  boxplot(predicted ~ actual, boostfit$out_of_bag)
  barplot(tt[, 2], main = 'Count')
  barplot(pt[, 2], main = 'Proportion')
  abline(h = mean(boostfit$out_of_bag$actual), lty = 2)
}
130+
131+
#' Predict with a "boostfit" model on new data.
#'
#' Rebuilds the hashed model matrix from the variables the model knows
#' about, scores it with the stored booster, and — when quantile bin
#' edges (`model$q`) are present — also assigns each prediction to a bin.
#'
#' @param model A "boostfit" object; reads $vars, $hash_size, $bst and
#'   optionally $q. NOTE(review): $vars and $hash_size are not set by the
#'   fit_xgboost() methods visible in this file — confirm they are
#'   attached elsewhere before relying on this method.
#' @param newdata Data frame of new observations.
#' @return data.frame(p), or data.frame(p, group) when `model$q` exists.
predict.boostfit = function(model, newdata){
  vars = intersect(model$vars, colnames(newdata))
  hmm_new = FeatureHashing::hashed.model.matrix(vars, newdata,
                                                hash.size = model$hash_size)
  # Use the public predict() generic instead of reaching into xgboost's
  # namespace with ::: (predict.xgb.Booster is not exported API and the
  # ::: call can break across xgboost versions).
  p = predict(model$bst, hmm_new)

  if(!is.null(model$q)){
    group = cut(p, model$q, labels = 1:(length(model$q) - 1))
    # Convert the factor labels ("1".."k") back to numbers in base R;
    # replaces tomr::unfactor, a non-CRAN helper.
    group = as.integer(as.character(group))
    return(data.frame(p, group))
  }
  data.frame(p)
}
75145

76146
boost_variables = function(model, mm, n = 40){
77147
ri = xgb.importance(colnames(mm), model = model)

0 commit comments

Comments
 (0)