Skip to content

Commit e891546

Browse files
author
tom.liptrot
committed
added diff dates and some eval functions
1 parent 816397e commit e891546

File tree

4 files changed

+175
-38
lines changed

4 files changed

+175
-38
lines changed

DESCRIPTION

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
Package: tlml
2-
Title: Tom Liptrot's Machine Learning package
3-
Version: 0.0.0.9000
4-
Authors@R: person("Tom", "Liptrot", email = "[email protected]", role = c("aut", "cre"))
5-
Description: What the package does (one paragraph).
6-
Depends: R (>= 3.2.2)
7-
License: What license is it under?
8-
LazyData: true
9-
Imports: data.table
1+
Package: tlml
2+
Title: Tom Liptrot's Machine Learning package
3+
Version: 0.0.0.9000
4+
Authors@R: person("Tom", "Liptrot", email = "[email protected]", role = c("aut", "cre"))
5+
Description: What the package does (one paragraph).
6+
Depends: R (>= 3.2.2)
7+
License: What license is it under?
8+
LazyData: true
9+
Imports: data.table
10+
RoxygenNote: 6.0.1

R/data prep.r

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,55 @@
1+
#' Test whether a vector looks like it contains dates.
#'
#' Tries to parse `x` with `lubridate::parse_date_time()`; the vector is
#' considered "date-ish" when at least one element parses successfully.
#'
#' @param x Vector to test.
#' @param orders Candidate date formats passed to `parse_date_time()`.
#' @param ... Further arguments forwarded to `parse_date_time()`.
#' @return `TRUE` if any element of `x` parses as a date, else `FALSE`.
is_date_ish <- function(x, orders = c("ymd", "ymd HMS"), ...) {
  # suppressWarnings() replaces the manual options(warn = -1) / restore
  # dance, which leaked the changed option if parse_date_time() errored
  # before the old value was put back.
  try_date <- suppressWarnings(parse_date_time(x, orders = orders, ...))
  !all(is.na(try_date))
}
10+
11+
#' Build all pairwise differences (in days) between date-like columns.
#'
#' Detects the columns of `df` that parse as dates (via `is_date_ish()`),
#' converts them to numeric day counts, and returns a data frame with one
#' column per ordered pair of date columns, named `<col_i>_diff_<col_j>`.
#' Note this includes each column differenced with itself (all zeros),
#' matching the original behavior.
#'
#' @param df A data frame possibly containing date-like columns.
#' @param ... Forwarded to `is_date_ish()` / `parse_date_time()`.
#' @return A data frame of integer day differences.
make_diff_dates <- function(df, ...) {
  library(lubridate)

  # Parse a vector to Date, silencing parse warnings (suppressWarnings()
  # replaces the leak-prone options(warn = ...) juggling).
  to_date <- function(x, orders = c("ymd", "ymd HMS"), ...) {
    as.Date(suppressWarnings(parse_date_time(x, orders = orders, ...)))
  }

  # NOTE(review): map_lgl/map_df come from purrr, which is not loaded
  # here — confirm purrr is attached by the package's Imports.
  is_date <- map_lgl(df, is_date_ish, ...)
  date_df <- map_df(df[is_date], to_date, ...)
  dates_matrix <- as.matrix(map_df(date_df, as.numeric))

  n_col <- ncol(dates_matrix)
  n_row <- nrow(dates_matrix)

  # One output column for every ordered (i, j) pair of date columns.
  diff_dates <- matrix(ncol = n_col^2, nrow = n_row)
  rownames(diff_dates) <- paste0("row", seq_len(n_row))
  new_names <- matrix("", nrow = n_col^2, ncol = 2)

  index <- 0
  for (i in seq_len(n_col)) {      # seq_len() is safe when n_col == 0
    for (j in seq_len(n_col)) {
      index <- index + 1
      diff_dates[, index] <- as.integer(dates_matrix[, i] - dates_matrix[, j])
      new_names[index, 1] <- colnames(dates_matrix)[i]
      new_names[index, 2] <- colnames(dates_matrix)[j]
    }
  }

  colnames(diff_dates) <- paste(new_names[, 1], new_names[, 2], sep = "_diff_")
  as.data.frame(diff_dates)
}
153

254
poly_df = function(x, degree = 3){
355

R/evaluation functions.r

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,29 @@
1-
auc = function (predicted_prob, actual_class) {
2-
if(!is.logical(actual_class)) actual_class = as.logical(as.factor(actual_class))
3-
if(length(actual_class) != length(predicted_prob)) stop('vector lengths do not match')
4-
rprob = rank(predicted_prob)
5-
n1 = sum(actual_class)
6-
n0 = length(actual_class) - n1
7-
u = sum(rprob[actual_class == 1]) - n1 * (n1 + 1)/2
8-
u / (n1 * n0)
9-
}
10-
11-
accuracy = function(predicted_class, actual_class){
12-
if(length(predicted_class) != length(predicted_class)) stop('vector lengths do not match')
13-
14-
sum(predicted_class == actual_class) / length(actual_class)
15-
}
1+
#' Area under the ROC curve via the rank-sum (Mann-Whitney U) statistic.
#'
#' @param predicted_prob Numeric vector of predicted scores/probabilities.
#' @param actual_class Observed classes: logical, numeric 0/1, or a
#'   factor/character encoding of either.
#' @return AUC in [0, 1].
auc = function (predicted_prob, actual_class) {
  if(!is.logical(actual_class)) {
    # BUG FIX: the original used as.logical(as.factor(actual_class)),
    # which returns all NA for the common numeric 0/1 encoding (factor
    # levels "0"/"1" are not valid logical strings), making the AUC NA.
    if(is.factor(actual_class)) actual_class = as.character(actual_class)
    if(is.character(actual_class)) {
      lg = as.logical(actual_class)                      # "TRUE"/"FALSE"
      if(anyNA(lg)) lg = as.numeric(actual_class) != 0   # "0"/"1"
      actual_class = lg
    } else {
      actual_class = actual_class != 0                   # numeric 0/1
    }
  }
  if(length(actual_class) != length(predicted_prob)) stop('vector lengths do not match')
  # U statistic: rank positives among all scores, subtract the minimum
  # possible rank sum; normalizing by n1*n0 gives the AUC.
  rprob = rank(predicted_prob)
  n1 = sum(actual_class)
  n0 = length(actual_class) - n1
  u = sum(rprob[actual_class]) - n1 * (n1 + 1)/2
  u / (n1 * n0)
}
10+
11+
#' Classification accuracy: proportion of predictions equal to the truth.
#'
#' @param predicted_class Vector of predicted classes.
#' @param actual_class Vector of observed classes (same length).
#' @return Proportion correct, in [0, 1].
accuracy = function(predicted_class, actual_class){
  # BUG FIX: the original compared length(predicted_class) with itself,
  # so the guard never fired and mismatched vectors silently recycled.
  if(length(predicted_class) != length(actual_class)) stop('vector lengths do not match')

  sum(predicted_class == actual_class) / length(actual_class)
}
16+
17+
#' Root mean squared error between predictions and observations.
#'
#' @param predicted Numeric vector of predictions.
#' @param actual Numeric vector of observed values.
#' @return The RMSE, a single non-negative number.
rmse <- function(predicted, actual) {
  residuals <- predicted - actual
  sqrt(sum(residuals^2) / length(residuals))
}
23+
24+
#' Mean absolute error between predictions and observations.
#'
#' @param predicted Numeric vector of predictions.
#' @param actual Numeric vector of observed values.
#' @return The MAE, a single non-negative number.
mae <- function(predicted, actual)
{
  error = predicted - actual
  # BUG FIX: the original returned median(abs(error)) — the MEDIAN
  # absolute error — despite the function being named mae.
  mean(abs(error))
}
29+

R/fitter_functions.r

Lines changed: 84 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -42,36 +42,106 @@ fit_glm = function(mm, response, train){
4242
coefs_big$name = gsub('\\n', ' ', coefs_big$name)
4343
if(plot) barplot(coefs_big$coefs, names.arg = coefs_big$name, horiz = TRUE)
4444
coefs_big
45-
}
46-
47-
48-
fit_xgboost = function(mm, response, train,plot_it = FALSE, ...){
45+
}
46+
47+
#' Fit an xgboost model, dispatching on the type of `response`
#' (logical -> binary classification, numeric -> regression).
#'
#' @param response Response vector used for S3 dispatch.
#' @param ... Passed on to the selected method.
#' @export
fit_xgboost <- function (response, ...) {
  # BUG FIX: the generic name passed to UseMethod contained a trailing
  # space ("fit_xgboost "), so lookup of fit_xgboost.logical /
  # fit_xgboost.numeric always failed with "no applicable method".
  UseMethod("fit_xgboost", response)
}
51+
52+
#' Binary-classification xgboost fit for a logical response.
#'
#' Cross-validates to pick the number of boosting rounds (early stopping
#' on logloss), fits on the training rows, and reports out-of-bag AUC on
#' the held-out rows.
#'
#' @param response Logical response vector.
#' @param mm Numeric model matrix; rows align with `response`.
#' @param train Logical vector marking the training rows.
#' @param plot_it Plot variable importance for the fitted model?
#' @param ... Passed on to xgb.cv() and xgboost().
#' @return A list of class "boostfit" with the booster (`bst`), an
#'   out-of-bag data frame of predicted vs actual, and `auc`.
#' @export
fit_xgboost.logical = function(response, mm, train, plot_it = FALSE, ...){
  library(xgboost)

  # Validate ALL inputs before any work — the original built the
  # xgb.DMatrix from `response` before checking that it was logical,
  # and its error message read 'train is no logical'.
  if(nrow(mm) != length(response)) stop('mm not equal to response length')
  if(nrow(mm) != length(train)) stop('mm not equal to train length')
  if(!is.logical(train)) stop('train is not logical')
  if(!is.logical(response)) stop('response is not logical')

  dtrain <- xgb.DMatrix(data = mm[train, ], label = response[train])

  # Cross-validate to find the best round count via early stopping.
  cv <- xgb.cv(data = dtrain, nrounds = 2000, ...,
               nthread = 4, nfold = 5, metrics = list("logloss"),
               objective = "binary:logistic", early_stopping_rounds = 5)

  bst = xgboost(data = dtrain, ...,
                nrounds = cv$best_iteration, objective = "binary:logistic")

  # Out-of-bag performance on the rows held out of training.
  boost_test_pred <- predict(bst, mm[!train, ])
  out = list(bst = bst)
  out$out_of_bag = data.frame(predicted = boost_test_pred, actual = response[!train])

  out$auc = auc(boost_test_pred, response[!train])
  cat('auc =', out$auc, '\n')

  class(out) = 'boostfit'

  if(plot_it) plot_boost_vars(bst, mm, 40)
  out
}
83+
84+
#' Regression xgboost fit for a numeric response.
#'
#' Cross-validates to pick the number of boosting rounds (early stopping
#' on rmse), fits on the training rows, and reports out-of-bag RMSE on
#' the held-out rows.
#'
#' @param response Numeric response vector.
#' @param mm Numeric model matrix; rows align with `response`.
#' @param train Logical vector marking the training rows.
#' @param plot_it Plot variable importance for the fitted model?
#' @param ... Passed on to xgb.cv() and xgboost().
#' @return A list of class "boostfit" with the booster (`bst`), an
#'   out-of-bag data frame of predicted vs actual, and `rmse`.
#' @export
fit_xgboost.numeric = function(response, mm, train, plot_it = FALSE, ...){
  library(xgboost)

  # Validate ALL inputs before any work (the original checked `response`
  # only after building dtrain from it, and its message had a typo:
  # 'response is not numericl'). The `train` check matches the sibling
  # fit_xgboost.logical method.
  if(nrow(mm) != length(response)) stop('mm not equal to response length')
  if(nrow(mm) != length(train)) stop('mm not equal to train length')
  if(!is.logical(train)) stop('train is not logical')
  if(!is.numeric(response)) stop('response is not numeric')

  dtrain <- xgb.DMatrix(data = mm[train, ], label = response[train])

  # Cross-validate to find the best round count via early stopping.
  cv <- xgb.cv(data = dtrain, nrounds = 2000, ...,
               nthread = 4, nfold = 5, metrics = list("rmse"),
               objective = "reg:linear", early_stopping_rounds = 5)

  bst = xgboost(data = dtrain, ...,
                nrounds = cv$best_iteration, objective = "reg:linear")

  # Out-of-bag performance on the rows held out of training.
  boost_test_pred <- predict(bst, mm[!train, ])
  out = list(bst = bst)
  out$out_of_bag = data.frame(predicted = boost_test_pred, actual = response[!train])

  out$rmse = rmse(boost_test_pred, response[!train])
  cat('rmse =', out$rmse, '\n')

  class(out) = 'boostfit'

  if(plot_it) plot_boost_vars(bst, mm, 40)
  out
}
115+
116+
#' Diagnostic plots for a "boostfit" object.
#'
#' Bins the out-of-bag predictions into quantile groups and draws a 2x2
#' panel: predicted-by-actual boxplot, event count per bin, and event
#' proportion per bin (dashed line = overall event rate).
#'
#' @param boostfit A "boostfit" object with an $out_of_bag data frame
#'   holding `predicted` and `actual` columns.
#' @param probs Quantile probabilities used to cut predictions into bins.
plot.boostfit = function(boostfit, probs = seq(0, 1, 0.25)){
  boostfit$q = quantile(boostfit$out_of_bag$predicted, probs = probs)
  # Widen the outer bin edges to [0, 1] so every prediction falls in a bin
  # (cut() excludes values at or below the lowest break).
  boostfit$q[1] = 0
  boostfit$q[length(boostfit$q)] = 1

  boostfit$out_of_bag$group = cut(boostfit$out_of_bag$predicted, boostfit$q,
                                  labels = 1:(length(boostfit$q) - 1))
  tt = table(boostfit$out_of_bag$group, boostfit$out_of_bag$actual)
  pt = prop.table(tt, 1)

  # BUG FIX: the original called mp(mfrow = c(2, 2)) — mp() does not
  # exist; par() is the graphics function that sets the panel layout.
  # Restore the caller's settings on exit.
  old_par = par(mfrow = c(2, 2))
  on.exit(par(old_par), add = TRUE)

  boxplot(predicted ~ actual, boostfit$out_of_bag)
  barplot(tt[, 2], main = 'Count')
  barplot(pt[, 2], main = 'Proportion')
  abline(h = mean(boostfit$out_of_bag$actual), lty = 2)
}
130+
131+
#' Predict with a "boostfit" model on new data.
#'
#' Rebuilds the hashed model matrix from the variables the model knows
#' about, scores it with the stored booster, and — when quantile bin
#' edges (`model$q`) are present — also assigns each prediction to a bin.
#'
#' @param model A "boostfit" object; reads $vars, $hash_size, $bst and
#'   optionally $q. NOTE(review): $vars and $hash_size are not set by the
#'   fit_xgboost() methods visible in this file — confirm they are
#'   attached elsewhere before relying on this method.
#' @param newdata Data frame of new observations.
#' @return data.frame(p), or data.frame(p, group) when `model$q` exists.
predict.boostfit = function(model, newdata){
  vars = intersect(model$vars, colnames(newdata))
  hmm_new = FeatureHashing::hashed.model.matrix(vars, newdata,
                                                hash.size = model$hash_size)
  # Use the public predict() generic instead of reaching into xgboost's
  # namespace with ::: (predict.xgb.Booster is not exported API and the
  # ::: call can break across xgboost versions).
  p = predict(model$bst, hmm_new)

  if(!is.null(model$q)){
    group = cut(p, model$q, labels = 1:(length(model$q) - 1))
    # Convert the factor labels ("1".."k") back to numbers in base R;
    # replaces tomr::unfactor, a non-CRAN helper.
    group = as.integer(as.character(group))
    return(data.frame(p, group))
  }
  data.frame(p)
}
75145

76146
boost_variables = function(model, mm, n = 40){
77147
ri = xgb.importance(colnames(mm), model = model)

0 commit comments

Comments
 (0)