diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index 01c282a96dc6..7b6a20f704dd 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -26,6 +26,10 @@ NVL <- function(x, val) {
            'multi:softprob', 'rank:pairwise', 'rank:ndcg', 'rank:map'))
 }
 
+.RANKING_OBJECTIVES <- function() {
+  return(c('rank:pairwise', 'rank:ndcg', 'rank:map'))
+}
+
 #
 # Low-level functions for boosting --------------------------------------------
 
@@ -235,33 +239,43 @@ convert.labels <- function(labels, objective_name) {
 }
 
 # Generates random (stratified if needed) CV folds
-generate.cv.folds <- function(nfold, nrows, stratified, label, params) {
+generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) {
+  if (NROW(group)) {
+    if (stratified) {
+      warning(
+        paste0(
+          "Stratified splitting is not supported when using 'group' attribute.",
+          " Will use unstratified splitting."
+        )
+      )
+    }
+    return(generate.group.folds(nfold, group))
+  }
+  objective <- params$objective
+  if (!is.character(objective)) {
+    warning("Will use unstratified splitting (custom objective used)")
+    stratified <- FALSE
+  }
+  # cannot stratify if label is NULL
+  if (stratified && is.null(label)) {
+    warning("Will use unstratified splitting (no 'labels' available)")
+    stratified <- FALSE
+  }
   # cannot do it for rank
-  objective <- params$objective
   if (is.character(objective) && strtrim(objective, 5) == 'rank:') {
-    stop("\n\tAutomatic generation of CV-folds is not implemented for ranking!\n",
+    stop("\n\tAutomatic generation of CV-folds is not implemented for ranking without 'group' field!\n",
          "\tConsider providing pre-computed CV-folds through the 'folds=' parameter.\n")
   }
   # shuffle
   rnd_idx <- sample.int(nrows)
-  if (stratified &&
-      length(label) == length(rnd_idx)) {
+  if (stratified && length(label) == length(rnd_idx)) {
     y <- label[rnd_idx]
-    # WARNING: some heuristic logic is employed to identify classification setting!
     # - For classification, need to convert y labels to factor before making the folds,
     #   and then do stratification by factor levels.
     # - For regression, leave y numeric and do stratification by quantiles.
     if (is.character(objective)) {
-      y <- convert.labels(y, params$objective)
-    } else {
-      # If no 'objective' given in params, it means that user either wants to
-      # use the default 'reg:squarederror' objective or has provided a custom
-      # obj function.  Here, assume classification setting when y has 5 or less
-      # unique values:
-      if (length(unique(y)) <= 5) {
-        y <- factor(y)
-      }
+      y <- convert.labels(y, objective)
    }
     folds <- xgb.createFolds(y = y, k = nfold)
   } else {
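For readers unfamiliar with the quantile stratification mentioned in the comments above, here is a minimal standalone sketch of the idea - an illustration of the documented behavior (discretizing into up to 5 buckets), not the package's internal `xgb.createFolds` code:

```r
# Stratify a continuous label for CV: discretize into (up to) 5 quantile
# buckets, then deal indices into folds within each bucket so that every
# fold covers the whole range of the label.
set.seed(123)
y <- rnorm(100)
nfold <- 5

breaks <- unique(quantile(y, probs = seq(0, 1, length.out = 6)))
y_strata <- cut(y, breaks = breaks, include.lowest = TRUE)

fold_id <- integer(length(y))
for (stratum in split(seq_along(y), y_strata)) {
  fold_id[stratum] <- sample(rep_len(seq_len(nfold), length(stratum)))
}
folds <- unname(split(seq_along(y), fold_id))
```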
@@ -277,6 +291,29 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, params) {
   return(folds)
 }
 
+generate.group.folds <- function(nfold, group) {
+  ngroups <- length(group) - 1
+  if (ngroups < nfold) {
+    stop("DMatrix has fewer groups than folds.")
+  }
+  seq_groups <- seq_len(ngroups)
+  indices <- lapply(seq_groups, function(gr) seq(group[gr] + 1, group[gr + 1]))
+  assignments <- base::split(seq_groups, as.integer(seq_groups %% nfold))
+  assignments <- unname(assignments)
+
+  out <- vector("list", nfold)
+  randomized_groups <- sample(ngroups)
+  for (idx in seq_len(nfold)) {
+    groups_idx_test <- randomized_groups[assignments[[idx]]]
+    groups_test <- indices[groups_idx_test]
+    idx_test <- unlist(groups_test)
+    attributes(idx_test)$group_test <- lengths(groups_test)
+    attributes(idx_test)$group_train <- lengths(indices[-groups_idx_test])
+    out[[idx]] <- idx_test
+  }
+  return(out)
+}
+
 # Creates CV folds stratified by the values of y.
 # It was borrowed from caret::createFolds and simplified
 # by always returning an unnamed list of fold indices.
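To make the new fold structure concrete, a small usage sketch follows. `generate.group.folds()` is an internal helper (hence the `:::`), and `group` here is the cumulative row-boundary vector as stored on the DMatrix - both the toy values and the printed structure are illustrative only:

```r
set.seed(42)
# cumulative row boundaries for 5 query groups covering rows 1..20
group <- c(0, 3, 7, 9, 14, 20)
folds <- xgboost:::generate.group.folds(nfold = 2, group = group)

# each fold is a vector of test-row indices assembled from whole groups;
# the attributes carry the per-group sizes to pass to setinfo(, "group", )
# on the sliced test/train DMatrices
str(attributes(folds[[1]]))
```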
diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R
index edbc267c1067..15f6faed0ba0 100644
--- a/R-package/R/xgb.DMatrix.R
+++ b/R-package/R/xgb.DMatrix.R
@@ -1259,8 +1259,11 @@ xgb.get.DMatrix.data <- function(dmat) {
 #' Get a new DMatrix containing the specified rows of
 #' original xgb.DMatrix object
 #'
-#' @param object Object of class "xgb.DMatrix"
-#' @param idxset a integer vector of indices of rows needed
+#' @param object Object of class "xgb.DMatrix".
+#' @param idxset An integer vector of indices of rows needed (base-1 indexing).
+#' @param allow_groups Whether to allow slicing an `xgb.DMatrix` with `group` (or
+#'   equivalently `qid`) field. Note that in such a case, the result will not have
+#'   the groups anymore - they need to be set manually through `setinfo`.
 #' @param colset currently not used (columns subsetting is not available)
 #'
 #' @examples
@@ -1275,11 +1278,11 @@
 #'
 #' @rdname xgb.slice.DMatrix
 #' @export
-xgb.slice.DMatrix <- function(object, idxset) {
+xgb.slice.DMatrix <- function(object, idxset, allow_groups = FALSE) {
   if (!inherits(object, "xgb.DMatrix")) {
     stop("object must be xgb.DMatrix")
   }
-  ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset)
+  ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset, allow_groups)
 
   attr_list <- attributes(object)
   nr <- nrow(object)
@@ -1296,7 +1299,15 @@
       }
     }
   }
-  return(structure(ret, class = "xgb.DMatrix"))
+
+  out <- structure(ret, class = "xgb.DMatrix")
+  parent_fields <- as.list(attributes(object)$fields)
+  if (NROW(parent_fields)) {
+    child_fields <- parent_fields[!(names(parent_fields) %in% c("group", "qid"))]
+    child_fields <- as.environment(child_fields)
+    attributes(out)$fields <- child_fields
+  }
+  return(out)
 }
 
 #' @rdname xgb.slice.DMatrix
@@ -1340,11 +1351,11 @@ print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
   }
   cat(class_print, ' dim:', nrow(x), 'x', ncol(x), ' info: ')
-  infos <- character(0)
-  if (xgb.DMatrix.hasinfo(x, 'label')) infos <- 'label'
-  if (xgb.DMatrix.hasinfo(x, 'weight')) infos <- c(infos, 'weight')
-  if (xgb.DMatrix.hasinfo(x, 'base_margin')) infos <- c(infos, 'base_margin')
-  if (length(infos) == 0) infos <- 'NA'
+  infos <- names(attributes(x)$fields)
+  infos <- infos[infos != "feature_name"]
+  if (!NROW(infos)) infos <- "NA"
+  infos <- infos[order(infos)]
+  infos <- paste(infos, collapse = ", ")
   cat(infos)
   cnames <- colnames(x)
   cat(' colnames:')
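A short usage sketch of the new `allow_groups` argument, following the workflow the documentation above describes (slice first, then restore the `group` field by hand); the toy data is made up:

```r
library(xgboost)

set.seed(1)
x <- matrix(rnorm(20 * 3), nrow = 20)
dm <- xgb.DMatrix(x, label = sample(0:1, 20, replace = TRUE))
setinfo(dm, "group", rep(4L, 5))  # 5 query groups of 4 rows each

# slicing a DMatrix that has a 'group' field must now be opted into
# explicitly, and the result comes back without that field...
dsub <- xgb.slice.DMatrix(dm, idxset = 1:8, allow_groups = TRUE)

# ...so it has to be re-assigned to match the rows that were kept
setinfo(dsub, "group", c(4L, 4L))  # rows 1..8 span the first two groups
```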
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index 1cafd7be75c6..880fd56974bc 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -1,6 +1,6 @@
 #' Cross Validation
 #'
-#' The cross validation function of xgboost
+#' The cross validation function of xgboost.
 #'
 #' @param params the list of parameters. The complete list of parameters is
 #'   available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below
@@ -19,13 +19,17 @@
 #'
 #' See \code{\link{xgb.train}} for further details.
 #' See also demo/ for walkthrough example in R.
-#' @param data takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.
+#'
+#' Note that, while `params` accepts a `seed` entry and will use that parameter for model training if
+#' supplied, this seed is not used for creation of train-test splits, which instead rely on R's own RNG
+#' system - thus, for reproducible results, one needs to call the `set.seed` function beforehand.
+#' @param data An `xgb.DMatrix` object, with corresponding fields like `label` or bounds as required
+#' for model training by the objective.
+#'
+#' Note that only the basic `xgb.DMatrix` class is supported - variants such as `xgb.QuantileDMatrix`
+#' or `xgb.ExternalDMatrix` are not supported here.
 #' @param nrounds the max number of iterations
 #' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
-#' @param label vector of response values. Should be provided only when data is an R-matrix.
-#' @param missing is only used when input is a dense matrix. By default is set to NA, which means
-#'   that NA values should be considered as 'missing' by the algorithm.
-#'   Sometimes, 0 or other extreme value might be used to represent missing values.
 #' @param prediction A logical value indicating whether to return the test fold predictions
 #'   from each CV model. This parameter engages the \code{\link{xgb.cb.cv.predict}} callback.
 #' @param showsd \code{boolean}, whether to show standard deviation of cross validation
@@ -47,13 +51,30 @@
 #' @param feval customized evaluation function. Returns
 #'   \code{list(metric='metric-name', value='metric-value')} with given
 #'   prediction and dtrain.
-#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified
-#'   by the values of outcome labels.
+#' @param stratified A \code{boolean} indicating whether sampling of folds should be stratified
+#'   by the values of outcome labels. For real-valued labels in regression objectives,
+#'   stratification will be done by discretizing the labels into up to 5 buckets beforehand.
+#'
+#' If passing "auto", will be set to `TRUE` if the objective in `params` is a classification
+#' objective (from XGBoost's built-in objectives, doesn't apply to custom ones), and to
+#' `FALSE` otherwise.
+#'
+#' This parameter is ignored when `data` has a `group` field - in such a case, the splitting
+#' will be based on whole groups (note that this might make the folds have different sizes).
+#'
+#' Value `TRUE` here is \bold{not} supported for custom objectives.
 #' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
 #'   (each element must be a vector of test fold's indices). When folds are supplied,
 #'   the \code{nfold} and \code{stratified} parameters are ignored.
+#'
+#' If `data` has a `group` field and the objective requires this field, each fold (list element)
+#' must additionally have two attributes (retrievable through \link{attributes}) named `group_test`
+#' and `group_train`, which should hold the `group` to assign through \link{setinfo.xgb.DMatrix} to
+#' the resulting DMatrices.
 #' @param train_folds \code{list} list specifying which indices to use for training. If \code{NULL}
 #'   (the default) all indices not specified in \code{folds} will be used for training.
+#'
+#' This is not supported when `data` has a `group` field.
 #' @param verbose \code{boolean}, print the statistics during the process
 #' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}.
 #'   Default is 1 which means all messages are printed. This parameter is passed to the
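To illustrate the contract just described for `folds`: a hypothetical pre-computed two-fold split for a DMatrix holding four query groups of sizes 5, 5, 3 and 7 (rows 1..20). The attribute names are the documented ones; the values are made up:

```r
fold1 <- 1:10  # test rows: groups 1 and 2
attributes(fold1)$group_test <- c(5L, 5L)   # 'group' for the test DMatrix
attributes(fold1)$group_train <- c(3L, 7L)  # 'group' for the train DMatrix

fold2 <- 11:20  # test rows: groups 3 and 4
attributes(fold2)$group_test <- c(3L, 7L)
attributes(fold2)$group_train <- c(5L, 5L)

folds <- list(fold1, fold2)
```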
Must be constructed again.") } @@ -137,16 +159,22 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing check.custom.obj() check.custom.eval() - # Check the labels - if ((inherits(data, 'xgb.DMatrix') && !xgb.DMatrix.hasinfo(data, 'label')) || - (!inherits(data, 'xgb.DMatrix') && is.null(label))) { - stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix") - } else if (inherits(data, 'xgb.DMatrix')) { - if (!is.null(label)) - warning("xgb.cv: label will be ignored, since data is of type xgb.DMatrix") - cv_label <- getinfo(data, 'label') - } else { - cv_label <- label + if (stratified == "auto") { + if (is.character(params$objective)) { + stratified <- ( + (params$objective %in% .CLASSIFICATION_OBJECTIVES()) + && !(params$objective %in% .RANKING_OBJECTIVES()) + ) + } else { + stratified <- FALSE + } + } + + # Check the labels and groups + cv_label <- getinfo(data, "label") + cv_group <- getinfo(data, "group") + if (!is.null(train_folds) && NROW(cv_group)) { + stop("'train_folds' is not supported for DMatrix object with 'group' field.") } # CV folds @@ -157,7 +185,7 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing } else { if (nfold <= 1) stop("'nfold' must be > 1") - folds <- generate.cv.folds(nfold, nrow(data), stratified, cv_label, params) + folds <- generate.cv.folds(nfold, nrow(data), stratified, cv_label, cv_group, params) } # Callbacks @@ -195,20 +223,18 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing # create the booster-folds # train_folds - dall <- xgb.get.DMatrix( - data = data, - label = label, - missing = missing, - weight = NULL, - nthread = params$nthread - ) + dall <- data bst_folds <- lapply(seq_along(folds), function(k) { - dtest <- xgb.slice.DMatrix(dall, folds[[k]]) + dtest <- xgb.slice.DMatrix(dall, folds[[k]], allow_groups = TRUE) # code originally contributed by @RolandASc on stackoverflow if (is.null(train_folds)) - dtrain <- xgb.slice.DMatrix(dall, unlist(folds[-k])) + dtrain <- xgb.slice.DMatrix(dall, unlist(folds[-k]), allow_groups = TRUE) else - dtrain <- xgb.slice.DMatrix(dall, train_folds[[k]]) + dtrain <- xgb.slice.DMatrix(dall, train_folds[[k]], allow_groups = TRUE) + if (!is.null(attributes(folds[[k]])$group_test)) { + setinfo(dtest, "group", attributes(folds[[k]])$group_test) + setinfo(dtrain, "group", attributes(folds[[k]])$group_train) + } bst <- xgb.Booster( params = params, cachelist = list(dtrain, dtest), @@ -312,8 +338,8 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing #' @examples #' data(agaricus.train, package='xgboost') #' train <- agaricus.train -#' cv <- xgb.cv(data = train$data, label = train$label, nfold = 5, max_depth = 2, -#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") +#' cv <- xgb.cv(data = xgb.DMatrix(train$data, label = train$label), nfold = 5, max_depth = 2, +#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") #' print(cv) #' print(cv, verbose=TRUE) #' diff --git a/R-package/man/print.xgb.cv.Rd b/R-package/man/print.xgb.cv.Rd index 05ad61eed8ac..74fc15d01fb9 100644 --- a/R-package/man/print.xgb.cv.Rd +++ b/R-package/man/print.xgb.cv.Rd @@ -23,8 +23,8 @@ including the best iteration (when available). 
diff --git a/R-package/man/print.xgb.cv.Rd b/R-package/man/print.xgb.cv.Rd
index 05ad61eed8ac..74fc15d01fb9 100644
--- a/R-package/man/print.xgb.cv.Rd
+++ b/R-package/man/print.xgb.cv.Rd
@@ -23,8 +23,8 @@ including the best iteration (when available).
 \examples{
 data(agaricus.train, package='xgboost')
 train <- agaricus.train
-cv <- xgb.cv(data = train$data, label = train$label, nfold = 5, max_depth = 2,
-             eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+cv <- xgb.cv(data = xgb.DMatrix(train$data, label = train$label), nfold = 5, max_depth = 2,
+             eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 print(cv)
 print(cv, verbose=TRUE)
 
diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd
index 778b4540afcd..cede67570683 100644
--- a/R-package/man/xgb.cv.Rd
+++ b/R-package/man/xgb.cv.Rd
@@ -9,14 +9,12 @@ xgb.cv(
   data,
   nrounds,
   nfold,
-  label = NULL,
-  missing = NA,
   prediction = FALSE,
   showsd = TRUE,
   metrics = list(),
   obj = NULL,
   feval = NULL,
-  stratified = TRUE,
+  stratified = "auto",
   folds = NULL,
   train_folds = NULL,
   verbose = TRUE,
@@ -44,19 +42,22 @@ is a shorter summary:
 }
 
 See \code{\link{xgb.train}} for further details.
-See also demo/ for walkthrough example in R.}
+See also demo/ for walkthrough example in R.
 
-\item{data}{takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.}
+Note that, while \code{params} accepts a \code{seed} entry and will use that parameter for model training if
+supplied, this seed is not used for creation of train-test splits, which instead rely on R's own RNG
+system - thus, for reproducible results, one needs to call the \code{set.seed} function beforehand.}
 
-\item{nrounds}{the max number of iterations}
+\item{data}{An \code{xgb.DMatrix} object, with corresponding fields like \code{label} or bounds as required
+for model training by the objective.
 
-\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}
+\if{html}{\out{