diff --git a/NAMESPACE b/NAMESPACE index 6eba20d..f4b4e60 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -48,6 +48,7 @@ importFrom(stats,as.formula) importFrom(stats,binomial) importFrom(stats,gaussian) importFrom(stats,glm.fit) +importFrom(stats,make.link) importFrom(stats,model.matrix) importFrom(stats,quasibinomial) importFrom(stats,rbinom) diff --git a/R/misc.R b/R/misc.R index ac003e1..1261023 100644 --- a/R/misc.R +++ b/R/misc.R @@ -212,14 +212,16 @@ validate.family <- function(family, y) { family <- family() if (!is(family, "family")) stop("Argument of 'family' is not a valid family.") - if (!family$family %in% c("gaussian", "binomial")) - stop("Only 'gaussian' and 'binomial' are supported families.") + if (!family$family %in% c("gaussian", "binomial", "clogit")) + stop("Only 'gaussian', 'binomial' and 'clogit' are supported families.") - if (family$family == "binomial") { + if (family$family == "binomial" || family$family == "clogit") { if (length(table(y)) != 2) - stop("Outcome variable must contain two classes with family=binomial.") + stop("Outcome variable must contain two classes ", + "if family is 'binomial' or 'clogit'.") if (!is.factor(y) && any(y < 0 | y > 1)) - stop("Outcome variable must contain 0-1 values with family=binomial.") + stop("Outcome variable must contain 0-1 values ", + "if family is 'binomial' or 'clogit'.") } return(family) @@ -462,6 +464,24 @@ vector.summary <- function(x, prob) { c(mean=mean(x), sd=stats::sd(x), stats::quantile(x, c(lower, upper))) } +#' Family object for conditional logistic models +#' +#' @param link Specification for the model link function. +#' +#' @return +#' An object of class `family`. +#' +#' @importFrom stats make.link +#' @noRd +clogit <- function(link="logit") { + if (link != "logit") + stop("Only link 'logit' is available for the clogit family.") + + clogit <- binomial() + clogit$family <- "clogit" + return(clogit) +} + #' Check whether the model fitted is a logistic regression model. #' #' @param obj An object of class `hsstan`. diff --git a/R/projection.R b/R/projection.R index 38bfa50..bc03561 100644 --- a/R/projection.R +++ b/R/projection.R @@ -42,18 +42,26 @@ lm_proj <- function(x, fit, sigma2, indproj, is.logistic) { ## logistic regression model if (is.logistic) { - - ## compute the projection for each sample - par.fun <- function(s) glm.fit(xp, fit[, s], - family=quasibinomial())$coefficients + ## compute the projection for each sample + par.fun <- function(s) { + qb <- glm.fit(xp, fit[, s], family=quasibinomial())$coefficients + gc() # fixes an apparent memory leak + return(qb) + } if (.Platform$OS.type != "windows") { wp <- parallel::mclapply(X=1:S, mc.preschedule=TRUE, FUN=par.fun) + + #wp <- vector("list", length = S) + #for(s in 1:S) { + # wp[[s]] <- par.fun(s) + #} + } else { # windows cl <- parallel::makePSOCKcluster(getOption("mc.cores", 1)) on.exit(parallel::stopCluster(cl)) wp <- parallel::parLapply(X=1:S, cl=cl, fun=par.fun) } - wp <- matrix(unlist(wp, use.names=FALSE), ncol=S) + wp <- matrix(unlist(wp, use.names=FALSE), ncol=S) # P x S matrix ## estimate the KL divergence between full and projected model fitp <- binomial()$linkinv(multiplyAB(xp, wp)) @@ -76,7 +84,7 @@ lm_proj <- function(x, fit, sigma2, indproj, is.logistic) { ## reshape wp so that it has same dimension (P x S) as t(x), and zeros for ## those variables that are not included in the projected model - wp.all <- matrix(0, P, S) + wp.all <- matrix(data=0, nrow=P, ncol=S) wp.all[indproj, ] <- wp return(list(w=wp.all, sigma2=sigma2p, fit=fitp, kl=kl)) } @@ -170,6 +178,7 @@ elpd <- function(yt, eta, sigma2, logistic) { #' submodel. If `NULL` (default), selection starts from the set of #' unpenalized covariates if the model contains penalized predictors, #' otherwise selection starts from the intercept-only model. +#' @param num.samples Number of posterior samples to be used in projection. #' @param out.csv If not `NULL`, the name of a CSV file to save the #' output to. #' @@ -200,13 +209,21 @@ elpd <- function(yt, eta, sigma2, logistic) { #' @importFrom utils write.csv #' @export projsel <- function(obj, max.iters=30, start.from=NULL, - out.csv=NULL) { + num.samples=NULL, out.csv=NULL) { validate.hsstan(obj) validate.samples(obj) x <- xt <- validate.newdata(obj, obj$data) + if (obj$family$family == "clogit") { + ## first col is stratum, which should be dropped + ## there is no intercept column in data + x <- x[, -1] # drop intercept column + } + yt <- obj$data[[obj$model.terms$outcome]] is.logistic <- is.logistic(obj) + if(obj$family$family == "clogit") + is.logistic <- TRUE sigma2 <- if (is.logistic) 1 else as.matrix(obj$stanfit, pars="sigma")^2 ## set of variables in the initial submodel @@ -226,14 +243,25 @@ projsel <- function(obj, max.iters=30, start.from=NULL, ## fitted values for the full model (N x S) fit <- t(posterior_linpred(obj, transform=TRUE)) + + if (!is.null(num.samples)) { + fit <- fit[, 1:num.samples] + if (obj$family$family == "gaussian") { + sigma2 <- sigma2[1:num.samples, ] + } + } - ## intercept only model - sub <- fit.submodel(x, sigma2, fit, 1, xt, yt, is.logistic) - kl.elpd <- cbind(sub$kl, sub$elpd) - report.iter("Intercept only", sub$kl, sub$elpd) + if (obj$family$family != "clogit") { + ## intercept only model + sub <- fit.submodel(x, sigma2, fit, 1, xt, yt, is.logistic) + kl.elpd <- cbind(sub$kl, sub$elpd) + report.iter("Intercept only", sub$kl, sub$elpd) + } else { + kl.elpd <- NULL + } ## initial submodel with the set of chosen covariates - if (length(start.idx) > 1) { + if (length(start.idx) > 1 || obj$family$family == "clogit") { sub <- fit.submodel(x, sigma2, fit, chosen, xt, yt, is.logistic) kl.elpd <- rbind(kl.elpd, c(sub$kl, sub$elpd)) report.iter("Initial submodel", sub$kl, sub$elpd) @@ -259,9 +287,12 @@ projsel <- function(obj, max.iters=30, start.from=NULL, if (length(rel.kl) == 2 && is.nan(rel.kl[2])) rel.kl[2] <- 1 - res <- data.frame(var=c("Intercept only", - if (length(start.idx) > 1) "Initial submodel", - colnames(x)[setdiff(chosen, start.idx)]), + var.names <- c(if(length(start.idx) > 1) "Initial submodel", + colnames(x)[setdiff(chosen, start.idx)]) + if(obj$family$family != "clogit") { + var.names <- c("Intercept only", var.names) + } + res <- data.frame(var=var.names, kl=kl, rel.kl.null=1 - kl / kl[1], rel.kl=rel.kl, diff --git a/R/stan.R b/R/stan.R index 24f4daa..65069a9 100644 --- a/R/stan.R +++ b/R/stan.R @@ -36,7 +36,8 @@ #' penalized. If `NULL` or an empty vector, a model with only unpenalized #' covariates is fitted. #' @param family Type of model fitted: either `gaussian()` for linear regression -#' (default) or `binomial()` for logistic regression. +#' (default), `binomial()` for logistic regression, or `clogit()` for +#' conditional logistic regression. #' @param seed Optional integer defining the seed for the pseudo-random number #' generator. #' @param qr Whether the thin QR decomposition should be used to decorrelate the @@ -121,6 +122,26 @@ hsstan <- function(x, covs.model, penalized=NULL, family=gaussian, ## to work around rstan issue #681 validate.rstan.args(...) + if (family$family == "clogit") { + ## recode stratum as an integer, sort by stratum, generate starts, stops, S + x$stratum <- as.integer(as.factor(x$stratum)) + x <- x[order(x$stratum), ] + starts <- 1 + c(0, which(diff(x$stratum) > 0)) + stops <- c(starts[-1] - 1, nrow(x)) + S <- length(starts) + + ## recode outcome as categoric + y <- x[[model.terms$outcome]] # redefine after sorting x + ycat <- integer(S) + for(s in 1:S) { + yind <- y[starts[s]:stops[s]] + y.gt0 <- which(yind > 0) + if(length(y.gt0) != 1) + stop("stratum ", s, " has ", length(y.gt0), " observations with y > 0") + ycat[s] <- which(yind > 0) + } + } + ## parameter names not to include by default in the stanfit object hs.pars <- c("lambda", "tau", "z", "c2") if (keep.hs.pars) @@ -135,6 +156,7 @@ hsstan <- function(x, covs.model, penalized=NULL, family=gaussian, ## choose the model to be fitted model <- ifelse(length(penalized) == 0, "base", "hs") if (family$family == "binomial") model <- paste0(model, "_logit") + else if (family$family == "clogit") model <- paste0(model, "_clogit") ## set or check adapt.delta if (is.null(adapt.delta)) { @@ -145,6 +167,10 @@ hsstan <- function(x, covs.model, penalized=NULL, family=gaussian, ## create the design matrix X <- ordered.model.matrix(x, model.terms$unpenalized, model.terms$penalized) + if (family$family == "clogit") { + X <- X[, -1] # drop first column containing 1s + } + N <- nrow(X) P <- ncol(X) @@ -163,6 +189,9 @@ hsstan <- function(x, covs.model, penalized=NULL, family=gaussian, ## global scale for regularized horseshoe prior global.scale <- if (regularized) par.ratio / sqrt(N) else 1 + if (family$family == "clogit") { + global.scale <- par.ratio / sqrt(S) + } ## core block { @@ -173,8 +202,13 @@ hsstan <- function(x, covs.model, penalized=NULL, family=gaussian, global_scale=global.scale, global_df=global.df, slab_scale=slab.scale, slab_df=slab.df) + ## extra data for clogit + if (family$family == "clogit") + data.input <- c(data.input, + ycat=ycat, S=S, starts=starts, stops=stops) + ## run the stan model - samples <- rstan::sampling(stanmodels[[model]], data=data.input, + samples <- rstan::sampling(object=stanmodels[[model]], data=data.input, iter=iter, warmup=warmup, seed=seed, ..., pars=hs.pars, include=keep.hs.pars, control=list(adapt_delta=adapt.delta)) diff --git a/R/stanmodels.R b/R/stanmodels.R index 5b7eeb8..68769eb 100644 --- a/R/stanmodels.R +++ b/R/stanmodels.R @@ -1,12 +1,14 @@ # Generated by rstantools. Do not edit by hand. # names of stan models -stanmodels <- c("base", "base_logit", "hs", "hs_logit") +stanmodels <- c("base", "base_clogit", "base_logit", "hs", "hs_clogit", "hs_logit") # load each stan module Rcpp::loadModule("stan_fit4base_mod", what = TRUE) +Rcpp::loadModule("stan_fit4base_clogit_mod", what = TRUE) Rcpp::loadModule("stan_fit4base_logit_mod", what = TRUE) Rcpp::loadModule("stan_fit4hs_mod", what = TRUE) +Rcpp::loadModule("stan_fit4hs_clogit_mod", what = TRUE) Rcpp::loadModule("stan_fit4hs_logit_mod", what = TRUE) # instantiate each stanmodel object diff --git a/inst/stan/base_clogit.stan b/inst/stan/base_clogit.stan new file mode 100644 index 0000000..88bf045 --- /dev/null +++ b/inst/stan/base_clogit.stan @@ -0,0 +1,74 @@ +// +// Gaussian prior on regression coeffs (conditional logistic regression with 1 case per stratum) +// + +functions { + + int which(int[ ] x) { // return first subscript for which x[i] > 0) + int ycat; + int K; + int i; + ycat = K + 1; // assign to an out of range value + K = size(x); + i = 1; + while (ycat == K + 1 && i <= K) { + if(x[i] > 0) { + ycat = i; + } + i = i + 1; + } + return ycat; + } + +} + +data { + // prior standard deviation for the unpenalised variables + real scale_u; + + int N; // number of observations + int U; // number of unpenalized columns in model matrix + int S; // number of strata + + // store ragged array as flat array + int starts[S]; + int stops[S]; + + matrix[N, U] X; // design matrix + // int y[N]; // binary outcome + + // categorical response variable taking values in 1:lengths[s] + int ycat[S]; + +} + +transformed data { + + // categorical response variable taking values in 1:lengths[s] + // int ycat[S]; + + // for(s in 1:S) { + // ycat[s] = which(y[starts[s]:stops[s]]); + // } +} + +parameters { + // unpenalized regression parameters + vector[U] beta_u; +} + +transformed parameters { + vector[N] X_beta; + + X_beta = X * beta_u; + +} + +model { + // unpenalized coefficients including intercept + beta_u ~ normal(0, scale_u); + // conditional likelihood is evaluated by looping over strata + for (s in 1:S) { + ycat[s] ~ categorical_logit(X_beta[starts[s]:stops[s]]); + } +} diff --git a/inst/stan/hs_clogit.stan b/inst/stan/hs_clogit.stan new file mode 100644 index 0000000..4659c78 --- /dev/null +++ b/inst/stan/hs_clogit.stan @@ -0,0 +1,111 @@ +// +// Gaussian prior on regression coeffs (conditional logistic regression with 1 case per stratum) +// + +functions { +#include /chunks/hs.fun +} + +data { + // prior standard deviation for the unpenalised variables + real scale_u; + + // number of columns in model matrix + int P; + + int N; // number of observations + int U; // number of unpenalized columns in model matrix + int S; // number of strata + + // store ragged array as flat array + int starts[S]; + int stops[S]; + + matrix[N, P] X; // design matrix + + // categorical response variable taking values in 1:lengths[s] + int ycat[S]; + + // whether the regularized horseshoe should be used + int regularized; + + // degrees of freedom for the half-t priors on lambda + real nu; + + // scale for the half-t prior on tau + real global_scale; + + // degrees of freedom for the half-t prior on tau + real global_df; + + // slab scale for the regularized horseshoe + real slab_scale; + + // slab degrees of freedom for the regularized horseshoe + real slab_df; + +} + +transformed data { + + // categorical response variable taking values in 1:lengths[s] + // int ycat[S]; + + // for(s in 1:S) { + // ycat[s] = which(y[starts[s]:stops[s]]); + // } + +} + +parameters { + // unpenalized regression parameters + vector[U] beta_u; + + // global shrinkage parameter + real tau; + + // local shrinkage parameter + vector[P-U] lambda; + + // auxiliary variables + vector[P-U] z; + real c2; + +} + +transformed parameters { + + // penalized regression parameters + vector[P-U] beta_p; + + if (regularized) + beta_p = reg_hs(z, lambda, tau, slab_scale^2 * c2); + else + beta_p = hs(z, lambda, tau); + + +} + +model { + + // local variables + vector[P] beta = append_row(beta_u, beta_p); // regression coefficients + vector[N] X_beta = X * beta; // linear predictors + + // half t-priors for lambdas and tau + z ~ std_normal(); + lambda ~ student_t(nu, 0, 1); + tau ~ student_t(global_df, 0, global_scale); + + // inverse-gamma prior for c^2 + c2 ~ inv_gamma(0.5 * slab_df, 0.5 * slab_df); + + // unpenalized coefficients including intercept + beta_u ~ normal(0, scale_u); + + // conditional likelihood is evaluated by looping over strata + for (s in 1:S) { + ycat[s] ~ categorical_logit(X_beta[starts[s]:stops[s]]); + } + +} diff --git a/man/hsstan.Rd b/man/hsstan.Rd index 141dc7f..6c9acb4 100644 --- a/man/hsstan.Rd +++ b/man/hsstan.Rd @@ -39,7 +39,8 @@ penalized. If \code{NULL} or an empty vector, a model with only unpenalized covariates is fitted.} \item{family}{Type of model fitted: either \code{gaussian()} for linear regression -(default) or \code{binomial()} for logistic regression.} +(default), \code{binomial()} for logistic regression, or \code{clogit()} for +conditional logistic regression.} \item{iter}{Total number of iterations in each chain, including warmup (2000 by default).} diff --git a/man/projsel.Rd b/man/projsel.Rd index 6bee548..510857a 100644 --- a/man/projsel.Rd +++ b/man/projsel.Rd @@ -4,7 +4,13 @@ \alias{projsel} \title{Forward selection minimizing KL-divergence in projection} \usage{ -projsel(obj, max.iters = 30, start.from = NULL, out.csv = NULL) +projsel( + obj, + max.iters = 30, + start.from = NULL, + num.samples = NULL, + out.csv = NULL +) } \arguments{ \item{obj}{Object of class \code{hsstan}.} @@ -17,6 +23,8 @@ submodel. If \code{NULL} (default), selection starts from the set of unpenalized covariates if the model contains penalized predictors, otherwise selection starts from the intercept-only model.} +\item{num.samples}{Number of posterior samples to be used in projection.} + \item{out.csv}{If not \code{NULL}, the name of a CSV file to save the output to.} } diff --git a/tests/testthat/test-arguments.R b/tests/testthat/test-arguments.R index 756d45b..029c088 100644 --- a/tests/testthat/test-arguments.R +++ b/tests/testthat/test-arguments.R @@ -174,7 +174,7 @@ test_that("family is valid", expect_error(validate.family(x), "Argument of 'family' is not a valid family") expect_error(validate.family(poisson), - "Only 'gaussian' and 'binomial' are supported families") + "Only 'gaussian', 'binomial' and 'clogit' are supported families") expect_error(validate.family(binomial()), "is missing, with no default") }) @@ -187,6 +187,13 @@ test_that("invalid family inputs", "must contain 0-1 values") expect_error(validate.family(binomial(), y.binom - 1), "must contain 0-1 values") + + expect_error(validate.family(clogit(), y.gauss), + "must contain two classes") + expect_error(validate.family(clogit(), y.binom + 1), + "must contain 0-1 values") + expect_error(validate.family(clogit(), y.binom - 1), + "must contain 0-1 values") }) test_that("valid family inputs", @@ -197,6 +204,10 @@ test_that("valid family inputs", "binomial") expect_equal(validate.family("binomial", y.factr)$family, "binomial") + expect_equal(validate.family("clogit", y.binom)$family, + "clogit") + expect_equal(validate.family("clogit", y.factr)$family, + "clogit") expect_equal(validate.family("gaussian")$family, "gaussian") expect_equal(validate.family(gaussian())$family,