Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ importFrom(stats,as.formula)
importFrom(stats,binomial)
importFrom(stats,gaussian)
importFrom(stats,glm.fit)
importFrom(stats,make.link)
importFrom(stats,model.matrix)
importFrom(stats,quasibinomial)
importFrom(stats,rbinom)
Expand Down
30 changes: 25 additions & 5 deletions R/misc.R
Original file line number Diff line number Diff line change
Expand Up @@ -212,14 +212,16 @@ validate.family <- function(family, y) {
family <- family()
if (!is(family, "family"))
stop("Argument of 'family' is not a valid family.")
if (!family$family %in% c("gaussian", "binomial"))
stop("Only 'gaussian' and 'binomial' are supported families.")
if (!family$family %in% c("gaussian", "binomial", "clogit"))
stop("Only 'gaussian', 'binomial' and 'clogit' are supported families.")

if (family$family == "binomial") {
if (family$family == "binomial" || family$family == "clogit") {
if (length(table(y)) != 2)
stop("Outcome variable must contain two classes with family=binomial.")
stop("Outcome variable must contain two classes ",
"if family is 'binomial' or 'clogit'.")
if (!is.factor(y) && any(y < 0 | y > 1))
stop("Outcome variable must contain 0-1 values with family=binomial.")
stop("Outcome variable must contain 0-1 values ",
"if family is 'binomial' or 'clogit'.")
}

return(family)
Expand Down Expand Up @@ -462,6 +464,24 @@ vector.summary <- function(x, prob) {
c(mean=mean(x), sd=stats::sd(x), stats::quantile(x, c(lower, upper)))
}

#' Family object for conditional logistic models
#'
#' @param link Specification for the model link function.
#'
#' @return
#' An object of class `family`.
#'
#' @importFrom stats make.link
#' @noRd
clogit <- function(link="logit") {
if (link != "logit")
stop("Only link 'logit' is available for the clogit family.")

clogit <- binomial()
clogit$family <- "clogit"
return(clogit)
}

#' Check whether the model fitted is a logistic regression model.
#'
#' @param obj An object of class `hsstan`.
Expand Down
61 changes: 46 additions & 15 deletions R/projection.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,26 @@ lm_proj <- function(x, fit, sigma2, indproj, is.logistic) {

## logistic regression model
if (is.logistic) {

## compute the projection for each sample
par.fun <- function(s) glm.fit(xp, fit[, s],
family=quasibinomial())$coefficients
## compute the projection for each sample
par.fun <- function(s) {
qb <- glm.fit(xp, fit[, s], family=quasibinomial())$coefficients
gc() # fixes an apparent memory leak
return(qb)
}
if (.Platform$OS.type != "windows") {
wp <- parallel::mclapply(X=1:S, mc.preschedule=TRUE, FUN=par.fun)

#wp <- vector("list", length = S)
#for(s in 1:S) {
# wp[[s]] <- par.fun(s)
#}

} else { # windows
cl <- parallel::makePSOCKcluster(getOption("mc.cores", 1))
on.exit(parallel::stopCluster(cl))
wp <- parallel::parLapply(X=1:S, cl=cl, fun=par.fun)
}
wp <- matrix(unlist(wp, use.names=FALSE), ncol=S)
wp <- matrix(unlist(wp, use.names=FALSE), ncol=S) # P x S matrix

## estimate the KL divergence between full and projected model
fitp <- binomial()$linkinv(multiplyAB(xp, wp))
Expand All @@ -76,7 +84,7 @@ lm_proj <- function(x, fit, sigma2, indproj, is.logistic) {

## reshape wp so that it has same dimension (P x S) as t(x), and zeros for
## those variables that are not included in the projected model
wp.all <- matrix(0, P, S)
wp.all <- matrix(data=0, nrow=P, ncol=S)
wp.all[indproj, ] <- wp
return(list(w=wp.all, sigma2=sigma2p, fit=fitp, kl=kl))
}
Expand Down Expand Up @@ -170,6 +178,7 @@ elpd <- function(yt, eta, sigma2, logistic) {
#' submodel. If `NULL` (default), selection starts from the set of
#' unpenalized covariates if the model contains penalized predictors,
#' otherwise selection starts from the intercept-only model.
#' @param num.samples Number of posterior samples to be used in projection.
#' @param out.csv If not `NULL`, the name of a CSV file to save the
#' output to.
#'
Expand Down Expand Up @@ -200,13 +209,21 @@ elpd <- function(yt, eta, sigma2, logistic) {
#' @importFrom utils write.csv
#' @export
projsel <- function(obj, max.iters=30, start.from=NULL,
out.csv=NULL) {
num.samples=NULL, out.csv=NULL) {
validate.hsstan(obj)
validate.samples(obj)

x <- xt <- validate.newdata(obj, obj$data)
if (obj$family$family == "clogit") {
## first col is stratum, which should be dropped
## there is no intercept column in data
x <- x[, -1] # drop intercept column
}

yt <- obj$data[[obj$model.terms$outcome]]
is.logistic <- is.logistic(obj)
if(obj$family$family == "clogit")
is.logistic <- TRUE
sigma2 <- if (is.logistic) 1 else as.matrix(obj$stanfit, pars="sigma")^2

## set of variables in the initial submodel
Expand All @@ -226,14 +243,25 @@ projsel <- function(obj, max.iters=30, start.from=NULL,

## fitted values for the full model (N x S)
fit <- t(posterior_linpred(obj, transform=TRUE))

if (!is.null(num.samples)) {
fit <- fit[, 1:num.samples]
if (obj$family$family == "gaussian") {
sigma2 <- sigma2[1:num.samples, ]
}
}

## intercept only model
sub <- fit.submodel(x, sigma2, fit, 1, xt, yt, is.logistic)
kl.elpd <- cbind(sub$kl, sub$elpd)
report.iter("Intercept only", sub$kl, sub$elpd)
if (obj$family$family != "clogit") {
## intercept only model
sub <- fit.submodel(x, sigma2, fit, 1, xt, yt, is.logistic)
kl.elpd <- cbind(sub$kl, sub$elpd)
report.iter("Intercept only", sub$kl, sub$elpd)
} else {
kl.elpd <- NULL
}

## initial submodel with the set of chosen covariates
if (length(start.idx) > 1) {
if (length(start.idx) > 1 || obj$family$family == "clogit") {
sub <- fit.submodel(x, sigma2, fit, chosen, xt, yt, is.logistic)
kl.elpd <- rbind(kl.elpd, c(sub$kl, sub$elpd))
report.iter("Initial submodel", sub$kl, sub$elpd)
Expand All @@ -259,9 +287,12 @@ projsel <- function(obj, max.iters=30, start.from=NULL,
if (length(rel.kl) == 2 && is.nan(rel.kl[2]))
rel.kl[2] <- 1

res <- data.frame(var=c("Intercept only",
if (length(start.idx) > 1) "Initial submodel",
colnames(x)[setdiff(chosen, start.idx)]),
var.names <- c(if(length(start.idx) > 1) "Initial submodel",
colnames(x)[setdiff(chosen, start.idx)])
if(obj$family$family != "clogit") {
var.names <- c("Intercept only", var.names)
}
res <- data.frame(var=var.names,
kl=kl,
rel.kl.null=1 - kl / kl[1],
rel.kl=rel.kl,
Expand Down
38 changes: 36 additions & 2 deletions R/stan.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@
#' penalized. If `NULL` or an empty vector, a model with only unpenalized
#' covariates is fitted.
#' @param family Type of model fitted: either `gaussian()` for linear regression
#' (default) or `binomial()` for logistic regression.
#' (default), `binomial()` for logistic regression, or `clogit()` for
#' conditional logistic regression.
#' @param seed Optional integer defining the seed for the pseudo-random number
#' generator.
#' @param qr Whether the thin QR decomposition should be used to decorrelate the
Expand Down Expand Up @@ -121,6 +122,26 @@ hsstan <- function(x, covs.model, penalized=NULL, family=gaussian,
## to work around rstan issue #681
validate.rstan.args(...)

if (family$family == "clogit") {
## recode stratum as an integer, sort by stratum, generate starts, stops, S
x$stratum <- as.integer(as.factor(x$stratum))
x <- x[order(x$stratum), ]
starts <- 1 + c(0, which(diff(x$stratum) > 0))
stops <- c(starts[-1] - 1, nrow(x))
S <- length(starts)

## recode outcome as categoric
y <- x[[model.terms$outcome]] # redefine after sorting x
ycat <- integer(S)
for(s in 1:S) {
yind <- y[starts[s]:stops[s]]
y.gt0 <- which(yind > 0)
if(length(y.gt0) != 1)
stop("stratum ", s, " has ", length(y.gt0), " observations with y > 0")
ycat[s] <- which(yind > 0)
}
}

## parameter names not to include by default in the stanfit object
hs.pars <- c("lambda", "tau", "z", "c2")
if (keep.hs.pars)
Expand All @@ -135,6 +156,7 @@ hsstan <- function(x, covs.model, penalized=NULL, family=gaussian,
## choose the model to be fitted
model <- ifelse(length(penalized) == 0, "base", "hs")
if (family$family == "binomial") model <- paste0(model, "_logit")
else if (family$family == "clogit") model <- paste0(model, "_clogit")

## set or check adapt.delta
if (is.null(adapt.delta)) {
Expand All @@ -145,6 +167,10 @@ hsstan <- function(x, covs.model, penalized=NULL, family=gaussian,

## create the design matrix
X <- ordered.model.matrix(x, model.terms$unpenalized, model.terms$penalized)
if (family$family == "clogit") {
X <- X[, -1] # drop first column containing 1s
}

N <- nrow(X)
P <- ncol(X)

Expand All @@ -163,6 +189,9 @@ hsstan <- function(x, covs.model, penalized=NULL, family=gaussian,

## global scale for regularized horseshoe prior
global.scale <- if (regularized) par.ratio / sqrt(N) else 1
if (family$family == "clogit") {
global.scale <- par.ratio / sqrt(S)
}

## core block
{
Expand All @@ -173,8 +202,13 @@ hsstan <- function(x, covs.model, penalized=NULL, family=gaussian,
global_scale=global.scale, global_df=global.df,
slab_scale=slab.scale, slab_df=slab.df)

## extra data for clogit
if (family$family == "clogit")
data.input <- c(data.input,
ycat=ycat, S=S, starts=starts, stops=stops)

## run the stan model
samples <- rstan::sampling(stanmodels[[model]], data=data.input,
samples <- rstan::sampling(object=stanmodels[[model]], data=data.input,
iter=iter, warmup=warmup, seed=seed, ...,
pars=hs.pars, include=keep.hs.pars,
control=list(adapt_delta=adapt.delta))
Expand Down
4 changes: 3 additions & 1 deletion R/stanmodels.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
# Generated by rstantools. Do not edit by hand.

# names of stan models
stanmodels <- c("base", "base_logit", "hs", "hs_logit")
stanmodels <- c("base", "base_clogit", "base_logit", "hs", "hs_clogit", "hs_logit")

# load each stan module
Rcpp::loadModule("stan_fit4base_mod", what = TRUE)
Rcpp::loadModule("stan_fit4base_clogit_mod", what = TRUE)
Rcpp::loadModule("stan_fit4base_logit_mod", what = TRUE)
Rcpp::loadModule("stan_fit4hs_mod", what = TRUE)
Rcpp::loadModule("stan_fit4hs_clogit_mod", what = TRUE)
Rcpp::loadModule("stan_fit4hs_logit_mod", what = TRUE)

# instantiate each stanmodel object
Expand Down
74 changes: 74 additions & 0 deletions inst/stan/base_clogit.stan
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
//
// Gaussian prior on regression coeffs (conditional logistic regression with 1 case per stratum)
//

functions {

int which(int[ ] x) { // return first subscript for which x[i] > 0)
int ycat;
int K;
int i;
ycat = K + 1; // assign to an out of range value
K = size(x);
i = 1;
while (ycat == K + 1 && i <= K) {
if(x[i] > 0) {
ycat = i;
}
i = i + 1;
}
return ycat;
}

}

data {
// prior standard deviation for the unpenalised variables
real <lower=0> scale_u;

int N; // number of observations
int U; // number of unpenalized columns in model matrix
int S; // number of strata

// store ragged array as flat array
int starts[S];
int stops[S];

matrix[N, U] X; // design matrix
// int <lower=0, upper=1> y[N]; // binary outcome

// categorical response variable taking values in 1:lengths[s]
int<lower=1> ycat[S];

}

transformed data {

// categorical response variable taking values in 1:lengths[s]
// int<lower=1> ycat[S];

// for(s in 1:S) {
// ycat[s] = which(y[starts[s]:stops[s]]);
// }
}

parameters {
// unpenalized regression parameters
vector[U] beta_u;
}

transformed parameters {
vector[N] X_beta;

X_beta = X * beta_u;

}

model {
// unpenalized coefficients including intercept
beta_u ~ normal(0, scale_u);
// conditional likelihood is evaluated by looping over strata
for (s in 1:S) {
ycat[s] ~ categorical_logit(X_beta[starts[s]:stops[s]]);
}
}
Loading