mcol · pmckeigue · Jul 8, 2020 · Jul 14, 2020 · Jul 23, 2020 · Jul 24, 2020
diff --git a/NAMESPACE b/NAMESPACE
@@ -48,6 +48,7 @@ importFrom(stats,as.formula)
 importFrom(stats,binomial)
 importFrom(stats,gaussian)
 importFrom(stats,glm.fit)
+importFrom(stats,make.link)
 importFrom(stats,model.matrix)
 importFrom(stats,quasibinomial)
 importFrom(stats,rbinom)

diff --git a/R/misc.R b/R/misc.R
@@ -212,14 +212,16 @@ validate.family <- function(family, y) {
         family <- family()
     if (!is(family, "family"))
         stop("Argument of 'family' is not a valid family.")
-    if (!family$family %in% c("gaussian", "binomial"))
-        stop("Only 'gaussian' and 'binomial' are supported families.")
+    if (!family$family %in% c("gaussian", "binomial", "clogit"))
+        stop("Only 'gaussian', 'binomial' and 'clogit' are supported families.")
 
-    if (family$family == "binomial") {
+    if (family$family == "binomial" || family$family == "clogit") {
         if (length(table(y)) != 2)
-            stop("Outcome variable must contain two classes with family=binomial.")
+            stop("Outcome variable must contain two classes ",
+                 "if family is 'binomial' or 'clogit'.")
         if (!is.factor(y) && any(y < 0 | y > 1))
-            stop("Outcome variable must contain 0-1 values with family=binomial.")
+            stop("Outcome variable must contain 0-1 values ",
+                 "if family is 'binomial' or 'clogit'.")
     }
 
     return(family)
@@ -462,6 +464,24 @@ vector.summary <- function(x, prob) {
     c(mean=mean(x), sd=stats::sd(x), stats::quantile(x, c(lower, upper)))
 }
 
+#' Family object for conditional logistic models
+#'
+#' @param link Specification for the model link function.
+#'
+#' @return
+#' An object of class `family`.
+#'
+#' @importFrom stats make.link
+#' @noRd
+clogit <- function(link="logit") {
+    if (link != "logit")
+        stop("Only link 'logit' is available for the clogit family.")
+
+    clogit <- binomial()
+    clogit$family <- "clogit"
+    return(clogit)
+}
+
 #' Check whether the model fitted is a logistic regression model.
 #'
 #' @param obj An object of class `hsstan`.

diff --git a/R/projection.R b/R/projection.R
@@ -42,18 +42,26 @@ lm_proj <- function(x, fit, sigma2, indproj, is.logistic) {
 
     ## logistic regression model
     if (is.logistic) {
-
-        ## compute the projection for each sample
-        par.fun <- function(s) glm.fit(xp, fit[, s],
-                                       family=quasibinomial())$coefficients
+         ## compute the projection for each sample
+        par.fun <- function(s) {
+            qb <- glm.fit(xp, fit[, s], family=quasibinomial())$coefficients
+            gc() # fixes an apparent memory leak
+            return(qb)
+        }
         if (.Platform$OS.type != "windows") {
             wp <- parallel::mclapply(X=1:S, mc.preschedule=TRUE, FUN=par.fun)
+
+            #wp <- vector("list", length = S) 
+            #for(s in 1:S) {
+            #    wp[[s]] <- par.fun(s)
+            #}
+
         } else { # windows
             cl <- parallel::makePSOCKcluster(getOption("mc.cores", 1))
             on.exit(parallel::stopCluster(cl))
             wp <- parallel::parLapply(X=1:S, cl=cl, fun=par.fun)
         }
-        wp <- matrix(unlist(wp, use.names=FALSE), ncol=S)
+        wp <- matrix(unlist(wp, use.names=FALSE), ncol=S) # P x S matrix 
 
         ## estimate the KL divergence between full and projected model
         fitp <- binomial()$linkinv(multiplyAB(xp, wp))
@@ -76,7 +84,7 @@ lm_proj <- function(x, fit, sigma2, indproj, is.logistic) {
 
     ## reshape wp so that it has same dimension (P x S) as t(x), and zeros for
     ## those variables that are not included in the projected model
-    wp.all <- matrix(0, P, S)
+    wp.all <- matrix(data=0, nrow=P, ncol=S)
     wp.all[indproj, ] <- wp
     return(list(w=wp.all, sigma2=sigma2p, fit=fitp, kl=kl))
 }
@@ -170,6 +178,7 @@ elpd <- function(yt, eta, sigma2, logistic) {
 #'        submodel. If `NULL` (default), selection starts from the set of
 #'        unpenalized covariates if the model contains penalized predictors,
 #'        otherwise selection starts from the intercept-only model.
+#' @param num.samples Number of posterior samples to be used in projection.
 #' @param out.csv If not `NULL`, the name of a CSV file to save the
 #'        output to.
 #'
@@ -200,13 +209,21 @@ elpd <- function(yt, eta, sigma2, logistic) {
 #' @importFrom utils write.csv
 #' @export
 projsel <- function(obj, max.iters=30, start.from=NULL,
-                    out.csv=NULL) {
+                    num.samples=NULL, out.csv=NULL) {
     validate.hsstan(obj)
     validate.samples(obj)
 
     x <- xt <- validate.newdata(obj, obj$data)
+    if (obj$family$family == "clogit") {
+        ## first col is stratum, which should be dropped
+        ## there is no intercept column in data
+        x <- x[, -1] # drop intercept column
+    }
+
     yt <- obj$data[[obj$model.terms$outcome]]
     is.logistic <- is.logistic(obj)
+    if(obj$family$family == "clogit")
+        is.logistic <- TRUE
     sigma2 <- if (is.logistic) 1 else as.matrix(obj$stanfit, pars="sigma")^2
 
     ## set of variables in the initial submodel
@@ -226,14 +243,25 @@ projsel <- function(obj, max.iters=30, start.from=NULL,
 
     ## fitted values for the full model (N x S)
     fit <- t(posterior_linpred(obj, transform=TRUE))
+
+    if (!is.null(num.samples)) {
+        fit <- fit[, 1:num.samples]
+        if (obj$family$family == "gaussian") {
+            sigma2 <- sigma2[1:num.samples, ]
+        }
+    }
 
-    ## intercept only model
-    sub <- fit.submodel(x, sigma2, fit, 1, xt, yt, is.logistic)
-    kl.elpd <- cbind(sub$kl, sub$elpd)
-    report.iter("Intercept only", sub$kl, sub$elpd)
+    if (obj$family$family != "clogit") {
+        ## intercept only model
+        sub <- fit.submodel(x, sigma2, fit, 1, xt, yt, is.logistic)
+        kl.elpd <- cbind(sub$kl, sub$elpd)
+        report.iter("Intercept only", sub$kl, sub$elpd)
+    } else {
+        kl.elpd <- NULL
+    }
 
     ## initial submodel with the set of chosen covariates
-    if (length(start.idx) > 1) {
+    if (length(start.idx) > 1 || obj$family$family == "clogit") {
         sub <- fit.submodel(x, sigma2, fit, chosen, xt, yt, is.logistic)
         kl.elpd <- rbind(kl.elpd, c(sub$kl, sub$elpd))
         report.iter("Initial submodel", sub$kl, sub$elpd)
@@ -259,9 +287,12 @@ projsel <- function(obj, max.iters=30, start.from=NULL,
     if (length(rel.kl) == 2 && is.nan(rel.kl[2]))
         rel.kl[2] <- 1
 
-    res <- data.frame(var=c("Intercept only",
-                            if (length(start.idx) > 1) "Initial submodel",
-                            colnames(x)[setdiff(chosen, start.idx)]),
+    var.names <- c(if(length(start.idx) > 1) "Initial submodel",
+                   colnames(x)[setdiff(chosen, start.idx)])
+    if(obj$family$family != "clogit") {
+        var.names <- c("Intercept only", var.names)
+    }
+    res <- data.frame(var=var.names,
                       kl=kl,
                       rel.kl.null=1 - kl / kl[1],
                       rel.kl=rel.kl,

diff --git a/R/stan.R b/R/stan.R
@@ -36,7 +36,8 @@
 #'        penalized. If `NULL` or an empty vector, a model with only unpenalized
 #'        covariates is fitted.
 #' @param family Type of model fitted: either `gaussian()` for linear regression
-#'        (default) or `binomial()` for logistic regression.
+#'        (default), `binomial()` for logistic regression, or `clogit()` for
+#'        conditional logistic regression.
 #' @param seed Optional integer defining the seed for the pseudo-random number
 #'        generator.
 #' @param qr Whether the thin QR decomposition should be used to decorrelate the
@@ -121,6 +122,26 @@ hsstan <- function(x, covs.model, penalized=NULL, family=gaussian,
     ## to work around rstan issue #681
     validate.rstan.args(...)
 
+    if (family$family == "clogit") {
+        ## recode stratum as an integer, sort by stratum, generate starts, stops, S
+        x$stratum <- as.integer(as.factor(x$stratum))
+        x <- x[order(x$stratum), ]
+        starts <- 1 + c(0, which(diff(x$stratum) > 0))
+        stops <- c(starts[-1] - 1, nrow(x))
+        S <- length(starts)
+
+        ## recode outcome as categoric
+        y <- x[[model.terms$outcome]] # redefine after sorting x
+        ycat <- integer(S)
+        for(s in 1:S) {
+            yind <- y[starts[s]:stops[s]]
+            y.gt0 <- which(yind > 0)
+            if(length(y.gt0) != 1)
+                stop("stratum ", s, " has ", length(y.gt0),  " observations with y > 0")
+            ycat[s] <- which(yind > 0)
+        }
+    }
+
     ## parameter names not to include by default in the stanfit object
     hs.pars <- c("lambda", "tau", "z", "c2")
     if (keep.hs.pars)
@@ -135,6 +156,7 @@ hsstan <- function(x, covs.model, penalized=NULL, family=gaussian,
     ## choose the model to be fitted
     model <- ifelse(length(penalized) == 0, "base", "hs")
     if (family$family == "binomial") model <- paste0(model, "_logit")
+    else if (family$family == "clogit") model <- paste0(model, "_clogit")
 
     ## set or check adapt.delta
     if (is.null(adapt.delta)) {
@@ -145,6 +167,10 @@ hsstan <- function(x, covs.model, penalized=NULL, family=gaussian,
 
     ## create the design matrix
     X <- ordered.model.matrix(x, model.terms$unpenalized, model.terms$penalized)
+    if (family$family == "clogit") {
+        X <- X[, -1] # drop first column containing 1s
+    }
+
     N <- nrow(X)
     P <- ncol(X)
 
@@ -163,6 +189,9 @@ hsstan <- function(x, covs.model, penalized=NULL, family=gaussian,
 
     ## global scale for regularized horseshoe prior
     global.scale <- if (regularized) par.ratio / sqrt(N) else 1
+    if (family$family == "clogit") {
+        global.scale <- par.ratio / sqrt(S)
+    }
 
     ## core block
     {
@@ -173,8 +202,13 @@ hsstan <- function(x, covs.model, penalized=NULL, family=gaussian,
                            global_scale=global.scale, global_df=global.df,
                            slab_scale=slab.scale, slab_df=slab.df)
 
+        ## extra data for clogit
+        if (family$family == "clogit")
+            data.input <- c(data.input,
+                            ycat=ycat, S=S, starts=starts, stops=stops)
+
         ## run the stan model
-        samples <- rstan::sampling(stanmodels[[model]], data=data.input,
+        samples <- rstan::sampling(object=stanmodels[[model]], data=data.input,
                                    iter=iter, warmup=warmup, seed=seed, ...,
                                    pars=hs.pars, include=keep.hs.pars,
                                    control=list(adapt_delta=adapt.delta))

diff --git a/R/stanmodels.R b/R/stanmodels.R
@@ -1,12 +1,14 @@
 # Generated by rstantools.  Do not edit by hand.
 
 # names of stan models
-stanmodels <- c("base", "base_logit", "hs", "hs_logit")
+stanmodels <- c("base", "base_clogit", "base_logit", "hs", "hs_clogit", "hs_logit")
 
 # load each stan module
 Rcpp::loadModule("stan_fit4base_mod", what = TRUE)
+Rcpp::loadModule("stan_fit4base_clogit_mod", what = TRUE)
 Rcpp::loadModule("stan_fit4base_logit_mod", what = TRUE)
 Rcpp::loadModule("stan_fit4hs_mod", what = TRUE)
+Rcpp::loadModule("stan_fit4hs_clogit_mod", what = TRUE)
 Rcpp::loadModule("stan_fit4hs_logit_mod", what = TRUE)
 
 # instantiate each stanmodel object

diff --git a/inst/stan/base_clogit.stan b/inst/stan/base_clogit.stan
@@ -0,0 +1,74 @@
+//
+// Gaussian prior on regression coeffs (conditional logistic regression with 1 case per stratum)
+//
+
+functions {
+
+  int which(int[ ] x) { // return first subscript for which x[i] > 0)
+    int ycat;
+    int K; 
+    int i;
+    ycat = K + 1; // assign to an out of range value
+    K = size(x);
+    i = 1;
+    while (ycat == K + 1 && i <= K) {
+      if(x[i] > 0) {
+	ycat = i;
+      }
+      i = i + 1;
+    }
+    return ycat;
+  }
+
+}
+
+data {
+  // prior standard deviation for the unpenalised variables
+  real <lower=0> scale_u;
+
+  int N; // number of observations
+  int U; // number of unpenalized columns in model matrix
+   int S;  // number of strata
+
+  // store ragged array as flat array
+  int starts[S];
+  int stops[S];
+
+  matrix[N, U] X;   // design matrix
+  // int <lower=0, upper=1> y[N]; // binary outcome
+
+  // categorical response variable taking values in 1:lengths[s]
+  int<lower=1> ycat[S];
+
+}
+
+transformed data {
+
+  // categorical response variable taking values in 1:lengths[s]
+  //  int<lower=1> ycat[S];
+
+  //  for(s in 1:S) {
+  //   ycat[s] = which(y[starts[s]:stops[s]]);
+  // }
+}
+
+parameters {
+  // unpenalized regression parameters
+  vector[U] beta_u;
+}
+
+transformed parameters {
+  vector[N] X_beta;
+
+  X_beta = X * beta_u;
+
+}
+
+model {
+  // unpenalized coefficients including intercept
+  beta_u ~ normal(0, scale_u);
+  // conditional likelihood is evaluated by looping over strata
+  for (s in 1:S) {
+    ycat[s] ~ categorical_logit(X_beta[starts[s]:stops[s]]);
+  }
+}