# bayes_linear_regression.R

library(Matrix)
library(MASS)
require(geoR)
require(mvtnorm)
# Least squares model with an uninformative prior.
# Provide a data frame together with the target column
# and the explanatory variable columns.
# Note X is the n x k data matrix and we need n > k
# in order for the posterior to be proper.
bayes_least_squares_uninform_prior <- function(data, explainCol, targetCol, zscale=FALSE) {
  # Fit the linear model Y = X B + e by least squares and package the
  # quantities of the posterior under the uninformative prior
  #   p(Beta, sigma | X) = 1/sigma^2
  #   p(Beta | sigma, y) \propto N(\hat{\beta}, V_\beta \sigma^2)
  #   \hat{\beta} = (X'X)^-1 X' y
  #   V_\beta     = (X'X)^-1
  #
  # Arguments:
  #   data        data frame (or matrix) holding all columns
  #   explainCol  names/indices of the explanatory columns (the X matrix)
  #   targetCol   name/index of the target column (the Y matrix)
  #   zscale      if TRUE, X and Y are standardised with scale() before fitting
  #
  # Returns a list holding the fitted quantities (B, V, sSqr, sigma, n, k, nu,
  # X, Y, the centring/scaling statistics and isScaled) plus a set of
  # functions. Every function takes the model list as its first argument,
  # e.g. model$predictY(model, newX).
  #
  # No intercept column is added; include one in `data` if it is wanted.

  # drop = FALSE keeps single-column selections as one-column matrices
  X <- as.matrix(data[, explainCol, drop = FALSE])
  Y <- as.matrix(data[, targetCol, drop = FALSE])

  # column statistics of the raw data; scale() below uses the same values
  yc <- apply(Y, 2, mean)
  ys <- apply(Y, 2, sd)
  xc <- apply(X, 2, mean)
  xs <- apply(X, 2, sd)

  scaled <- isTRUE(as.logical(zscale))
  if (scaled) {
    Y <- scale(Y)
    X <- scale(X)
  }

  n <- nrow(X)
  k <- ncol(X)

  # X is an n x k matrix.
  # The QR decomposition writes X[, pivot] = Q R with Q orthogonal (n x n
  # when complete) and R upper triangular (n x k when complete).
  # qr(LAPACK = TRUE) always pivots the columns, so everything derived from
  # R is in pivoted column order and has to be mapped back.
  QR <- qr(X, LAPACK=TRUE)
  pivot <- QR$pivot
  r <- qr.R(QR, complete=TRUE)
  r1 <- ginv(r)
  q <- qr.Q(QR, complete=TRUE)

  # V = (X'X)^-1 = R^-1 (R^-1)', restored to the column order of X
  V <- matrix(0, k, k)
  V[pivot, pivot] <- r1 %*% t(r1)
  # solve for B via RB = Q'y, restored to the column order of X
  B <- matrix(0, k, ncol(Y))
  B[pivot, ] <- qr.solve(r, t(q) %*% Y)
  dimnames(V) <- list(colnames(X), colnames(X))
  dimnames(B) <- list(colnames(X), colnames(Y))

  sigma <- sd(as.vector(Y))
  # residual variance estimate s^2 = RSS / |n - k|
  res <- Y - X %*% B
  sSqr <- (1/(abs(n-k)) * t(res) %*% res)[1,1]

  result <- list()
  result$isScaled <- scaled
  result$sSqr <- sSqr
  result$sigma <- sigma
  result$n <- n
  result$k <- k
  result$nu <- n - k
  result$V <- V
  result$B <- B
  result$X <- X
  result$Y <- Y
  result$ycenter <- yc
  result$yscale <- ys
  result$xcenter <- xc
  result$xscale <- xs

  # Column-wise (M - center) / spread. Always returns a matrix, also for a
  # single row, so that later matrix products keep working.
  standardise <- function(M, center, spread) {
    M <- as.matrix(M)
    sweep(sweep(M, 2, center, "-"), 2, spread, "/")
  }

  ## standardise other Y data with the model's centre and scale.
  ## Returns otherY untouched when the model is not scaled.
  result$scaleY <- function(result, otherY) {
    if (result$isScaled == TRUE) {
      return(standardise(otherY, result$ycenter, result$yscale))
    }
    otherY
  }

  ## standardise other X data with the model's centre and scale.
  ## Returns otherX untouched when the model is not scaled.
  result$scaleX <- function(result, otherX) {
    if (result$isScaled == TRUE) {
      return(standardise(otherX, result$xcenter, result$xscale))
    }
    otherX
  }

  ## predict the Y target variable given the
  ## evidence data X (matrix or data frame, columns ordered as in the fit).
  ## For a scaled model the prediction is on the scaled Y axis.
  result$predictY <- function(result, newX) {
    newX <- as.matrix(result$scaleX(result, newX))
    # Y <- X B
    newX %*% result$B
  }

  # the posterior of sigma: geoR's dinvchisq evaluated at sigmaIn with
  # |nu| degrees of freedom and scale s^2. abs() mirrors randSigmaPosterior
  # and the |n - k| used for s^2.
  result$sigmaPosterior <- function(result, sigmaIn) {
    dinvchisq(sigmaIn, df=abs(result$nu), scale=result$sSqr)
  }
  # the random sigma posterior generated by simulations (N draws)
  result$randSigmaPosterior <- function(result, N) {
    rinvchisq(N, df=(abs(result$nu)), scale=result$sSqr)
  }
  # the random beta posterior generated by simulation:
  # N draws from N(B, sigmaHat * V), one draw per row
  result$randBetaPosterior <- function(result, sigmaHat, N) {
    rmvnorm(N, result$B, (sigmaHat * result$V))
  }

  # the confidence interval for Beta at the given alpha level
  # note that the beta posterior is a multivariate t distribution with
  # nu df centered on beta hat and scaled by V s^2
  # Returns a list: beta, betaU, betaL, upperQuartile, lowerQuartile, df, Cov.
  # NOTE(review): the quantiles are taken from a distribution that is already
  # shifted by beta (delta) and are then added to beta again after scaling
  # by sqrt(sSqr); kept exactly as before - confirm the intended interval.
  result$betaConfInterval <- function(result, sigmaHat, alpha) {
    Cov <- sigmaHat * result$V
    beta <- result$B
    centre <- as.numeric(beta)
    a <- qmvt(alpha/2, delta=centre, sigma=Cov, tail="lower.tail", df=result$nu)
    b <- qmvt(alpha/2, delta=centre, sigma=Cov, tail="upper.tail", df=result$nu)
    u <- max(a$quantile, b$quantile)
    l <- min(a$quantile, b$quantile)
    s <- sqrt(result$sSqr)
    conf <- list()
    conf$beta <- beta
    conf$betaU <- beta + u * s
    conf$betaL <- beta - l * s
    conf$upperQuartile <- u
    conf$lowerQuartile <- l
    conf$df <- result$nu
    conf$Cov <- Cov
    conf
  }

  ## retrieve the random posterior density for beta
  ## Draws N betas with scale sigmaHat and attaches, per draw, the value
  ## pmvt() returns for lower = draw (upper left at its default).
  ## Returns a data frame (k beta columns + prob) sorted by decreasing prob;
  ## `names`, when given, must name all k + 1 columns.
  result$betaConfData <- function(result, sigmaHat, N, names=NULL) {
    Cov <- sigmaHat * result$V
    centre <- as.numeric(result$B)
    # sample with the same sigmaHat that defines Cov
    rBeta <- result$randBetaPosterior(result, sigmaHat, N)
    # rBeta has N rows and k columns
    P <- vapply(seq_len(nrow(rBeta)), function(i) {
      p <- pmvt(lower=rBeta[i,], delta=centre, sigma=Cov, df=result$nu)
      p[1]
    }, numeric(1))

    d2 <- data.frame(rBeta)
    d2$prob <- P
    d2 <- d2[order(-d2$prob),]
    if (!is.null(names)) {
      colnames(d2) <- names
    }
    d2
  }


  ## apply the update rules to the model given the
  ## new Y data and the newX data. Returns a new model list; the input
  ## model is not modified.
  result$applyUpdateRules <- function(result, newY, newX) {
    Y <- as.matrix(newY)
    X <- as.matrix(newX)
    yc <- apply(Y, 2, mean)
    ys <- apply(Y, 2, sd)
    xc <- apply(X, 2, mean)
    xs <- apply(X, 2, sd)

    # a scaled model standardises the new batch with the batch's own statistics
    if (result$isScaled) {
      Y <- scale(Y)
      X <- scale(X)
    }

    n1 <- result$n
    n2 <- nrow(X)

    # Combine two groups given their sizes, means and standard deviations:
    # n-weighted mean, and the spread measured around that combined mean.
    pool <- function(mu1, sd1, mu2, sd2) {
      mu <- (n1 * mu1 + n2 * mu2) / (n1 + n2)
      v <- (n1 * (sd1^2 + (mu1 - mu)^2) + n2 * (sd2^2 + (mu2 - mu)^2)) / (n1 + n2)
      list(center = mu, scale = sqrt(v))
    }

    # Update the scale parameters
    yPool <- pool(result$ycenter, result$yscale, yc, ys)
    xPool <- pool(result$xcenter, result$xscale, xc, xs)

    ## updating the S parameter
    ## snew = s^2 + 1/(Nnew - k) (Y_new - X_newB)'(Y_new - X_newB)
    res <- Y - X %*% result$B
    sSqr <- result$sSqr + 1/(n2 - ncol(X)) * t(res) %*% res
    nu <- result$nu + (n2 - ncol(X))

    # combine the variance to estimate sigma (stored as a standard
    # deviation, like the constructor does)
    sd2 <- apply(Y, 2, sd)
    s2 <- sd2^2
    mu2 <- mean(Y)
    sigma <- pool(mean(result$Y), result$sigma, mu2, sd2)$scale

    # NOTE(review): this beta update is kept as it was. It combines the old
    # B with a vector holding mean(Y) and never uses X; the first factor
    # inverts Lambda0 + Cov while the second inverts Cov - confirm intent.
    I <- diag(x=1, nrow=nrow(result$V), ncol=ncol(result$V))
    Mu0 <- result$B
    Lambda0 <- result$V
    Cov <- (s2 * I)
    Lambda2 <- Lambda0 + Cov
    MeanVec <- as.matrix(rep(1, nrow(Mu0)) * mu2)
    Mu2 <- ginv(ginv(Lambda0) + n2*ginv(Lambda2)) %*% (ginv(Lambda0)%*%Mu0 + n2*ginv(Cov)%*%MeanVec)
    Mu2 <- as.matrix(Mu2)
    dimnames(Mu2) <- dimnames(Mu0)

    result2 <- list()
    result2$isScaled <- result$isScaled
    result2$ycenter <- yPool$center
    result2$yscale <- yPool$scale
    result2$xcenter <- xPool$center
    result2$xscale <- xPool$scale
    result2$nu <- nu
    result2$sigma <- sigma
    result2$sigma2 <- sigma^2
    result2$sSqr <- sSqr[1,1]
    result2$n <- n1 + n2
    result2$k <- result$k
    result2$V <- Lambda2
    result2$B <- Mu2

    # store the additional data
    result2$X <- rbind(result$X, X)
    result2$Y <- rbind(result$Y, Y)

    # assign the functions to the model
    for (nm in names(result)) {
      if (is.function(result[[nm]])) {
        result2[[nm]] <- result[[nm]]
      }
    }

    # return updated model
    result2
  }

  # this is the joint posterior distribution of
  # y_hat \sim N(X_hat B, \sigma^2 I)
  # this method can be used for simulation as it draws the
  # parameters from their respective posterior distributions
  # Returns N rows, one simulated Y vector per row.
  result$yJointPosteriorSimulate <- function(result, newX, N) {
    newX <- as.matrix(result$scaleX(result, newX))
    sigmaHat <- result$randSigmaPosterior(result, 1)
    bhat <- result$randBetaPosterior(result, sigmaHat, 1)
    xMu <- newX %*% t(bhat)
    m <- nrow(xMu)
    # sigmaHat is used as a variance (it also scales V in randBetaPosterior),
    # so the covariance handed to rmvnorm is sigmaHat * I
    covY <- sigmaHat * diag(x=1, m, m)
    rmvnorm(N, as.numeric(xMu), as.matrix(covY))
  }

  # this is the predictive distribution of
  # p(y_hat|y) which is t distribution with center X_hatB
  # and scale s^2(I + X_hatVX_hat') and n - k degrees freedom where n > k
  # this method will be used for prediction as it uses
  # the parameter values derived from the estimate
  # this method will simulate y using the known parameters
  result$yPredictivePosteriorSimulate <- function(result, newX, N) {
    newX <- as.matrix(result$scaleX(result, newX))
    df <- result$nu
    xMu <- newX %*% as.matrix(result$B)
    var <- newX %*% result$V %*% t(newX)
    I <- diag(x=1, nrow=nrow(var), ncol=ncol(var))
    sigma <- result$sSqr * (I + var)
    rmvt(N, sigma=sigma, df=df, delta=as.numeric(xMu), type="shifted")
  }

  ## compute the expected log predictive density
  ## Each y_i is scored with the univariate t density with nu df, centre
  ## x_i B and squared scale s^2 (1 + x_i V x_i').
  ## Returns a list with lppd, waic and eppd = lppd - waic.
  ## NOTE(review): the waic term contrasts log(mean(p)) with mean(log(p))
  ## over the data points, as before - confirm this is the intended penalty.
  result$Eppd <- function(result, inY, inX) {
    X <- as.matrix(result$scaleX(result, inX))
    Y <- inY
    if (result$isScaled == TRUE) {
      Y <- result$scaleY(result, as.matrix(inY))
    }
    Y <- as.numeric(as.matrix(Y))

    df <- result$nu
    xMu <- as.numeric(X %*% as.matrix(result$B))
    # diagonal of s^2 (I + X V X') without forming the n x n matrix
    pointSd <- sqrt(result$sSqr * (1 + rowSums((X %*% result$V) * X)))

    logP <- dt((Y - xMu) / pointSd, df=df, log=TRUE) - log(pointSd)
    P <- exp(logP)

    lppd <- sum(logP)
    # compute the waic
    mp1 <- log(1/length(Y) * sum(P))
    mp2 <- 1/length(Y) * sum(logP)
    waic <- 2 * length(Y) * sum(mp1 - mp2)
    out <- list()
    out$lppd <- lppd
    out$eppd <- lppd - waic
    out$waic <- waic
    out
  }

  ## given two models
  ## evaluate their density using the "WAIC - widely applicable information criteria"
  ## we estimate for Y_i using the univariate t density
  ## the estimated log predictive density is used
  ## this is approximately eppd = sum log(1/N sum p(y_i|theta))
  ## waic is used to adjust for bias
  ## Each model is scored with its own scaling, coefficients and df.
  result$WAIC <- function(resultA, resultB, inY, inX) {
    # mean log density and waic adjustment of one model on (inY, inX)
    score <- function(model) {
      X <- as.matrix(model$scaleX(model, inX))
      Y <- inY
      if (model$isScaled == TRUE) {
        Y <- model$scaleY(model, as.matrix(inY))
      }
      Y <- as.numeric(as.matrix(Y))
      xMu <- as.numeric(X %*% as.matrix(model$B))
      pointSd <- sqrt(model$sSqr * (1 + rowSums((X %*% model$V) * X)))
      logP <- dt((Y - xMu) / pointSd, df=model$nu, log=TRUE) - log(pointSd)
      p <- exp(logP)
      mp1 <- log(sum(1/length(p) * p))
      mp2 <- 1/length(p) * sum(logP)
      list(eppd = sum(1/(length(p)) * logP), waic = 2 * sum(mp1 - mp2))
    }

    a <- score(resultA)
    b <- score(resultB)

    out <- list()
    out$modelAeppd1 <- a$eppd
    out$modelAwaic <- a$waic
    out$modelAadjeppd1 <- a$eppd - a$waic
    out$modelBeppd1 <- b$eppd
    out$modelBwaic <- b$waic
    out$modelBadjeppd1 <- b$eppd - b$waic
    out
  }

  # return the residuals for each row of the predictedY matrix from the original data
  # predictedY holds one simulation per row and one column per item in Y
  # (single target column assumed). targetY, when given, replaces the
  # training Y and is put on the model's scale first if the model is scaled.
  result$predictedResiduals <- function(result, predictedY, targetY= NULL) {
    if (is.null(targetY)) {
      target <- result$Y
    } else if (result$isScaled) {
      # use the model's centre/scale so targets and predictions share an axis
      target <- (as.numeric(as.matrix(targetY)) - result$ycenter) / result$yscale
    } else {
      target <- targetY
    }
    # return rows of residuals for each row in predictedY
    sweep(as.matrix(predictedY), 2, as.numeric(as.matrix(target)), "-")
  }


  # plot the standardised residuals
  # for the attributes (columns of predictedY), these should
  # be normally distributed with sd = 1

  # this is a visual guide in assessing the predictive accuracy
  # of the model compared against known data set.
  # Draws column j (j >= 2) against column 1 on a rows x cols grid and
  # returns the previous par() settings.
  result$plotPredictedResiduals <- function(result, predictedY, rows, cols) {
    pr <- result$predictedResiduals(result, predictedY)
    prs <- t(scale(t(pr)))

    old <- par(mfrow=c(rows,cols))
    # make sure the layout is restored even if plotting fails
    on.exit(par(old), add = TRUE)

    # columns 2..ncol; empty when there is only one column
    panels <- seq_len(ncol(predictedY))[-1]
    for (j in panels) {
      plot(prs[,j]~prs[,1], xlab="residual 1", ylab=paste("residual", j))
      abline(0,1, col="red")
    }

    par(old)
    if (length(panels) > 0) {
      n <- nrow(predictedY)
      title(main=paste("standardised residuals from", n,"simulations"))
    }
    old
  }

  ## find the confidence interval around the mean for the upper and lower tail
  ## we can compute posterior quantiles analytically
  # from the multivariate t distribution
  # Returns mean/upper/lower on the model's scale plus unscaled* copies,
  # which equal the former when the model is not scaled.
  # NOTE(review): tail = "upper.tail" with alpha/2 is kept from the original.
  result$yPredictiveConfidence <- function(result, newX, alpha) {
    newX <- as.matrix(result$scaleX(result, newX))
    df <- result$nu
    xMu <- newX %*% result$B
    var <- newX %*% result$V %*% t(newX)
    I <- diag(x=1, nrow=nrow(var), ncol=ncol(var))
    s <- result$sSqr * (I + var)
    a <- alpha/2
    # quantile of the centred distribution; the centre is added below
    q <- qmvt(a, df=df, delta=rep(0, nrow(s)), sigma=s, tail="upper.tail")
    conf <- list()
    conf$mean <- xMu
    conf$upper <- as.numeric(xMu) + (q$quantile)
    conf$lower <- as.numeric(xMu) - (q$quantile)

    # return also the unscaled parameters
    if (result$isScaled == TRUE) {
      conf$unscaledMean <- conf$mean*result$yscale + result$ycenter
      conf$unscaledUpper <- conf$upper*result$yscale + result$ycenter
      conf$unscaledLower <- conf$lower*result$yscale + result$ycenter
    } else {
      conf$unscaledMean <- conf$mean
      conf$unscaledUpper <- conf$upper
      conf$unscaledLower <- conf$lower
    }

    conf
  }

  result
}