diff --git a/DESCRIPTION b/DESCRIPTION index 9a38c90..134477e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: projectR Type: Package Title: Functions for the projection of weights from PCA, CoGAPS, NMF, correlation, and clustering -Version: 1.19.2 +Version: 1.23.1 Author: Gaurav Sharma, Charles Shin, Jared Slosberg, Loyal Goff, Genevieve Stein-O'Brien Authors@R: c( person("Gaurav", "Sharma", role = c("aut")), @@ -15,6 +15,7 @@ Authors@R: c( Description: Functions for the projection of data into the spaces defined by PCA, CoGAPS, NMF, correlation, and clustering. License: GPL (==2) Imports: + SingleCellExperiment, methods, cluster, stats, @@ -50,7 +51,7 @@ Suggests: SeuratObject LazyData: TRUE LazyDataCompression: gzip -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Encoding: UTF-8 VignetteBuilder: knitr biocViews: FunctionalPrediction, GeneRegulation, BiologicalQuestion, Software diff --git a/NAMESPACE b/NAMESPACE index 273c371..5ffadf6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,6 +21,7 @@ exportClasses(correlateR) exportClasses(rotatoR) import(MatrixModels) import(RColorBrewer) +import(SingleCellExperiment) import(cluster) import(dplyr) import(fgsea) diff --git a/R/projectR.R b/R/projectR.R index 0a415e0..3e8b9eb 100644 --- a/R/projectR.R +++ b/R/projectR.R @@ -9,7 +9,6 @@ setOldClass("prcomp") #' @param NP vector of integers indicating which columns of loadings object to use. The default of NP=NA will use entire matrix. #' @param full logical indicating whether to return the full model solution. By default only the new pattern object is returned. #' @param model Optional arguements to choose method for projection -#' @param family VGAM family function for model fitting (default: "gaussianff") #' @param bootstrapPval logical to indicate whether to generate p-values using bootstrap, not available for prcomp and rotatoR objects #' @param bootIter number of bootstrap iterations, default = 1000 #' @rdname projectR-methods @@ -21,13 +20,12 @@ setMethod("projectR",signature(data="matrix",loadings="matrix"),function( loadingsNames = NULL, # a vector with names of loadings rows NP=NA, # vector of integers indicating which columns of loadings object to use. The default of NP=NA will use entire matrix. full=FALSE, # logical indicating whether to return the full model solution. By default only the new pattern object is returned. - family="gaussianff", # VGAM family function (default: "gaussianff") bootstrapPval=FALSE, # logical to indicate whether to generate p-values using bootstrap bootIter=1e3 # No of bootstrap iterations ){ ifelse(!is.na(NP),loadings<-loadings[,NP],loadings<-loadings) - #if(!is.na(NP)){loadings<-loadings[,NP]} was giving warning with subset of patterns + #match genes in data sets if(is.null(dataNames)){ dataNames <- rownames(data) @@ -45,16 +43,8 @@ setMethod("projectR",signature(data="matrix",loadings="matrix"),function( projectionPatterns <- t(projection$coefficients) projection.ts<-t(projection$coefficients/projection$stdev.unscaled/projection$sigma) - #projection<-vglm(dataM$data2 ~ 0 + dataM$data1,family=family) - #projectionPatterns<-coefvlm(projection,matrix.out=TRUE) - - #For VGAM - #pval.matrix<-matrix(2*pnorm(-abs(summary(projection)@coef3[,3])),nrow=5,byrow=TRUE) - #For limma pval.matrix<-2*pnorm(-abs(projection.ts)) - #colnames(pval.matrix)<-colnames(projectionPatterns) - #rownames(pval.matrix)<-rownames(projectionPatterns) if(bootstrapPval){ boots <- lapply(1:bootIter,function(x){ @@ -66,7 +56,6 @@ setMethod("projectR",signature(data="matrix",loadings="matrix"),function( } if(full & bootstrapPval){ - #projectionFit <- list('projection'=projectionPatterns, 'fit'=projection,'pval'=pval.matrix) projectionFit <- list('projection'=projectionPatterns, 'pval'=pval.matrix, 'bootstrapPval' = bootPval) return(projectionFit) } else if(full){ @@ -82,7 +71,6 @@ setMethod("projectR",signature(data="matrix",loadings="matrix"),function( #' @param NP vector of integers indicating which columns of loadings object to use. The default of NP=NA will use entire matrix. #' @param full logical indicating whether to return the full model solution. By default only the new pattern object is returned. #' @param model Optional arguements to choose method for projection -#' @param family VGAM family function for model fitting (default: "gaussianff") #' @rdname projectR-methods #' @aliases projectR setMethod("projectR",signature(data="dgCMatrix",loadings="matrix"),function( @@ -90,52 +78,54 @@ setMethod("projectR",signature(data="dgCMatrix",loadings="matrix"),function( loadings, # a matrix of continous values to be projected with unique rownames dataNames = NULL, # a vector with names of data rows loadingsNames = NULL, # a vector with names of loadings rows - NP=NA, # vector of integers indicating which columns of loadings object to use. The default of NP=NA will use entire matrix. - full=FALSE, # logical indicating whether to return the full model solution. By default only the new pattern object is returned. - family="gaussianff" # VGAM family function (default: "gaussianff") + NP=NULL, # vector of integers indicating which columns of loadings object to use. The default of NP=NA will use entire matrix. + full=FALSE # logical indicating whether to return the full model solution. By default only the new pattern object is returned. ){ - ifelse(!is.na(NP),loadings<-loadings[,NP],loadings<-loadings) - #if(!is.na(NP)){loadings<-loadings[,NP]} was giving warning with subset of patterns - #match genes in data sets - if(is.null(dataNames)){ - dataNames <- rownames(data) + if(!is.null(NP)) { + loadings<-loadings[,NP] } - if(is.null(loadingsNames)){ - loadingsNames <- rownames(loadings) - } - dataM<-geneMatchR(data1=data, data2=loadings, data1Names=dataNames, data2Names=loadingsNames, merge=FALSE) - print(paste(as.character(dim(dataM[[2]])[1]),'row names matched between data and loadings')) - print(paste('Updated dimension of data:',as.character(paste(dim(dataM[[2]]), collapse = ' ')))) - # do projection - Design <- model.matrix(~0 + dataM[[1]]) - colnames(Design) <- colnames(dataM[[1]]) - projection <- MatrixModels:::lm.fit.sparse(t(dataM[[2]]),dataM[[1]]) - projectionPatterns <- t(projection$coefficients) - projection.ts<-t(projection$coefficients/projection$stdev.unscaled/projection$sigma) - - #projection<-vglm(dataM$data2 ~ 0 + dataM$data1,family=family) - #projectionPatterns<-coefvlm(projection,matrix.out=TRUE) - - #For VGAM - #pval.matrix<-matrix(2*pnorm(-abs(summary(projection)@coef3[,3])),nrow=5,byrow=TRUE) - #For limma - pval.matrix<-2*pnorm(-abs(projection.ts)) - #colnames(pval.matrix)<-colnames(projectionPatterns) - #rownames(pval.matrix)<-rownames(projectionPatterns) + print("dgCMatrix detected, projecting in chunks.") + #columns of dgcMatrix are LHS for stats::lm, and columns of loadings are the + #dense RHS (predictors). sometimes dgcMatrix is too big to fit RAM, so we + #just fit chunks of lm models as supported by stats::lm/limma::lmFit + + chop <- function(sparsematrix) { + coln <- ncol(sparsematrix) + bins <- seq(1, coln, by = 1000) + lapply(seq_along(bins), function(i) { + start <- bins[i] + end <- ifelse(i < length(bins), bins[i + 1] - 1, coln) + return(start:end) + }) + } + #discard print statements projectR generates each time a chunk is called + w <- invisible(capture.output( + projectionList <- lapply(chop(data), function(i) { + projectR(as.matrix(data[,i]), loadings, full=full) + }) + )) + #since chopping by columns, it's enough to print matching rows only once + print(w[1]) - if(full==TRUE){ - #projectionFit <- list('projection'=projectionPatterns, 'fit'=projection,'pval'=pval.matrix) - projectionFit <- list('projection'=projectionPatterns, 'pval'=pval.matrix) - return(projectionFit) + if(full==TRUE) { + if(length(projectionList)==1) {#if only one chunk + res <- projectionList[[1]] + } else { + projectionFit <- lapply(projectionList, function(x) do.call(cbind, x)) + res <- projectionFit + } + } else { + res <- do.call(cbind, projectionList) } - else{return(projectionPatterns)} + return(res) }) ####################################################################################################################################### #' @import limma +#' @import SingleCellExperiment #' @importFrom NMF fcnnls #' @examples #' library("CoGAPS") @@ -153,7 +143,6 @@ setMethod("projectR",signature(data="matrix",loadings="LinearEmbeddingMatrix"),f NP=NA, # vector of integers indicating which columns of loadings object to use. The default of NP=NA will use entire matrix. full=FALSE, # logical indicating whether to return the full model solution. By default only the new pattern object is returned. model=NA, # optional arguements to choose method for projection - family="gaussianff", # VGAM family function (default: "gaussianff") bootstrapPval=FALSE, # logical to indicate whether to generate p-values using bootstrap bootIter=1e3 # No of bootstrap iterations ){ @@ -256,10 +245,8 @@ setMethod("projectR",signature(data="matrix",loadings="rotatoR"),function( Eigenvalues<-eigen(cov(projectionPatterns))$values PercentVariance<-round(Eigenvalues/sum(Eigenvalues) * 100, digits = 2) - #PercentVariance<-apply(projectionPatterns,2, function(x) 100*var(x)/sum(apply(p2P,2,var))) - - projectionFit <- list(t(projectionPatterns), PercentVariance) - return(projectionFit) + projectionFit <- list(t(projectionPatterns), PercentVariance) + return(projectionFit) } else{return(t(projectionPatterns))} diff --git a/man/projectR-methods.Rd b/man/projectR-methods.Rd index e0b1247..5f553fe 100644 --- a/man/projectR-methods.Rd +++ b/man/projectR-methods.Rd @@ -23,7 +23,6 @@ projectR(data, loadings, dataNames = NULL, loadingsNames = NULL, ...) loadingsNames = NULL, NP = NA, full = FALSE, - family = "gaussianff", bootstrapPval = FALSE, bootIter = 1000 ) @@ -33,9 +32,8 @@ projectR(data, loadings, dataNames = NULL, loadingsNames = NULL, ...) loadings, dataNames = NULL, loadingsNames = NULL, - NP = NA, - full = FALSE, - family = "gaussianff" + NP = NULL, + full = FALSE ) \S4method{projectR}{matrix,LinearEmbeddingMatrix}( @@ -46,7 +44,6 @@ projectR(data, loadings, dataNames = NULL, loadingsNames = NULL, ...) NP = NA, full = FALSE, model = NA, - family = "gaussianff", bootstrapPval = FALSE, bootIter = 1000 ) @@ -129,8 +126,6 @@ projectR(data, loadings, dataNames = NULL, loadingsNames = NULL, ...) \item{full}{logical indicating whether to return the full model solution. By default only the new pattern object is returned.} -\item{family}{VGAM family function for model fitting (default: "gaussianff")} - \item{bootstrapPval}{logical to indicate whether to generate p-values using bootstrap, not available for prcomp and rotatoR objects} \item{bootIter}{number of bootstrap iterations, default = 1000} diff --git a/tests/testthat/test_projectR.R b/tests/testthat/test_projectR.R index 3acd3d4..46ddfcf 100644 --- a/tests/testthat/test_projectR.R +++ b/tests/testthat/test_projectR.R @@ -200,4 +200,28 @@ test_that("results are correctly formatted for P value mode",{ }) +test_that("projection works on sparse data matrix", { + dense <- as.matrix(p.RNAseq6l3c3t) + sparse <- as(p.RNAseq6l3c3t, "sparseMatrix") + loadings <- CR.RNAseq6l3c3t@featureLoadings + expect_true("dgCMatrix" %in% class(sparse)) + expect_no_error(projectR(sparse, loadings)) + + pdense <- projectR(dense, loadings) + psparse <- projectR(sparse, loadings) + expect_identical(pdense, psparse) +}) + +test_that("projection works on sparse data matrix with full=TRUE", { + dense <- as.matrix(p.RNAseq6l3c3t) + sparse <- as(p.RNAseq6l3c3t, "sparseMatrix") + loadings <- CR.RNAseq6l3c3t@featureLoadings + + expect_true("dgCMatrix" %in% class(sparse)) + expect_no_error(projectR(sparse, loadings)) + + pdense <- projectR(dense, loadings, full=TRUE) + psparse <- projectR(sparse, loadings, full=TRUE) + expect_identical(pdense, psparse) +}) \ No newline at end of file