genomicanalysis
diff --git a/‎R/pgx-api.R
Lines changed: 5 additions & 4 deletions b/‎R/pgx-api.R
Lines changed: 5 additions & 4 deletions
diff --git a/‎R/pgx-correct.R
Lines changed: 12 additions & 16 deletions b/‎R/pgx-correct.R
Lines changed: 12 additions & 16 deletions
diff --git a/‎R/pgx-init.R
Lines changed: 26 additions & 16 deletions b/‎R/pgx-init.R
Lines changed: 26 additions & 16 deletions
@@ -125,13 +125,14 @@ pgx.getMarkerGenes <- function(pgx, n=10, dir=0, sym=FALSE, filt=NULL) {
 pgx.getFamilies <- function(pgx, nmin=10, extended=FALSE) {
     if(extended) {
         fam <- grep("^[<].*|^FAMILY|^TISSUE|^COMPARTMENT|^CELLTYPE|^GOCC|^DISEASE|^CUSTOM",
-                    names(GSETS),value=TRUE)
-        fam <- grep("^[<].*|^FAMILY|^COMPARTMENT|^CUSTOM",names(GSETS),value=TRUE)
+                    names(iGSETS),value=TRUE)  ## NOTE(review): this broader match is overwritten by the next assignment
+        fam <- grep("^[<].*|^FAMILY|^COMPARTMENT|^CUSTOM",names(iGSETS),value=TRUE)  ## effective 'extended' selection of set names
     } else {
-        fam <- grep("^[<].*|^FAMILY|^CUSTOM",names(GSETS),value=TRUE)
+        fam <- grep("^[<].*|^FAMILY|^CUSTOM",names(iGSETS),value=TRUE)  ## default selection: names starting with '<', FAMILY or CUSTOM
     }
     xgenes <- toupper(rownames(pgx$X))
     xgenes <- toupper(pgx$genes$gene_name)
-    jj <- which(sapply(GSETS[fam],function(x) sum(x %in% xgenes)) >= nmin)
+    gg <- getGSETS(fam)  ## fetch the member lists of the selected sets by name
+    jj <- which(sapply(gg,function(x) sum(x %in% xgenes)) >= nmin)  ## keep sets with at least nmin members present in xgenes
     sort(fam[jj])
 }
@@ -323,14 +323,14 @@ pgx.superBatchCorrect <- function(X, pheno, model.par, partype=NULL,
             n.sv
         }
         cX1 <- Matrix::head(cX[order(-apply(cX,1,sd)),],1000) ## top 1000 genes only (faster)
-        sv <- try( sva(cX1, mod1x, mod0=mod0x, n.sv=n.sv)$sv )
+        sv <- try( sva::sva(cX1, mod1x, mod0=mod0x, n.sv=n.sv)$sv )
         ##sv <- SmartSVA::smartsva.cpp(cX, mod1x, mod0=mod0x, n.sv=n.sv)$sv
         if(any(class(sv)=="try-error")) {
             ## try again with little bit of noise...
             a <- 0.01*mean(apply(cX,1,sd))
             cX1 <- cX + a*matrix(rnorm(length(cX)),nrow(cX),ncol(cX))
             cX1 <- Matrix::head(cX1[order(-apply(cX1,1,sd)),],1000) ## top 1000 genes only (faster)
-            sv <- try( sva(cX1, mod1x, mod0=mod0x, n.sv=pmax(n.sv-1,1))$sv )
+            sv <- try( sva::sva(cX1, mod1x, mod0=mod0x, n.sv=pmax(n.sv-1,1))$sv )
         }
         if(!any(class(sv)=="try-error")) {
             message("[pgx.superBatchCorrect] Performing SVA correction...")
@@ -509,7 +509,7 @@ pgx.PC_correlation <- function(X, pheno, nv=3, stat="F", plot=TRUE, main=NULL) {
         tt0   <- c("PC correlation","PC variation")[1 + 1*(stat=="F")]
         if(is.null(main)) main <- tt0
         ## R <- R[,ncol(R):1]
-        plt <- ggpubr::ggbarplot(t(R), ylab=stat0, srt=45, group.name="") +
+        plt <- plot.ggbarplot(t(R), ylab=stat0, srt=45, group.name="") +
             ## ggplot2::theme(
             ##     legend.key.size = grid::unit(0.65,"lines"),
             ##     legend.key.height = grid::unit(0.35,"lines"),
@@ -683,7 +683,7 @@ pgx.performBatchCorrection <- function(ngs, zx, batchparams,
                 ## mod1 <- ngs$model.parameters$design
                 mod0 <- cbind(mod1[,1])
                 ##mod0 = model.matrix( ~ 1, data=ngs$samples)
-                sv <- sva( 0.0001+zx, mod1, mod0, n.sv=NULL)$sv
+                sv <- sva::sva( 0.0001+zx, mod1, mod0, n.sv=NULL)$sv
                 ##sv <- svaseq( 2**zx, mod1, mod0, n.sv=NULL)$sv
                 zx <- limma::removeBatchEffect(zx, covariates=sv, design=mod1)
             } else if(batchpar=="<NNM>") {
@@ -778,7 +778,7 @@ pgx.removeBatchEffect <- function(X, batch, model.vars=NULL,
         X <- limma::removeBatchEffect(X, batch=batch0)
     } else if(method=="ComBat") {
 
-        X <- ComBat(X, batch = batch0)
+        X <- sva::ComBat(X, batch = batch0)
     } else if(method=="BMC") {
         ## batch mean center
         matlist <- tapply(1:ncol(X), batch0, function(i) X[,i,drop=FALSE])
@@ -885,13 +885,10 @@ pgx.computeBiologicalEffects <- function(X, is.count=FALSE)
     return(pheno)
 }
 
-nmax=-1
+##nmax=-1
 pgx.svaCorrect <- function(X, pheno, nmax=-1) {
     ## 
-    ## IK: not sure about this SVA correction stuff... 
-    
-    
-
+    ## IK: not sure about this SVA correction stuff...       
     if(NCOL(pheno)==1) {
         pheno <- data.frame(pheno=pheno)
     }
@@ -915,24 +912,23 @@ pgx.svaCorrect <- function(X, pheno, nmax=-1) {
     ##df <- data.frame(var=y)    
     ##mod1x = model.matrix( ~var, data=df)
     ##mod0x = model.matrix( ~1, data=df)
-
     mod1x <- cbind(1, mod1)
     mod0x <- mod1x[,1,drop=FALSE]
     ##mod0 = NULL
 
     message("Estimating number of surrogate variables...")
     if(0) {
         ## original method using SVA
-        n.sv = num.sv(X, mod1x, method="be")
+        n.sv = sva::num.sv(X, mod1x, method="be")
         n.sv            
     } else {
         ## fast method using SmartSVA
         ##X.r <- t(resid(lm(t(X) ~ var, data=df)))
         pp <- paste0(colnames(pheno),collapse="+")
         pp
         lm.expr <- paste0("lm(t(X) ~ ",pp,", data=pheno)")
-        X.r <- t(resid(eval(parse(text=lm.expr))))        
-        n.sv <- EstDimRMT(X.r, FALSE)$dim + 1
+        X.r <- t(stats::resid(eval(parse(text=lm.expr))))        
+        n.sv <- isva::EstDimRMT(X.r, FALSE)$dim + 1
         n.sv
     }
     n.sv <- min(n.sv, min(table(y)))
@@ -943,7 +939,7 @@ pgx.svaCorrect <- function(X, pheno, nmax=-1) {
     if(nmax>0) {
         vX = Matrix::head(X[order(-apply(X,1,sd)),],nmax)
     }
-    sv <- sva(vX, mod1x, mod0x, n.sv=n.sv)$sv
+    sv <- sva::sva(vX, mod1x, mod0x, n.sv=n.sv)$sv
     ##sv <- SmartSVA::smartsva.cpp(X, mod1x, mod0=mod0x, n.sv=n.sv)$sv
 
     message("Perform batch correction...")
@@ -1056,7 +1052,7 @@ pgx._runComputeNumSig <- function(ngs, parcomb, contrast, resample=-1,
             mod0 = cbind(mod1[,1])
             logcpm <- edgeR::cpm(ngs$counts, log=TRUE) ## perform SVA on logCPM
             log <- capture.output({
-                suppressWarnings(sv <- sva(logcpm, mod1, mod0, n.sv=NULL)$sv)
+                suppressWarnings(sv <- sva::sva(logcpm, mod1, mod0, n.sv=NULL)$sv)
                 suppressWarnings(aX <- limma::removeBatchEffect( logcpm, covariates=sv, design=mod1))
             })
             dim(aX)
 
@@ -9,7 +9,11 @@
 ##-----------------------------------------------------------------------------
 ## GLOBAL variables
 ##-----------------------------------------------------------------------------
-
+if(0) {
+    RDIR='~/Playground/omicsplayground/R'
+    FILES='~/Playground/omicsplayground/lib'
+    source(file.path(RDIR,"pgx-include.R"),local=TRUE)  ## pass local vars
+}
 source(file.path(RDIR,"pgx-functions.R"),local=TRUE)  ## pass local vars
 
 ## Caching the init files
@@ -20,23 +24,24 @@ INIT.FILE
 
 file.exists(INIT.FILE)
 
-init.start_time = Sys.time()
-
 if(1 && file.exists(INIT.FILE)) {    
 
     message("[INIT] loading cached INIT file ",INIT.FILE)
-    t <- Sys.time()
+    t0 <- Sys.time()  ## start of the cache-load timer
     load(INIT.FILE, verbose=1)
-    message("Loading cache took: ", round(Sys.time() - t), " seconds")
+    message("Loading cache took: ", round(as.numeric(difftime(Sys.time(), t0, units="secs"))), " seconds")  ## force seconds: a bare difftime auto-switches to mins/hours
 
 } else {
 
     message("[INIT] no INIT file! building INIT from scratch.")
+    message("[INIT] INIT.FILE = ", INIT.FILE)    
+    t0 <- Sys.time()
+
     oldvars <- ls()
 
     ## All gene families in Human UPPER CASE    
     GENE.TITLE  = unlist(as.list(org.Hs.eg.db::org.Hs.egGENENAME))
-    GENE.SYMBOL = unlist(as.list(org.hs.eg.db::org.Hs.egSYMBOL))
+    GENE.SYMBOL = unlist(as.list(org.Hs.eg.db::org.Hs.egSYMBOL))
     names(GENE.TITLE) = GENE.SYMBOL
     ##GSET.PREFIX.REGEX = paste(paste0("^",GSET.PREFIXES,"_"),collapse="|")
     GSET.PREFIX.REGEX="^BIOCARTA_|^C2_|^C3_|^C7_|^CHEA_|^GOBP_|^GOCC_|^GOMF_|^HALLMARK_|^KEA_|^KEGG_|^PID_|^REACTOME_|^ST_"
@@ -49,7 +54,6 @@ if(1 && file.exists(INIT.FILE)) {
     GSETS = gmt.all;remove(gmt.all)
 
     message("[INIT] parsing gene families...")
-
     FAMILIES <- pgx.getGeneFamilies(GENE.SYMBOL, FILES=FILES, min.size=10, max.size=9999)
     fam.file <- file.path(FILES,"custom-families.gmt")
     if(file.exists(fam.file)) {
@@ -63,11 +67,20 @@ if(1 && file.exists(INIT.FILE)) {
     names(f1) <- sub("FAMILY:<all>","<all>",names(f1))
     GSETS <- c(GSETS,f1)
 
+    ## convert to integer list (more efficient): each set is stored as indices into GSET.GENES
+    message("[INIT] converting GSETS to list of integers...")
+    GSET.GENES <- sort(unique(unlist(GSETS)))  ## slow... sorted vector of all distinct set members
+    iGSETS <- parallel::mclapply(GSETS, function(a) match(a,GSET.GENES), mc.cores = if(.Platform$OS.type=="windows") 1L else getOption("mc.cores", 2L))  ## slow... mclapply errors on Windows when mc.cores > 1
+    names(iGSETS) <- names(GSETS)
+    getGSETS <- function(gs) {  ## gs: names (or indices) of sets in iGSETS; returns a list of member vectors
+        lapply(iGSETS[gs],function(i) GSET.GENES[i])  ## map the integer codes back to the entries of GSET.GENES
+    }
+        
     message("[INIT] parsing collections...")
     COLLECTIONS <- pgx.getGeneSetCollections(names(GSETS), min.size=10, max.size=99999)
     COLLECTIONS <- COLLECTIONS[order(names(COLLECTIONS))]
 
-    remove(list=c("custom.gmt","f1"))
+    remove(list=c("custom.gmt","f1","GSETS"))
 
     ##-----------------------------------------------------------------------------
     ## TISSUE/REFERENCE data sets
@@ -100,8 +113,7 @@ if(1 && file.exists(INIT.FILE)) {
     ##-----------------------------------------------------------------------------
     ## Colors
     ##-----------------------------------------------------------------------------
-    
-    
+        
     COLORS = rep(RColorBrewer::brewer.pal(8,"Set2"),99)
     COLORS = rep(c(ggsci::pal_npg("nrc", alpha = 0.7)(10),
                    ggsci::pal_aaas("default", alpha = 0.7)(10),
@@ -117,14 +129,12 @@ if(1 && file.exists(INIT.FILE)) {
     newvars <- setdiff(ls(), oldvars)
     newvars
 
-    ## message("[INIT] saving INIT file ", INIT.FILE)    
-    ## save( list=newvars, file=INIT.FILE)    
+    message("Creating global init took: ", round(as.numeric(difftime(Sys.time(), t0, units="secs"))), " seconds")  ## force seconds: a bare difftime auto-switches to mins/hours
+    message("[INIT] saving INIT file ", INIT.FILE)    
+    save( list=newvars, file=INIT.FILE)  ## cache every variable created since 'oldvars' was recorded
+    
 }
 
-init.load_time = round( Sys.time() - init.start_time, digits=3)
-message("[INIT] init load time = ",init.load_time, " ",attr(init.load_time,"units"))
-
-
 pgx.initialize <- function(pgx) {
 
     ##---------------------------------------------------------------------