v1.0.0

First version on GitHub
PF2-pasteur-fr · Dec 10, 2014 · 401ee90 · 401ee90
1 parent f1b943a
commit 401ee90
Show file tree

Hide file tree

Showing 100 changed files with 36,558 additions and 1 deletion.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,14 @@
+Package: SARTools
+Type: Package
+Title: Statistical Analysis of RNA-Seq Tools
+Version: 1.0.0
+Date: 2014-12-10
+Author: Marie-Agnes Dillies and Hugo Varet
+Maintainer: Hugo Varet <[email protected]>
+Depends: R (>= 3.1.0), DESeq2 (>= 1.6.0), edgeR (>= 3.8.0), xtable
+Suggests: genefilter (>= 1.44.0), BiocStyle
+Imports: knitr, GenomicRanges, S4Vectors, limma
+VignetteBuilder: knitr
+Encoding: latin1
+Description: SARTools provides R tools and an environment for the statistical analysis of RNA-Seq projects: load and clean data, produce figures, perform statistical analysis/testing with DESeq2 or edgeR, export results and create final report
+License: GPL-2
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,11 @@
+# Generated by roxygen2 (4.0.2): do not edit by hand
+
+exportPattern("^[a-zA-Z]")
+import(DESeq2)
+import(edgeR)
+import(xtable)
+importFrom(GenomicRanges,assay)
+importFrom(GenomicRanges,colData)
+importFrom(S4Vectors,mcols)
+importFrom(knitr,knit2html)
+importFrom(limma,plotMDS)
diff --git a/NEWS b/NEWS
@@ -0,0 +1,18 @@
+CHANGES IN VERSION 1.0.0
+------------------------
+	o First version publicly available on GitHub
+
+CHANGES IN VERSION 0.99.6
+-------------------------
+	o Added batch argument to loadTargetFile()
+	o Simplified the R script templates with wrappers
+
+CHANGES IN VERSION 0.99.5
+-------------------------
+	o Now check the format/validity of the parameters at the beginning of the script
+	o Added functions checkParameters.DESeq2() and checkParameters.edgeR()
+	o Added small precisions in the vignette
+
+CHANGES IN VERSION 0.99.4
+-------------------------
+	o First version of the package
diff --git a/R/BCVPlot.R b/R/BCVPlot.R
@@ -0,0 +1,13 @@
+#' BCV plot (for edgeR dispersions)
+#'
+#' Biological Coefficient of Variation plot (for edgeR objects)
+#'
+#' @param dge a \code{DGEList} object
+#' @return A file named BCV.png in the figures directory with a BCV plot produced by the \code{plotBCV()} function of the edgeR package
+#' @author Marie-Agnes Dillies and Hugo Varet
+
+BCVPlot <- function(dge){	
+  png(filename="figures/BCV.png", width=400, height=400)
+    plotBCV(dge, las = 1, main = "BCV plot")
+  dev.off()	
+}
diff --git a/R/MAPlot.R b/R/MAPlot.R
@@ -0,0 +1,33 @@
+#' MA-plots
+#'
+#' MA-plot for each comparison: log2(FC) vs mean of normalized counts with one dot per feature (red dot for a differentially expressed feature, black dot otherwise)
+#'
+#' @param complete A \code{list} of \code{data.frame} containing features results (from \code{exportResults.DESeq2()} or \code{exportResults.edgeR()})
+#' @param alpha cut-off to apply on each adjusted p-value
+#' @return A file named MAPlot.png in the figures directory containing one MA-plot per comparison
+#' @author Marie-Agnes Dillies and Hugo Varet
+
+MAPlot <- function(complete, alpha=0.05){
+  nrow <- ceiling(sqrt(length(complete)))
+  ncol <- ceiling(length(complete)/nrow)
+  png(filename="figures/MAPlot.png", width=400*max(ncol,nrow), height=400*min(ncol,nrow))
+    par(mfrow=sort(c(nrow,ncol)))
+      for (name in names(complete)){
+        complete.name <- complete[[name]]
+        complete.name <- complete.name[complete.name$baseMean>0,]
+        complete.name$padj <- ifelse(is.na(complete.name$padj),1,complete.name$padj)
+        log2FC <- complete.name$log2FoldChange
+        ylim = 1.1 * c(-1,1) * quantile(abs(log2FC[is.finite(log2FC)]), probs=0.99)
+        plot(complete.name$baseMean, pmax(ylim[1], pmin(ylim[2], log2FC)),
+	         log = "x", cex=0.45, las = 1, ylim = ylim,
+             col = ifelse(complete.name[,"padj"] < alpha, "red", "black"),
+             pch = ifelse(log2FC<ylim[1], 6, ifelse(log2FC>ylim[2], 2, 20)),
+		     xlab = "Mean of normalized counts",
+		     ylab = expression(log[2]~fold~change),
+  		     main = paste0("MA-plot - ",gsub("_"," ",name)))
+        abline(h=0, lwd=1, col="lightgray")
+      }
+  dev.off()
+}
+
+
diff --git a/R/MDSPlot.R b/R/MDSPlot.R
@@ -0,0 +1,25 @@
+#' MDS plot (for edgeR objects)
+#'
+#' Multi-Dimensional Scaling plot of samples based on the 500 most variant features (for edgeR analyses)
+#'
+#' @param dge a \code{DGEList} object
+#' @param group vector of the condition from which each sample belongs
+#' @param n number of features to keep among the most variant
+#' @param gene.selection \code{"pairwise"} to choose the top features separately for each pairwise comparison between the samples or \code{"common"} to select the same features for all comparisons. Only used when \code{method="logFC"}
+#' @param col colors to use (one per biological condition)
+#' @return A file named MDS.png in the figures directory
+#' @author Marie-Agnes Dillies and Hugo Varet
+
+MDSPlot <- function(dge, group, n=500, gene.selection=c("pairwise", "common"),
+                    col=c("lightblue","orange","MediumVioletRed","SpringGreen")){
+  png(filename="figures/MDS.png", width=400, height=400)
+    coord <- plotMDS(dge, top=n, method="logFC", gene.selection=gene.selection[1])
+    abs=range(coord$x); abs=abs(abs[2]-abs[1])/25;
+    ord=range(coord$y); ord=abs(ord[2]-ord[1])/25;
+    plot(coord$x,coord$y, col=col[as.integer(group)], las=1, main="Multi-Dimensional Scaling plot",
+         xlab="Leading logFC dimension 1", ylab="Leading logFC dimension 2", cex=2, pch=16)
+    abline(h=0,v=0,lty=2,col="lightgray")
+    text(coord$x - ifelse(coord$x>0,abs,-abs), coord$y - ifelse(coord$y>0,ord,-ord),
+         colnames(dge$counts), col=col[as.integer(group)])
+  dev.off()      
+}
diff --git a/R/NAMESPACE.R b/R/NAMESPACE.R
@@ -0,0 +1,9 @@
+#' @exportPattern ^[a-zA-Z]
+#' @import DESeq2
+#' @import edgeR
+#' @import xtable
+#' @importFrom knitr knit2html
+#' @importFrom GenomicRanges colData assay
+#' @importFrom S4Vectors mcols
+#' @importFrom limma plotMDS
+NULL
diff --git a/R/PCAPlot.R b/R/PCAPlot.R
@@ -0,0 +1,49 @@
+#' PCA of samples (if use of DESeq2)
+#'
+#' Principal Component Analysis of samples based on the 500 most variant features on VST- or rlog-counts (if use of DESeq2)
+#'
+#' @param counts.trans a matrix a transformed counts (VST- or rlog-counts)
+#' @param group factor vector of the condition from which each sample belongs
+#' @param n number of features to keep among the most variant
+#' @param col colors to use (one per biological condition)
+#' @return A file named PCA.png in the figures directory with a pairwise plot of the three first principal components
+#' @author Marie-Agnes Dillies and Hugo Varet
+
+PCAPlot <- function(counts.trans, group, n=500, col=c("lightblue","orange","MediumVioletRed","SpringGreen")){
+  # PCA on the 500 most variables features
+  rv = apply(counts.trans, 1, var, na.rm=TRUE)
+  select = order(rv, decreasing = TRUE)[1:n]
+  pca = prcomp(t(counts.trans[select, ]))
+  prp <- pca$sdev^2 * 100 / sum(pca$sdev^2)
+  prp <- round(prp[1:3],2)
+
+  # create figure
+  png(filename="figures/PCA.png",width=400*2,height=400)
+    par(mfrow=c(1,2))
+	# axes 1 et 2
+	abs=range(pca$x[,1]); abs=abs(abs[2]-abs[1])/25;
+    ord=range(pca$x[,2]); ord=abs(ord[2]-ord[1])/25;
+    plot(pca$x[,1], pca$x[,2],
+         las = 1, cex = 2, pch = 16, col = col[as.integer(group)],
+	     xlab = paste0("PC1 (",prp[1],"%)"), 
+	     ylab = paste0("PC2 (",prp[2],"%)"), 
+	     main = "Principal Component Analysis - Axes 1 and 2")
+    abline(h=0,v=0,lty=2,col="lightgray")
+    text(pca$x[,1] - ifelse(pca$x[,1]>0,abs,-abs), pca$x[,2] - ifelse(pca$x[,2]>0,ord,-ord),
+         colnames(counts.trans), col=col[as.integer(group)])
+
+	# axes 1 et 3
+	abs=range(pca$x[,1]); abs=abs(abs[2]-abs[1])/25;
+    ord=range(pca$x[,3]); ord=abs(ord[2]-ord[1])/25;
+    plot(pca$x[,1], pca$x[,3],
+         las = 1, cex = 2, pch = 16, col = col[as.integer(group)],
+	     xlab = paste0("PC1 (",prp[1],"%)"), 
+	     ylab = paste0("PC3 (",prp[3],"%)"), 
+	     main = "Principal Component Analysis - Axes 1 and 3")
+    abline(h=0,v=0,lty=2,col="lightgray")
+    text(pca$x[,1] - ifelse(pca$x[,1]>0,abs,-abs), pca$x[,3] - ifelse(pca$x[,3]>0,ord,-ord),
+         colnames(counts.trans), col=col[as.integer(group)])
+  dev.off()
+
+  return(invisible(pca$x))
+}
diff --git a/R/SARTools-package.r b/R/SARTools-package.r
@@ -0,0 +1,6 @@
+#' SARTools provides R tools and an environment for the statistical analysis of RNA-Seq projects: load and clean data, produce figures, perform statistical analysis/testing with DESeq2 or edgeR, export results and create final report
+#' @title Statistical Analysis of RNA-Seq Tools
+#' @author Marie-Agnes Dillies and Hugo Varet
+#' @docType package
+#' @name SARTools-package
+NULL
diff --git a/R/SERE.r b/R/SERE.r
@@ -0,0 +1,28 @@
+#' Pairwise SERE for two samples
+#'
+#' Compute the SERE coefficient for two samples
+#'
+#' @param observed \code{matrix} with two columns containing observed counts of two samples
+#' @return The SERE coefficient for the two samples
+#' @references Schulze, Kanwar, Golzenleuchter et al, SERE: Single-parameter quality control and sample comparison for RNA-Seq, BMC Genomics, 2012
+#' @author See paper published
+
+SERE <- function(observed){
+  #calculate lambda and expected values
+  laneTotals <- colSums(observed)
+  total <- sum(laneTotals)
+  fullObserved <- observed[rowSums(observed)>0,]
+  fullLambda <- rowSums(fullObserved)/total
+  fullLhat <- fullLambda > 0
+  fullExpected<- outer(fullLambda, laneTotals)
+
+  #keep values
+  fullKeep <- which(fullExpected > 0)
+
+  #calculate degrees of freedom (nrow*(ncol -1) >> number of parameters - calculated (just lamda is calculated >> thats why minus 1)
+  #calculate pearson and deviance for all values
+  oeFull <- (fullObserved[fullKeep] - fullExpected[fullKeep])^2/ fullExpected[fullKeep] # pearson chisq test
+  dfFull <- length(fullKeep) - sum(fullLhat!=0)
+
+  sqrt(sum(oeFull)/dfFull)
+}
diff --git a/R/barplotNull.R b/R/barplotNull.R
@@ -0,0 +1,23 @@
+#' Percentage of null counts per sample
+#'
+#' Bar plot of the percentage of null counts per sample
+#'
+#' @param counts \code{matrix} of counts
+#' @param group factor vector of the condition from which each sample belongs
+#' @param col colors of the bars (one color per biological condition)
+#' @return A file named barplotNull.png in the figures directory
+#' @author Marie-Agnes Dillies and Hugo Varet
+
+barplotNull <- function(counts, group, col=c("lightblue","orange","MediumVioletRed","SpringGreen")){
+  png(filename="figures/barplotNull.png",width=400,height=400)
+    percentage <- apply(counts, 2, function(x){sum(x == 0)})*100/nrow(counts)
+    percentage.allNull <- (nrow(counts) - nrow(removeNull(counts)))*100/nrow(counts)
+    barplot(percentage, las = 2,
+            col = col[as.integer(group)],
+		    ylab = "Proportion of null counts",
+		    main = "Proportion of null counts per sample", 
+	  	    ylim = c(0,1.2*max(percentage)))
+    abline(h = percentage.allNull, lty = 2, lwd = 2)
+    legend("topright", levels(group), fill=col[1:nlevels(group)], bty="n")
+  dev.off()
+}
diff --git a/R/barplotTotal.R b/R/barplotTotal.R
@@ -0,0 +1,21 @@
+#' Total number of reads per sample
+#'
+#' Bar plot of the total number of reads per sample
+#'
+#' @param counts \code{matrix} of counts
+#' @param group factor vector of the condition from which each sample belongs
+#' @param col colors of the bars (one color per biological condition)
+#' @return A file named barplotTotal.png in the figures directory
+#' @author Marie-Agnes Dillies and Hugo Varet
+
+barplotTotal <- function(counts, group, col=c("lightblue","orange","MediumVioletRed","SpringGreen")){
+  png(filename="figures/barplotTotal.png",width=400,height=400)
+  barplot(colSums(counts),
+          main = "Total read count per sample",
+		  ylab = "Total read count",
+		  ylim = c(0, max(colSums(counts))*1.2),
+		  col = col[as.integer(group)],
+		  las = 2)
+  legend("topright", levels(group), fill=col[1:nlevels(group)], bty="n")
+  dev.off()
+}
diff --git a/R/checkParameters.DESeq2.r b/R/checkParameters.DESeq2.r
@@ -0,0 +1,108 @@
+#' Check the parameters (when using DESeq2)
+#'
+#' Check the format and the validity of the parameters which will be used for the analysis with DESeq2.
+#  For example, it is important that $alpha$ be a numeric of length 1 between 0 and 1. This function avoid
+#  potential stupid bugs when running the suite of the script.
+#'
+#' @param projectName name of the project
+#' @param author author of the statistical analysis/report
+#' @param targetFile path to the design/target file
+#' @param rawDir path to the directory containing raw counts files
+#' @param featuresToRemove names of the features to be removed
+#' @param varInt factor of interest
+#' @param condRef reference biological condition
+#' @param batch blocking factor in the design
+#' @param fitType mean-variance relationship: "parametric" (default) or "local"
+#' @param cooksCutoff outliers detection threshold (NULL to let DESeq2 choosing it)
+#' @param independentFiltering TRUE/FALSE to perform independent filtering
+#' @param alpha threshold of statistical significance
+#' @param pAdjustMethod p-value adjustment method: "BH" (default) or "BY" for example
+#' @param typeTrans transformation for PCA/clustering: "VST" ou "rlog"
+#' @param locfunc "median" (default) or "shorth" to estimate the size factors
+#' @param colors vector of colors of each biological condition on the plots
+#' @return A boolean indicating if there is a problem in the parameters
+#' @author Hugo Varet
+
+checkParameters.DESeq2 <- function(projectName,author,targetFile,rawDir,
+                                   featuresToRemove,varInt,condRef,batch,fitType,
+								   cooksCutoff,independentFiltering,alpha,pAdjustMethod,
+								   typeTrans,locfunc,colors){
+  problem <- FALSE
+  if (!is.character(projectName) | length(projectName)!=1){
+    print("projectName must be a character vector of length 1")
+	problem <- TRUE
+  }
+  if (!is.character(author) | length(author)!=1){
+    print("author must be a character vector of length 1")
+	problem <- TRUE
+  }
+  if (!is.character(targetFile) | length(targetFile)!=1 || !file.exists(targetFile)){
+    print("targetFile must be a character vector of length 1 specifying an accessible file")
+	problem <- TRUE
+  }
+  if (!is.character(rawDir) | length(rawDir)!=1 || is.na(file.info(rawDir)[1,"isdir"]) | !file.info(rawDir)[1,"isdir"]){
+    print("rawDir must be a character vector of length 1 specifying an accessible directory")
+	problem <- TRUE
+  }
+  if (!is.character(featuresToRemove)){
+    print("featuresToRemove must be a character vector")
+	problem <- TRUE
+  }
+  if (!is.character(varInt) | length(varInt)!=1){
+    print("varInt must be a character vector of length 1")
+	problem <- TRUE
+  }
+  if (!is.character(condRef) | length(condRef)!=1){
+    print("condRef must be a character vector of length 1")
+	problem <- TRUE
+  }
+  if (!is.null(batch) && I(!is.character(batch) | length(batch)!=1)){
+    print("batch must be NULL or a character vector of length 1")
+	problem <- TRUE
+  }
+  if (!is.character(fitType) | length(fitType)!=1 || !I(fitType %in% c("parametric","local"))){
+    print("fitType must be equal to 'parametric' or 'local'")
+	problem <- TRUE
+  }
+  if (!is.null(cooksCutoff) && I(!is.numeric(cooksCutoff) |  length(cooksCutoff)!=1 || cooksCutoff<=0)){
+    print("cooksCutoff must be NULL or a numeric vector of length 1 with a positive value")
+	problem <- TRUE
+  }
+  if (!is.logical(independentFiltering) | length(independentFiltering)!=1){
+    print("independentFiltering must be a boolean vector of length 1")
+	problem <- TRUE
+  }
+  if (!is.numeric(alpha) | length(alpha)!=1 || I(alpha<=0 | alpha>=1)){
+    print("alpha must be a numeric vector of length 1 with a value between 0 and 1")
+	problem <- TRUE
+  }
+  if (!is.character(pAdjustMethod) | length(pAdjustMethod)!=1 || !I(pAdjustMethod %in% p.adjust.methods)){
+    print(paste("pAdjustMethod must be a value in", paste(p.adjust.methods, collapse=", ")))
+	problem <- TRUE
+  }
+  if (!is.character(typeTrans) | length(typeTrans)!=1 || !I(typeTrans %in% c("VST","rlog"))){
+    print("typeTrans must be equal to 'VST' or 'rlog'")
+	problem <- TRUE
+  }
+  if (!is.character(locfunc) | length(locfunc)!=1 || !I(locfunc %in% c("median","shorth"))){
+    print("locfunc must be equal to 'median' or 'shorth'")
+	problem <- TRUE
+  } else{
+    if (locfunc=="shorth" & !I("genefilter" %in% installed.packages()[,"Package"])){
+	  print("Package genefilter is needed if using locfunc='shorth'")
+	  problem <- TRUE
+	}
+  }
+  areColors <- function(col){
+    sapply(col, function(X){tryCatch(is.matrix(col2rgb(X)), error=function(e){FALSE})})
+  }
+  if (!is.vector(colors) || !all(areColors(colors))){
+    print("colors must be a vector of colors")
+	problem <- TRUE
+  }
+
+  if (!problem){
+    print("All the parameters are correct")
+  }
+  return(invisible(problem))
+}