Skip to content

Commit

Permalink
v1.0.0
Browse files Browse the repository at this point in the history
First version on GitHub
  • Loading branch information
hvaret committed Dec 10, 2014
1 parent f1b943a commit 401ee90
Show file tree
Hide file tree
Showing 100 changed files with 36,558 additions and 1 deletion.
14 changes: 14 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Package: SARTools
Type: Package
Title: Statistical Analysis of RNA-Seq Tools
Version: 1.0.0
Date: 2014-12-10
Author: Marie-Agnes Dillies and Hugo Varet
Maintainer: Hugo Varet <[email protected]>
Depends: R (>= 3.1.0), DESeq2 (>= 1.6.0), edgeR (>= 3.8.0), xtable
Suggests: genefilter (>= 1.44.0), BiocStyle
Imports: knitr, GenomicRanges, S4Vectors, limma
VignetteBuilder: knitr
Encoding: latin1
Description: SARTools provides R tools and an environment for the statistical analysis of RNA-Seq projects: load and clean data, produce figures, perform statistical analysis/testing with DESeq2 or edgeR, export results and create final report
License: GPL-2
11 changes: 11 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Generated by roxygen2 (4.0.2): do not edit by hand

exportPattern("^[a-zA-Z]")
import(DESeq2)
import(edgeR)
import(xtable)
importFrom(GenomicRanges,assay)
importFrom(GenomicRanges,colData)
importFrom(S4Vectors,mcols)
importFrom(knitr,knit2html)
importFrom(limma,plotMDS)
18 changes: 18 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
CHANGES IN VERSION 1.0.0
------------------------
o First version publicly available on GitHub

CHANGES IN VERSION 0.99.6
-------------------------
o Added batch argument to loadTargetFile()
o Simplified the R script templates with wrappers

CHANGES IN VERSION 0.99.5
-------------------------
o Now check the format/validity of the parameters at the beginning of the script
o Added functions checkParameters.DESeq2() and checkParameters.edgeR()
o Added small precisions in the vignette

CHANGES IN VERSION 0.99.4
-------------------------
o First version of the package
13 changes: 13 additions & 0 deletions R/BCVPlot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#' BCV plot (for edgeR dispersions)
#'
#' Biological Coefficient of Variation plot (for edgeR objects)
#'
#' @param dge a \code{DGEList} object
#' @return A file named BCV.png in the figures directory with a BCV plot produced by the \code{plotBCV()} function of the edgeR package
#' @author Marie-Agnes Dillies and Hugo Varet

BCVPlot <- function(dge){
png(filename="figures/BCV.png", width=400, height=400)
plotBCV(dge, las = 1, main = "BCV plot")
dev.off()
}
33 changes: 33 additions & 0 deletions R/MAPlot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#' MA-plots
#'
#' MA-plot for each comparison: log2(FC) vs mean of normalized counts with one dot per feature (red dot for a differentially expressed feature, black dot otherwise)
#'
#' @param complete A \code{list} of \code{data.frame} containing features results (from \code{exportResults.DESeq2()} or \code{exportResults.edgeR()})
#' @param alpha cut-off to apply on each adjusted p-value
#' @return A file named MAPlot.png in the figures directory containing one MA-plot per comparison
#' @author Marie-Agnes Dillies and Hugo Varet

MAPlot <- function(complete, alpha=0.05){
nrow <- ceiling(sqrt(length(complete)))
ncol <- ceiling(length(complete)/nrow)
png(filename="figures/MAPlot.png", width=400*max(ncol,nrow), height=400*min(ncol,nrow))
par(mfrow=sort(c(nrow,ncol)))
for (name in names(complete)){
complete.name <- complete[[name]]
complete.name <- complete.name[complete.name$baseMean>0,]
complete.name$padj <- ifelse(is.na(complete.name$padj),1,complete.name$padj)
log2FC <- complete.name$log2FoldChange
ylim = 1.1 * c(-1,1) * quantile(abs(log2FC[is.finite(log2FC)]), probs=0.99)
plot(complete.name$baseMean, pmax(ylim[1], pmin(ylim[2], log2FC)),
log = "x", cex=0.45, las = 1, ylim = ylim,
col = ifelse(complete.name[,"padj"] < alpha, "red", "black"),
pch = ifelse(log2FC<ylim[1], 6, ifelse(log2FC>ylim[2], 2, 20)),
xlab = "Mean of normalized counts",
ylab = expression(log[2]~fold~change),
main = paste0("MA-plot - ",gsub("_"," ",name)))
abline(h=0, lwd=1, col="lightgray")
}
dev.off()
}


25 changes: 25 additions & 0 deletions R/MDSPlot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#' MDS plot (for edgeR objects)
#'
#' Multi-Dimensional Scaling plot of samples based on the 500 most variant features (for edgeR analyses)
#'
#' @param dge a \code{DGEList} object
#' @param group vector of the condition from which each sample belongs
#' @param n number of features to keep among the most variant
#' @param gene.selection \code{"pairwise"} to choose the top features separately for each pairwise comparison between the samples or \code{"common"} to select the same features for all comparisons. Only used when \code{method="logFC"}
#' @param col colors to use (one per biological condition)
#' @return A file named MDS.png in the figures directory
#' @author Marie-Agnes Dillies and Hugo Varet

MDSPlot <- function(dge, group, n=500, gene.selection=c("pairwise", "common"),
col=c("lightblue","orange","MediumVioletRed","SpringGreen")){
png(filename="figures/MDS.png", width=400, height=400)
coord <- plotMDS(dge, top=n, method="logFC", gene.selection=gene.selection[1])
abs=range(coord$x); abs=abs(abs[2]-abs[1])/25;
ord=range(coord$y); ord=abs(ord[2]-ord[1])/25;
plot(coord$x,coord$y, col=col[as.integer(group)], las=1, main="Multi-Dimensional Scaling plot",
xlab="Leading logFC dimension 1", ylab="Leading logFC dimension 2", cex=2, pch=16)
abline(h=0,v=0,lty=2,col="lightgray")
text(coord$x - ifelse(coord$x>0,abs,-abs), coord$y - ifelse(coord$y>0,ord,-ord),
colnames(dge$counts), col=col[as.integer(group)])
dev.off()
}
9 changes: 9 additions & 0 deletions R/NAMESPACE.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#' @exportPattern ^[a-zA-Z]
#' @import DESeq2
#' @import edgeR
#' @import xtable
#' @importFrom knitr knit2html
#' @importFrom GenomicRanges colData assay
#' @importFrom S4Vectors mcols
#' @importFrom limma plotMDS
NULL
49 changes: 49 additions & 0 deletions R/PCAPlot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#' PCA of samples (if use of DESeq2)
#'
#' Principal Component Analysis of samples based on the 500 most variant features on VST- or rlog-counts (if use of DESeq2)
#'
#' @param counts.trans a matrix a transformed counts (VST- or rlog-counts)
#' @param group factor vector of the condition from which each sample belongs
#' @param n number of features to keep among the most variant
#' @param col colors to use (one per biological condition)
#' @return A file named PCA.png in the figures directory with a pairwise plot of the three first principal components
#' @author Marie-Agnes Dillies and Hugo Varet

PCAPlot <- function(counts.trans, group, n=500, col=c("lightblue","orange","MediumVioletRed","SpringGreen")){
# PCA on the 500 most variables features
rv = apply(counts.trans, 1, var, na.rm=TRUE)
select = order(rv, decreasing = TRUE)[1:n]
pca = prcomp(t(counts.trans[select, ]))
prp <- pca$sdev^2 * 100 / sum(pca$sdev^2)
prp <- round(prp[1:3],2)

# create figure
png(filename="figures/PCA.png",width=400*2,height=400)
par(mfrow=c(1,2))
# axes 1 et 2
abs=range(pca$x[,1]); abs=abs(abs[2]-abs[1])/25;
ord=range(pca$x[,2]); ord=abs(ord[2]-ord[1])/25;
plot(pca$x[,1], pca$x[,2],
las = 1, cex = 2, pch = 16, col = col[as.integer(group)],
xlab = paste0("PC1 (",prp[1],"%)"),
ylab = paste0("PC2 (",prp[2],"%)"),
main = "Principal Component Analysis - Axes 1 and 2")
abline(h=0,v=0,lty=2,col="lightgray")
text(pca$x[,1] - ifelse(pca$x[,1]>0,abs,-abs), pca$x[,2] - ifelse(pca$x[,2]>0,ord,-ord),
colnames(counts.trans), col=col[as.integer(group)])

# axes 1 et 3
abs=range(pca$x[,1]); abs=abs(abs[2]-abs[1])/25;
ord=range(pca$x[,3]); ord=abs(ord[2]-ord[1])/25;
plot(pca$x[,1], pca$x[,3],
las = 1, cex = 2, pch = 16, col = col[as.integer(group)],
xlab = paste0("PC1 (",prp[1],"%)"),
ylab = paste0("PC3 (",prp[3],"%)"),
main = "Principal Component Analysis - Axes 1 and 3")
abline(h=0,v=0,lty=2,col="lightgray")
text(pca$x[,1] - ifelse(pca$x[,1]>0,abs,-abs), pca$x[,3] - ifelse(pca$x[,3]>0,ord,-ord),
colnames(counts.trans), col=col[as.integer(group)])
dev.off()

return(invisible(pca$x))
}
6 changes: 6 additions & 0 deletions R/SARTools-package.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#' SARTools provides R tools and an environment for the statistical analysis of RNA-Seq projects: load and clean data, produce figures, perform statistical analysis/testing with DESeq2 or edgeR, export results and create final report
#' @title Statistical Analysis of RNA-Seq Tools
#' @author Marie-Agnes Dillies and Hugo Varet
#' @docType package
#' @name SARTools-package
NULL
28 changes: 28 additions & 0 deletions R/SERE.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#' Pairwise SERE for two samples
#'
#' Compute the SERE coefficient for two samples
#'
#' @param observed \code{matrix} with two columns containing observed counts of two samples
#' @return The SERE coefficient for the two samples
#' @references Schulze, Kanwar, Golzenleuchter et al, SERE: Single-parameter quality control and sample comparison for RNA-Seq, BMC Genomics, 2012
#' @author See paper published

SERE <- function(observed){
#calculate lambda and expected values
laneTotals <- colSums(observed)
total <- sum(laneTotals)
fullObserved <- observed[rowSums(observed)>0,]
fullLambda <- rowSums(fullObserved)/total
fullLhat <- fullLambda > 0
fullExpected<- outer(fullLambda, laneTotals)

#keep values
fullKeep <- which(fullExpected > 0)

#calculate degrees of freedom (nrow*(ncol -1) >> number of parameters - calculated (just lamda is calculated >> thats why minus 1)
#calculate pearson and deviance for all values
oeFull <- (fullObserved[fullKeep] - fullExpected[fullKeep])^2/ fullExpected[fullKeep] # pearson chisq test
dfFull <- length(fullKeep) - sum(fullLhat!=0)

sqrt(sum(oeFull)/dfFull)
}
23 changes: 23 additions & 0 deletions R/barplotNull.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#' Percentage of null counts per sample
#'
#' Bar plot of the percentage of null counts per sample
#'
#' @param counts \code{matrix} of counts
#' @param group factor vector of the condition from which each sample belongs
#' @param col colors of the bars (one color per biological condition)
#' @return A file named barplotNull.png in the figures directory
#' @author Marie-Agnes Dillies and Hugo Varet

barplotNull <- function(counts, group, col=c("lightblue","orange","MediumVioletRed","SpringGreen")){
png(filename="figures/barplotNull.png",width=400,height=400)
percentage <- apply(counts, 2, function(x){sum(x == 0)})*100/nrow(counts)
percentage.allNull <- (nrow(counts) - nrow(removeNull(counts)))*100/nrow(counts)
barplot(percentage, las = 2,
col = col[as.integer(group)],
ylab = "Proportion of null counts",
main = "Proportion of null counts per sample",
ylim = c(0,1.2*max(percentage)))
abline(h = percentage.allNull, lty = 2, lwd = 2)
legend("topright", levels(group), fill=col[1:nlevels(group)], bty="n")
dev.off()
}
21 changes: 21 additions & 0 deletions R/barplotTotal.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#' Total number of reads per sample
#'
#' Bar plot of the total number of reads per sample
#'
#' @param counts \code{matrix} of counts
#' @param group factor vector of the condition from which each sample belongs
#' @param col colors of the bars (one color per biological condition)
#' @return A file named barplotTotal.png in the figures directory
#' @author Marie-Agnes Dillies and Hugo Varet

barplotTotal <- function(counts, group, col=c("lightblue","orange","MediumVioletRed","SpringGreen")){
png(filename="figures/barplotTotal.png",width=400,height=400)
barplot(colSums(counts),
main = "Total read count per sample",
ylab = "Total read count",
ylim = c(0, max(colSums(counts))*1.2),
col = col[as.integer(group)],
las = 2)
legend("topright", levels(group), fill=col[1:nlevels(group)], bty="n")
dev.off()
}
108 changes: 108 additions & 0 deletions R/checkParameters.DESeq2.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#' Check the parameters (when using DESeq2)
#'
#' Check the format and the validity of the parameters which will be used for the analysis with DESeq2.
# For example, it is important that $alpha$ be a numeric of length 1 between 0 and 1. This function avoid
# potential stupid bugs when running the suite of the script.
#'
#' @param projectName name of the project
#' @param author author of the statistical analysis/report
#' @param targetFile path to the design/target file
#' @param rawDir path to the directory containing raw counts files
#' @param featuresToRemove names of the features to be removed
#' @param varInt factor of interest
#' @param condRef reference biological condition
#' @param batch blocking factor in the design
#' @param fitType mean-variance relationship: "parametric" (default) or "local"
#' @param cooksCutoff outliers detection threshold (NULL to let DESeq2 choosing it)
#' @param independentFiltering TRUE/FALSE to perform independent filtering
#' @param alpha threshold of statistical significance
#' @param pAdjustMethod p-value adjustment method: "BH" (default) or "BY" for example
#' @param typeTrans transformation for PCA/clustering: "VST" ou "rlog"
#' @param locfunc "median" (default) or "shorth" to estimate the size factors
#' @param colors vector of colors of each biological condition on the plots
#' @return A boolean indicating if there is a problem in the parameters
#' @author Hugo Varet

checkParameters.DESeq2 <- function(projectName,author,targetFile,rawDir,
featuresToRemove,varInt,condRef,batch,fitType,
cooksCutoff,independentFiltering,alpha,pAdjustMethod,
typeTrans,locfunc,colors){
problem <- FALSE
if (!is.character(projectName) | length(projectName)!=1){
print("projectName must be a character vector of length 1")
problem <- TRUE
}
if (!is.character(author) | length(author)!=1){
print("author must be a character vector of length 1")
problem <- TRUE
}
if (!is.character(targetFile) | length(targetFile)!=1 || !file.exists(targetFile)){
print("targetFile must be a character vector of length 1 specifying an accessible file")
problem <- TRUE
}
if (!is.character(rawDir) | length(rawDir)!=1 || is.na(file.info(rawDir)[1,"isdir"]) | !file.info(rawDir)[1,"isdir"]){
print("rawDir must be a character vector of length 1 specifying an accessible directory")
problem <- TRUE
}
if (!is.character(featuresToRemove)){
print("featuresToRemove must be a character vector")
problem <- TRUE
}
if (!is.character(varInt) | length(varInt)!=1){
print("varInt must be a character vector of length 1")
problem <- TRUE
}
if (!is.character(condRef) | length(condRef)!=1){
print("condRef must be a character vector of length 1")
problem <- TRUE
}
if (!is.null(batch) && I(!is.character(batch) | length(batch)!=1)){
print("batch must be NULL or a character vector of length 1")
problem <- TRUE
}
if (!is.character(fitType) | length(fitType)!=1 || !I(fitType %in% c("parametric","local"))){
print("fitType must be equal to 'parametric' or 'local'")
problem <- TRUE
}
if (!is.null(cooksCutoff) && I(!is.numeric(cooksCutoff) | length(cooksCutoff)!=1 || cooksCutoff<=0)){
print("cooksCutoff must be NULL or a numeric vector of length 1 with a positive value")
problem <- TRUE
}
if (!is.logical(independentFiltering) | length(independentFiltering)!=1){
print("independentFiltering must be a boolean vector of length 1")
problem <- TRUE
}
if (!is.numeric(alpha) | length(alpha)!=1 || I(alpha<=0 | alpha>=1)){
print("alpha must be a numeric vector of length 1 with a value between 0 and 1")
problem <- TRUE
}
if (!is.character(pAdjustMethod) | length(pAdjustMethod)!=1 || !I(pAdjustMethod %in% p.adjust.methods)){
print(paste("pAdjustMethod must be a value in", paste(p.adjust.methods, collapse=", ")))
problem <- TRUE
}
if (!is.character(typeTrans) | length(typeTrans)!=1 || !I(typeTrans %in% c("VST","rlog"))){
print("typeTrans must be equal to 'VST' or 'rlog'")
problem <- TRUE
}
if (!is.character(locfunc) | length(locfunc)!=1 || !I(locfunc %in% c("median","shorth"))){
print("locfunc must be equal to 'median' or 'shorth'")
problem <- TRUE
} else{
if (locfunc=="shorth" & !I("genefilter" %in% installed.packages()[,"Package"])){
print("Package genefilter is needed if using locfunc='shorth'")
problem <- TRUE
}
}
areColors <- function(col){
sapply(col, function(X){tryCatch(is.matrix(col2rgb(X)), error=function(e){FALSE})})
}
if (!is.vector(colors) || !all(areColors(colors))){
print("colors must be a vector of colors")
problem <- TRUE
}

if (!problem){
print("All the parameters are correct")
}
return(invisible(problem))
}
Loading

0 comments on commit 401ee90

Please sign in to comment.