Submit to github

wenbostar · Nov 2, 2017 · c05e388 · c05e388
commit c05e388
Show file tree

Hide file tree

Showing 41 changed files with 60,169 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,28 @@
+Package: PGA
+Type: Package
+Title: An package for identification of novel peptides by customized
+        database derived from RNA-Seq
+Description: This package provides functions for construction of customized
+    protein databases based on RNA-Seq data with/without genome guided, database
+    searching, post-processing and report generation. This kind of customized
+    protein database includes both the reference database (such as Refseq or
+    ENSEMBL) and the novel peptide sequences form RNA-Seq data.
+Version: 1.9.1
+Date: 2017-06-26
+Author: Shaohang Xu, Bo Wen
+Maintainer: Bo Wen <[email protected]>, Shaohang Xu <[email protected]>
+Depends: R (>= 3.0.1), IRanges, GenomicRanges, Biostrings (>= 2.26.3),
+        data.table, rTANDEM
+Imports: S4Vectors (>= 0.9.25), Rsamtools (>= 1.10.2), GenomicFeatures
+        (>= 1.19.8), biomaRt (>= 2.17.1), stringr, RCurl, Nozzle.R1,
+        VariantAnnotation (>= 1.7.28), rtracklayer, RSQLite, ggplot2,
+        AnnotationDbi, customProDB (>= 1.7.0), pheatmap
+Suggests: BSgenome.Hsapiens.UCSC.hg19, RUnit, BiocGenerics, BiocStyle,
+        knitr, R.utils
+VignetteBuilder: knitr
+License: GPL-2
+biocViews: Proteomics, MassSpectrometry, Software, ReportWriting,
+        RNASeq, Sequencing
+RoxygenNote: 5.0.1
+NeedsCompilation: no
+Packaged: 2017-10-31 00:02:07 UTC; biocbuild
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,48 @@
+importFrom(data.table,"%between%", "%chin%", "%like%", ":=",  as.data.table, between, chgroup, chmatch, chorder, CJ, copy, data.table, dcast, dcast.data.table, foverlaps,fread, haskey, key, "key<-", key2, last, like,melt, set, set2key, set2keyv, setattr, setDF, setDT, setkey, setkeyv, setnames,  setorder, setorderv)
+importFrom("grDevices", "colorRampPalette", "dev.off", "pdf", "png",
+             "rainbow")
+importFrom("graphics", "abline", "axTicks", "axis", "barplot", "box",
+             "clip", "hist", "lines", "mtext", "par", "pie", "plot",
+             "points", "rug", "text")
+importFrom("stats", "density", "rt", "setNames")
+importFrom("utils", "read.delim", "read.table", "write.table")
+import(pheatmap)
+import(RCurl)
+import(Nozzle.R1)
+import(stringr)
+import(ggplot2)
+import(S4Vectors)
+import(IRanges)
+import(GenomicRanges)
+importFrom(Biostrings,
+    AAStringSet, DNAStringSet, DNAStringSetList, DNA_BASES, getSeq ,
+    nchar, reverseComplement, substr, translate)
+import(Rsamtools)
+importFrom(GenomicFeatures,exonsBy,makeTxDbFromGFF,extractTranscriptSeqs)
+import(RSQLite)
+importFrom(rtracklayer,browserSession, ucscTableQuery,
+                    tableNames, getTable, trackNames, ucscSchema,import,export)
+importFrom(biomaRt, getBM, listAttributes, listDatasets, listMarts,
+           useDataset, useMart)
+
+importFrom(GenomicFeatures, cdsBy, exons, transcripts,makeTxDbFromUCSC,
+    fiveUTRsByTranscript, threeUTRsByTranscript)
+
+importFrom(VariantAnnotation, scanVcf, scanVcfHeader, samples)
+importFrom(AnnotationDbi, loadDb, saveDb)
+importFrom(customProDB,aaVariation,Bed2Range,InputVcf,JunctionType,
+           OutputNovelJun,Outputproseq,OutputVarproseq,
+           OutputVarproseq_single,Positionincoding,
+           PrepareAnnotationEnsembl,Varlocation)
+importFrom(rTANDEM, rTParam, rTTaxo, setParamValue, tandem)
+
+
+export(dbCreator)
+export(reportGear)
+export(runTandem)
+export(parserGear)
+export(easyRun)
+export(PrepareAnnotationEnsembl2)
+export(PrepareAnnotationRefseq2)
+export(createProDB4DenovoRNASeq)
+export(addGeneName4Ensembl)
diff --git a/NEWS b/NEWS
@@ -0,0 +1,28 @@
+Version 1.9.1
+
+o Add a parameter to set FDR
+o Update parser program
+
+Version 1.5.1
+
+o Fix the bug in Outputaberrant2
+
+Version 1.3.15
+
+o Fix some bugs
+
+Version 1.3.10
+o Fix the bug of citation file
+o Update site of ensembl when using useMart
+
+Version 1.3.6
+o Add citation information
+
+Version 1.3.5
+o Fix some bugs
+o Add the funcion of constructing the Trinity protein DB
+o Add the "addGeneName4Ensembl" function
+
+Version 0.1.0
+o First release version
+
diff --git a/R/dbCreator.R b/R/dbCreator.R
@@ -0,0 +1,204 @@
+
+##' @title Create customized protein database from RNA-Seq data
+##' @description The main function to create customized protein database from 
+##' RNA-Seq data
+##' @param gtfFile A GTF format file containing novel transcripts information
+##' @param vcfFile A VCF format file containing SNV and INDEL information
+##' @param bedFile A BED format file containing juction information
+##' @param annotation_path This directory contains numerous pieces of genome 
+##' annotation information which can be downloaded by 
+##' \code{\link{PrepareAnnotationEnsembl2}} or \code{\link{PrepareAnnotationRefseq2}}.
+##' @param outdir Output directory
+##' @param outfile_name Output file name
+##' @param lablersid A logical indicating whether to do the SNV annotation(dbSNP)
+##' @param COSMIC A logical indicating whether to do the SNV annotation(COSMIC)
+##' @param bool_get_longest When it's set as TRUE, the longest sequences will be 
+##' retained after the DNA sequences are six-frame translated into protein 
+##' sequences. Otherwise, the protein sequences more than 30 aa are retained.
+##' @param organism What is the Genus and species of this organism.Please use 
+##' proper scientific nomenclature for example: "Homo sapiens" and not "human",
+##' default is "Homo sapiens".
+##' @param make_decoy A logical indicating whether to add the decoy sequences
+##' @param genome Genome information. This is a BSgenome object(e.g. Hsapiens). 
+##' Default is NULL.
+##' @param ... Additional arguments
+##' @export
+##' @return The database file
+##' @examples 
+##' vcffile <- system.file("extdata/input", "PGA.vcf",package="PGA")
+##' bedfile <- system.file("extdata/input", "junctions.bed",package="PGA")
+##' gtffile <- system.file("extdata/input", "transcripts.gtf",package="PGA")
+##' annotation <- system.file("extdata", "annotation",package="PGA")
+##' outfile_path<-"db/"
+##' outfile_name<-"test"
+##' library(BSgenome.Hsapiens.UCSC.hg19)
+##' dbCreator(gtfFile=gtffile,vcfFile=vcffile,bedFile=bedfile,
+##'           annotation_path=annotation,outfile_name=outfile_name,
+##'           genome=Hsapiens,outdir=outfile_path)
+dbCreator <- function(gtfFile=NULL,vcfFile=NULL,bedFile=NULL,annotation_path=NULL, 
+                      outdir, outfile_name,lablersid=FALSE, COSMIC=FALSE,
+                      bool_get_longest=TRUE,organism="Homo sapiens",
+                      make_decoy=TRUE, genome=NULL, ...) {
+
+    dir.create(outdir,showWarnings = FALSE,recursive = TRUE)
+
+    if(is.null(annotation_path)) {
+        stop("must specify the path of annotation files")
+    }
+
+    if(class(genome)!="BSgenome"){
+        stop("The parameter 'genome' must be class 'BSgenome'.")
+    }
+    exon <- ''
+    ids <- ''
+    proteinseq <- ''
+    procodingseq <- ''
+    falist<-c()
+
+    ## The following four files are from 
+    load(paste(annotation_path, '/procodingseq.RData', sep=''))
+    load(paste(annotation_path, '/exon_anno.RData', sep=''))
+    load(paste(annotation_path, '/ids.RData', sep=''))   
+    load(paste(annotation_path, '/proseq.RData', sep=''))   
+
+    ## InputVcf {customProDB}
+    ## The InputVcf() function generates a list of GRanges object from a single 
+    ## VCF file.
+	if(!is.null(vcfFile))
+	{
+    	vcf <- InputVcf(vcfFile)
+    	#table(values(vcf[[1]])[['INDEL']])
+
+    	idx_snv <- which(values(vcf[[1]])[['INDEL']] == FALSE)
+    	SNVvcf <- vcf[[1]][idx_snv]
+    	idx_indel <- which(values(vcf[[1]])[['INDEL']] == TRUE)
+    	indelvcf <- vcf[[1]][idx_indel]
+	}
+	else
+	{
+		SNVvcf<-NULL
+		indelvcf<-NULL
+	}
+
+    if(length(indelvcf) >=1){
+
+        message("Output abberant protein FASTA file caused by short INDEL... ",
+                appendLF=FALSE)
+
+        ## Find the position in coding sequence for each variation.
+        postable_indel <- Positionincoding(indelvcf, exon)
+        if(all(is.na(postable_indel$proname))){
+            postable_indel[,"proname"] <- ids[match(postable_indel[, 'txname'], 
+                                                    ids[,'tx_name']),'pro_name']
+            postable_indel[,"genename"]<-ids[match(postable_indel[, 'txname'], 
+                                                   ids[,'tx_name']),'gene_name']
+        }
+        outf_indel <- paste(outdir, '/', outfile_name, '_indel.fasta', 
+                            sep='')
+        outm_indel <- paste(outdir, '/', outfile_name, '_indel.tab', 
+                            sep='')
+
+        if(!is.null(postable_indel)){
+            chrlist <- c(seq(1:22),'X','Y')
+            if( !any(seqnames(indelvcf) %in% chrlist) ){
+                chrlist <- paste('chr',c(seq(1:22),'X','Y'),sep='')
+            }  
+            indexchr <-which(postable_indel[,'chr'] %in% chrlist)
+            postable_indel <- postable_indel[indexchr,]
+
+            txlist_indel <- unique(postable_indel[, 'txid'])
+            codingseq_indel <- procodingseq[procodingseq[, 'tx_id'] %in% 
+                                                txlist_indel, ]
+            #save(postable_indel,file="postable_indel.Rdata")#for testing
+            #save(codingseq_indel,file="codingseq_indel.Rdata")#for testing
+            Outputaberrant2(postable_indel, coding=codingseq_indel, 
+                            proteinseq=proteinseq, outfa=outf_indel,
+                            outmtab=outm_indel, ids=ids) 
+            falist<-c(outf_indel,falist)
+        }
+
+        packageStartupMessage(" done")
+    }
+
+    if(length(SNVvcf) >=1){
+        message("Output variation table and variant protein sequence caused by",
+                "SNVs... ", appendLF=FALSE)
+        if(lablersid==TRUE){
+            dbsnpinCoding <- ''
+            load(paste(annotation_path, '/dbsnpinCoding.RData', sep=''))
+            if(COSMIC==TRUE){
+                cosmic <- ''
+                load(paste(annotation_path, '/cosmic.RData', sep=''))
+                postable_snv <- Positionincoding(SNVvcf, exon, 
+                                                 dbsnp=dbsnpinCoding, 
+                                                 COSMIC=cosmic)
+            }else{
+                postable_snv <- Positionincoding(SNVvcf, exon, 
+                                                 dbsnp=dbsnpinCoding)
+            }
+        }else{
+            if(COSMIC==TRUE){
+                cosmic <- ''
+                load(paste(annotation_path, '/cosmic.RData', sep=''))
+                postable_snv <- Positionincoding(SNVvcf, exon, dbsnp=NULL, 
+                                                 COSMIC=cosmic)
+            }else{
+                postable_snv <- Positionincoding(SNVvcf, exon)
+            }
+        }
+        txlist <- unique(postable_snv[, 'txid'])
+        codingseq <- procodingseq[procodingseq[, 'tx_id'] %in% txlist, ]
+        mtab <- aaVariation (postable_snv, codingseq)
+
+        outm_snv <- paste(outdir, '/', outfile_name, '_snv.tab', sep='')
+        outf_snv <- paste(outdir, '/', outfile_name, '_snv.fasta', sep='')
+        OutputVarproseq2(mtab, proteinseq, outf_snv, outm_snv, ids, 
+                         lablersid=lablersid)
+        falist<-c(outf_snv,falist)
+
+        packageStartupMessage(" done")
+    }
+
+    if( !is.null(bedFile) & !is.null(genome) ){
+        message("Output novel junction peptides... ", appendLF=FALSE)
+        splicemax <- ''
+        load(paste(annotation_path, '/splicemax.RData', sep=''))
+        txdb <- loadDb(paste(annotation_path, '/txdb.sqlite', sep=''))
+        jun <-  Bed2Range(bedFile, skip=1, covfilter=5)
+        junction_type <- JunctionType(jun, splicemax, txdb, ids)
+        outf_junc <- paste(outdir, '/', outfile_name, 
+                           '_junc.fasta', sep='')
+        outm_junc <- paste(outdir, '/', outfile_name, 
+                           '_junc.tab', sep='')
+        OutputNovelJun2(junction_type, genome, outf_junc,outm_junc, 
+                        proteinseq)
+        falist<-c(outf_junc,falist)
+        packageStartupMessage(" done")
+    }
+
+    if( !is.null(gtfFile) & !is.null(genome) ){
+        message("Output novel transripts... ", appendLF=FALSE)
+        outf_ntx <- paste(outdir, '/', outfile_name, 
+                          '_ntx.fasta', sep='')
+        outm_ntx <- paste(outdir, '/', outfile_name, 
+                          '_ntx.tab', sep='')
+        outg_ntx <- paste(outdir, '/', outfile_name, 
+                          '_ntx.gtf', sep='')
+
+        getNovelTx(gtfFile,genome=genome,outfa=outf_ntx,outmtab=outm_ntx,
+                   outgtf=outg_ntx,bool_get_longest=bool_get_longest,
+                   organism=organism)
+        falist<-c(outf_ntx,falist)
+        packageStartupMessage(" done")
+    }
+
+    message("Cat all fasta ... ", appendLF=FALSE)
+    final_db_file <- dbcat(fa=falist,
+                           proteinseq=proteinseq,
+                           make_decoy=make_decoy,
+                           outfile_path=outdir,
+                           outfile_name=outfile_name,...)
+    packageStartupMessage(" done")
+    return(final_db_file)
+}
+