Skip to content

Commit

Permalink
Submit to github
Browse files Browse the repository at this point in the history
  • Loading branch information
wenbostar committed Nov 2, 2017
0 parents commit c05e388
Show file tree
Hide file tree
Showing 41 changed files with 60,169 additions and 0 deletions.
28 changes: 28 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Package: PGA
Type: Package
Title: An package for identification of novel peptides by customized
database derived from RNA-Seq
Description: This package provides functions for construction of customized
protein databases based on RNA-Seq data with/without genome guided, database
searching, post-processing and report generation. This kind of customized
protein database includes both the reference database (such as Refseq or
ENSEMBL) and the novel peptide sequences form RNA-Seq data.
Version: 1.9.1
Date: 2017-06-26
Author: Shaohang Xu, Bo Wen
Maintainer: Bo Wen <[email protected]>, Shaohang Xu <[email protected]>
Depends: R (>= 3.0.1), IRanges, GenomicRanges, Biostrings (>= 2.26.3),
data.table, rTANDEM
Imports: S4Vectors (>= 0.9.25), Rsamtools (>= 1.10.2), GenomicFeatures
(>= 1.19.8), biomaRt (>= 2.17.1), stringr, RCurl, Nozzle.R1,
VariantAnnotation (>= 1.7.28), rtracklayer, RSQLite, ggplot2,
AnnotationDbi, customProDB (>= 1.7.0), pheatmap
Suggests: BSgenome.Hsapiens.UCSC.hg19, RUnit, BiocGenerics, BiocStyle,
knitr, R.utils
VignetteBuilder: knitr
License: GPL-2
biocViews: Proteomics, MassSpectrometry, Software, ReportWriting,
RNASeq, Sequencing
RoxygenNote: 5.0.1
NeedsCompilation: no
Packaged: 2017-10-31 00:02:07 UTC; biocbuild
48 changes: 48 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
importFrom(data.table,"%between%", "%chin%", "%like%", ":=", as.data.table, between, chgroup, chmatch, chorder, CJ, copy, data.table, dcast, dcast.data.table, foverlaps,fread, haskey, key, "key<-", key2, last, like,melt, set, set2key, set2keyv, setattr, setDF, setDT, setkey, setkeyv, setnames, setorder, setorderv)
importFrom("grDevices", "colorRampPalette", "dev.off", "pdf", "png",
"rainbow")
importFrom("graphics", "abline", "axTicks", "axis", "barplot", "box",
"clip", "hist", "lines", "mtext", "par", "pie", "plot",
"points", "rug", "text")
importFrom("stats", "density", "rt", "setNames")
importFrom("utils", "read.delim", "read.table", "write.table")
import(pheatmap)
import(RCurl)
import(Nozzle.R1)
import(stringr)
import(ggplot2)
import(S4Vectors)
import(IRanges)
import(GenomicRanges)
importFrom(Biostrings,
AAStringSet, DNAStringSet, DNAStringSetList, DNA_BASES, getSeq ,
nchar, reverseComplement, substr, translate)
import(Rsamtools)
importFrom(GenomicFeatures,exonsBy,makeTxDbFromGFF,extractTranscriptSeqs)
import(RSQLite)
importFrom(rtracklayer,browserSession, ucscTableQuery,
tableNames, getTable, trackNames, ucscSchema,import,export)
importFrom(biomaRt, getBM, listAttributes, listDatasets, listMarts,
useDataset, useMart)

importFrom(GenomicFeatures, cdsBy, exons, transcripts,makeTxDbFromUCSC,
fiveUTRsByTranscript, threeUTRsByTranscript)

importFrom(VariantAnnotation, scanVcf, scanVcfHeader, samples)
importFrom(AnnotationDbi, loadDb, saveDb)
importFrom(customProDB,aaVariation,Bed2Range,InputVcf,JunctionType,
OutputNovelJun,Outputproseq,OutputVarproseq,
OutputVarproseq_single,Positionincoding,
PrepareAnnotationEnsembl,Varlocation)
importFrom(rTANDEM, rTParam, rTTaxo, setParamValue, tandem)


export(dbCreator)
export(reportGear)
export(runTandem)
export(parserGear)
export(easyRun)
export(PrepareAnnotationEnsembl2)
export(PrepareAnnotationRefseq2)
export(createProDB4DenovoRNASeq)
export(addGeneName4Ensembl)
28 changes: 28 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Version 1.9.1

o Add a parameter to set FDR
o Update parser program

Version 1.5.1

o Fix the bug in Outputaberrant2

Version 1.3.15

o Fix some bugs

Version 1.3.10
o Fix the bug of citation file
o Update site of ensembl when using useMart

Version 1.3.6
o Add citation information

Version 1.3.5
o Fix some bugs
o Add the funcion of constructing the Trinity protein DB
o Add the "addGeneName4Ensembl" function

Version 0.1.0
o First release version

204 changes: 204 additions & 0 deletions R/dbCreator.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@

##' @title Create customized protein database from RNA-Seq data
##' @description The main function to create customized protein database from
##' RNA-Seq data
##' @param gtfFile A GTF format file containing novel transcripts information
##' @param vcfFile A VCF format file containing SNV and INDEL information
##' @param bedFile A BED format file containing juction information
##' @param annotation_path This directory contains numerous pieces of genome
##' annotation information which can be downloaded by
##' \code{\link{PrepareAnnotationEnsembl2}} or \code{\link{PrepareAnnotationRefseq2}}.
##' @param outdir Output directory
##' @param outfile_name Output file name
##' @param lablersid A logical indicating whether to do the SNV annotation(dbSNP)
##' @param COSMIC A logical indicating whether to do the SNV annotation(COSMIC)
##' @param bool_get_longest When it's set as TRUE, the longest sequences will be
##' retained after the DNA sequences are six-frame translated into protein
##' sequences. Otherwise, the protein sequences more than 30 aa are retained.
##' @param organism What is the Genus and species of this organism.Please use
##' proper scientific nomenclature for example: "Homo sapiens" and not "human",
##' default is "Homo sapiens".
##' @param make_decoy A logical indicating whether to add the decoy sequences
##' @param genome Genome information. This is a BSgenome object(e.g. Hsapiens).
##' Default is NULL.
##' @param ... Additional arguments
##' @export
##' @return The database file
##' @examples
##' vcffile <- system.file("extdata/input", "PGA.vcf",package="PGA")
##' bedfile <- system.file("extdata/input", "junctions.bed",package="PGA")
##' gtffile <- system.file("extdata/input", "transcripts.gtf",package="PGA")
##' annotation <- system.file("extdata", "annotation",package="PGA")
##' outfile_path<-"db/"
##' outfile_name<-"test"
##' library(BSgenome.Hsapiens.UCSC.hg19)
##' dbCreator(gtfFile=gtffile,vcfFile=vcffile,bedFile=bedfile,
##' annotation_path=annotation,outfile_name=outfile_name,
##' genome=Hsapiens,outdir=outfile_path)
dbCreator <- function(gtfFile=NULL,vcfFile=NULL,bedFile=NULL,annotation_path=NULL,
outdir, outfile_name,lablersid=FALSE, COSMIC=FALSE,
bool_get_longest=TRUE,organism="Homo sapiens",
make_decoy=TRUE, genome=NULL, ...) {

dir.create(outdir,showWarnings = FALSE,recursive = TRUE)

if(is.null(annotation_path)) {
stop("must specify the path of annotation files")
}

if(class(genome)!="BSgenome"){
stop("The parameter 'genome' must be class 'BSgenome'.")
}
exon <- ''
ids <- ''
proteinseq <- ''
procodingseq <- ''
falist<-c()

## The following four files are from
load(paste(annotation_path, '/procodingseq.RData', sep=''))
load(paste(annotation_path, '/exon_anno.RData', sep=''))
load(paste(annotation_path, '/ids.RData', sep=''))
load(paste(annotation_path, '/proseq.RData', sep=''))

## InputVcf {customProDB}
## The InputVcf() function generates a list of GRanges object from a single
## VCF file.
if(!is.null(vcfFile))
{
vcf <- InputVcf(vcfFile)
#table(values(vcf[[1]])[['INDEL']])

idx_snv <- which(values(vcf[[1]])[['INDEL']] == FALSE)
SNVvcf <- vcf[[1]][idx_snv]
idx_indel <- which(values(vcf[[1]])[['INDEL']] == TRUE)
indelvcf <- vcf[[1]][idx_indel]
}
else
{
SNVvcf<-NULL
indelvcf<-NULL
}

if(length(indelvcf) >=1){

message("Output abberant protein FASTA file caused by short INDEL... ",
appendLF=FALSE)

## Find the position in coding sequence for each variation.
postable_indel <- Positionincoding(indelvcf, exon)
if(all(is.na(postable_indel$proname))){
postable_indel[,"proname"] <- ids[match(postable_indel[, 'txname'],
ids[,'tx_name']),'pro_name']
postable_indel[,"genename"]<-ids[match(postable_indel[, 'txname'],
ids[,'tx_name']),'gene_name']
}
outf_indel <- paste(outdir, '/', outfile_name, '_indel.fasta',
sep='')
outm_indel <- paste(outdir, '/', outfile_name, '_indel.tab',
sep='')

if(!is.null(postable_indel)){
chrlist <- c(seq(1:22),'X','Y')
if( !any(seqnames(indelvcf) %in% chrlist) ){
chrlist <- paste('chr',c(seq(1:22),'X','Y'),sep='')
}
indexchr <-which(postable_indel[,'chr'] %in% chrlist)
postable_indel <- postable_indel[indexchr,]

txlist_indel <- unique(postable_indel[, 'txid'])
codingseq_indel <- procodingseq[procodingseq[, 'tx_id'] %in%
txlist_indel, ]
#save(postable_indel,file="postable_indel.Rdata")#for testing
#save(codingseq_indel,file="codingseq_indel.Rdata")#for testing
Outputaberrant2(postable_indel, coding=codingseq_indel,
proteinseq=proteinseq, outfa=outf_indel,
outmtab=outm_indel, ids=ids)
falist<-c(outf_indel,falist)
}

packageStartupMessage(" done")
}

if(length(SNVvcf) >=1){
message("Output variation table and variant protein sequence caused by",
"SNVs... ", appendLF=FALSE)
if(lablersid==TRUE){
dbsnpinCoding <- ''
load(paste(annotation_path, '/dbsnpinCoding.RData', sep=''))
if(COSMIC==TRUE){
cosmic <- ''
load(paste(annotation_path, '/cosmic.RData', sep=''))
postable_snv <- Positionincoding(SNVvcf, exon,
dbsnp=dbsnpinCoding,
COSMIC=cosmic)
}else{
postable_snv <- Positionincoding(SNVvcf, exon,
dbsnp=dbsnpinCoding)
}
}else{
if(COSMIC==TRUE){
cosmic <- ''
load(paste(annotation_path, '/cosmic.RData', sep=''))
postable_snv <- Positionincoding(SNVvcf, exon, dbsnp=NULL,
COSMIC=cosmic)
}else{
postable_snv <- Positionincoding(SNVvcf, exon)
}
}
txlist <- unique(postable_snv[, 'txid'])
codingseq <- procodingseq[procodingseq[, 'tx_id'] %in% txlist, ]
mtab <- aaVariation (postable_snv, codingseq)

outm_snv <- paste(outdir, '/', outfile_name, '_snv.tab', sep='')
outf_snv <- paste(outdir, '/', outfile_name, '_snv.fasta', sep='')
OutputVarproseq2(mtab, proteinseq, outf_snv, outm_snv, ids,
lablersid=lablersid)
falist<-c(outf_snv,falist)

packageStartupMessage(" done")
}

if( !is.null(bedFile) & !is.null(genome) ){
message("Output novel junction peptides... ", appendLF=FALSE)
splicemax <- ''
load(paste(annotation_path, '/splicemax.RData', sep=''))
txdb <- loadDb(paste(annotation_path, '/txdb.sqlite', sep=''))
jun <- Bed2Range(bedFile, skip=1, covfilter=5)
junction_type <- JunctionType(jun, splicemax, txdb, ids)
outf_junc <- paste(outdir, '/', outfile_name,
'_junc.fasta', sep='')
outm_junc <- paste(outdir, '/', outfile_name,
'_junc.tab', sep='')
OutputNovelJun2(junction_type, genome, outf_junc,outm_junc,
proteinseq)
falist<-c(outf_junc,falist)
packageStartupMessage(" done")
}

if( !is.null(gtfFile) & !is.null(genome) ){
message("Output novel transripts... ", appendLF=FALSE)
outf_ntx <- paste(outdir, '/', outfile_name,
'_ntx.fasta', sep='')
outm_ntx <- paste(outdir, '/', outfile_name,
'_ntx.tab', sep='')
outg_ntx <- paste(outdir, '/', outfile_name,
'_ntx.gtf', sep='')

getNovelTx(gtfFile,genome=genome,outfa=outf_ntx,outmtab=outm_ntx,
outgtf=outg_ntx,bool_get_longest=bool_get_longest,
organism=organism)
falist<-c(outf_ntx,falist)
packageStartupMessage(" done")
}

message("Cat all fasta ... ", appendLF=FALSE)
final_db_file <- dbcat(fa=falist,
proteinseq=proteinseq,
make_decoy=make_decoy,
outfile_path=outdir,
outfile_name=outfile_name,...)
packageStartupMessage(" done")
return(final_db_file)
}

Loading

0 comments on commit c05e388

Please sign in to comment.