Skip to content

Commit

Permalink
Merge pull request #4 from yhoogstrate/obtain_stats
Browse files Browse the repository at this point in the history
Obtain stats
  • Loading branch information
yhoogstrate authored Jul 1, 2016
2 parents c4f9971 + d0f84d9 commit 7429a31
Show file tree
Hide file tree
Showing 3 changed files with 372 additions and 57 deletions.
25 changes: 23 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,23 @@
![Dr](http://www.efeeme.com/wp-content/uploads/hugh-laurie-25-03-13.jpg)
![Disco](https://openclipart.org/image/2400px/svg_to_png/191143/1392845552.png)
Dr. Disco
=========

Running RNA-STAR with fusion settings will produce a ``....Chimeric.out.sam`` file, containing all discordant reads. This alignment does not properly link mates together by having some incorrectly used tags. This tool, fix-chimeric-out-bam.py, is able to solve these issues allowing you to view the discordant reads in IGV in much more detail. Because the tool makes the *disco*rdant alignment health again, we've named it **Dr. Disco**.

Installation
------------
None yet, please make sure pysam 0.9, and click are avaiable to python, e.g. using `pip install ...`. Also samtools >= 1.3 is required.

Usage
-----
If you have as input file `....Chimeric.out.sam`, you should run Dr. Disco as follows:

```
samtools view -bS '....Chimeric.out.sam' > '....Chimeric.out.bam' ;
./fix-chimeric-out-bam.py '....Chimeric.out.bam' '....Chimeric.fixed.bam'
```

If you have bam files in advance, you can proceed with:

```
./fix-chimeric-out-bam.py '....Chimeric.out.bam' '....Chimeric.fixed.bam'
```
120 changes: 120 additions & 0 deletions plots.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
library(MASS)

setwd("/home/youri/Desktop/protocol_hg19/rna/fusion/0_fuma-prostate/extract_breakpoints_in_STAR_fusion_results")


plot_gene <- function(frequencies, frequencies_windowed, entropy, intron_mask) {
data = read.delim(frequencies)
#data_w = read.table(frequencies_windowed)
data_e = read.table(entropy)
data_i = read.table(intron_mask)

weight_s = median(data$s[data$s != 0])
weight_e = median(data$e[data$e != 0])

n = length(data$e)
x = 0:(n-1)
ymax = log2(max(data$e * data_i$V2,data$s * data_i$V2) + 1)

plot(c(0,n-1),c(0,ymax*2.05),type="n")
abline(h=ymax*1.05)
##------------------------------------------------------------
ds = c(data$s,data$j)
#ds2 = ds[ds < 10]
ds = ds[ds > 0]

#hist(ds,breaks=20)
fit = fitdistr(ds[ds > 0], 'Poisson')
simg = dpois(1:10000,fit$estimate)
simg = simg/max(simg) * 120000*3.6
lines(1:10000,simg,lwd=2,col="blue")
#transition1 = qpois(0.950, lambda = fit$estimate[1])
#transition2 = qpois(0.975, lambda = fit$estimate[1])
transition3 = qpois(0.990, lambda = fit$estimate[1])
transition4 = qpois(0.999, lambda = fit$estimate[1])
transition5 = qpois(0.9999,lambda = fit$estimate[1])

#abline(h=log2(transition1+1),lty=2,col="#880088")
#abline(h=log2(transition2+1),lty=2,col="#660066")
abline(h=log2(transition3+1),lty=2,col="#880088")
abline(h=log2(transition4+1),lty=2,col="#660066")
abline(h=log2(transition5+1),lty=2,col="#440044")

## Gamma can't deal with 0 values
##fit = fitdistr(ds[ds > 0], 'gamma')
##simg = dgamma(1:10000, shape = fit$estimate[1], rate = fit$estimate[2])
##simg = simg / max(simg) * 1200
##lines(1:10000,simg,lwd=2,col="red")

fit = fitdistr(ds[ds > 0], 'lognormal')
#simg = dlnorm(1:10000, meanlog = fit$estimate[1], sdlog = fit$estimate[2])
#simg = simg / max(simg) * 1200
#lines(1:10000,simg,lwd=2,col="darkgreen")

#transition1 = qlnorm(0.950, meanlog = fit$estimate[1], sdlog = fit$estimate[2])
#transition2 = qlnorm(0.975, meanlog = fit$estimate[1], sdlog = fit$estimate[2])
#transition3 = qlnorm(0.990, meanlog = fit$estimate[1], sdlog = fit$estimate[2])

#abline(h=log2(transition1+1),lty=3,col="#008888")
#abline(h=log2(transition2+1),lty=3,col="#006666")
#abline(h=log2(transition3+1),lty=3,col="#004444")

## -------------------------------------------------------------


#Draw entropy dots first
points(data_e$V1,(data_e$V2 * ymax) + (ymax*1.05),pch=21,cex=0.001,col="#00000002")

points(x,log2(data$s+1)*data_i$V2,col="red",pch=21,cex=0.20)
points(x,log2(data$e+1)/weight_e*weight_s*data_i$V2,col="darkgreen",pch=21,cex=0.20)

# Draw intron/exon boundaries
abline(v=x[data$j != 0],col="#0000FF55")
# Draw exon mask
#points(data_i$V1,(data_i$V2*ymax)+(0.015*ymax),pch=24,cex=0.2)

## Windows start end plot - not ideal
##plot(c(0,11*400),c(0,10000),type="n")
#x = rep(data_w$V1, times = 1, length.out = NA, each = 2)[2:(length(data_w$V1)*2)]
#y = rep(data_w$V3, times = 1, length.out = NA, each = 2)[1:(length(data_w$V1)*2)-1]
##lines(data_w$V1,data_w$V3)
#lines(x,log(y),col="red")
##abline(v=(0:10)*400,col="gray")
}



samples=c('7046-004-041','7046-004-043')
gene_names=c('erg','tmprss2','ar','sash1','samd5','klk3','gapdh')
#samples=c('7046-004-041')
#gene_names=c('tmprss2')

for(sample in samples) {
print(sample)
for(gene in gene_names) {
print(gene)
data = paste("data/",sample,"-",gene,"-",sep="")
figure = paste("plots/",sample,"_",gene,".png",sep="")

v1 = paste(data,"vec1_a.tabular.txt",sep="")
v2 = paste(data,"vec2.tabular.txt",sep="")
v3 = paste(data,"vec3.tabular.txt",sep="")

png(figure,width=1250,height=600)
plot_gene(v1,FALSE,v2,v3)
dev.off()

#png("043-erg.png",width=1250,height=600)
#plot_gene("043-erg-vec1_a.tabular.txt","043-erg-vec1_b.tabular.txt","043-erg-vec2.tabular.txt","043-erg-vec3.tabular.txt")
}
}








## windowed start and end position does not seem to matter that much

Loading

0 comments on commit 7429a31

Please sign in to comment.