-
Notifications
You must be signed in to change notification settings - Fork 1
/
make_gc_bias.R
64 lines (53 loc) · 2.55 KB
/
make_gc_bias.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
## code for GC bias model implemented in Polyester
library(ballgown)
library(polyester)
library(Biostrings)
load('../polyester_paper/experiments/fpkm.rda') #transcript expression data. URL: http://files.figshare.com/1625419/fpkm.rda
load('../polyester_paper/experiments/sequences.rda') #transcript sequences
# here is the code used to make "sequences": (sequences.rda)
# this is the path to this index: ftp://igenome:[email protected]/Homo_sapiens/UCSC/hg19/Homo_sapiens_UCSC_hg19.tar.gz
# (this download will give you up to "Homo_sapiens")
seqpath = '/amber2/scratch/jleek/iGenomes-index/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/'
sequences = seq_gtf(gtfdf, seqpath)
names(sequences) = substr(names(sequences), 2, nchar(names(sequences))-1)
save(sequences, file='sequences.rda')
# calculate GC content:
GC = letterFrequency(sequences, letters='GC', as.prob=TRUE)
o = order(GC)
# filter to reasonable expression
expressed = exprfilter(fpkm, cutoff=1)
# estimate transcript-level counts
txcounts = fpkm_to_counts(expressed, mean_rps=1e7)
# put counts in same order as sequence data
txcounts = txcounts[match(names(sequences), texpr(expressed,'all')$t_name),]
rownames(txcounts) = names(sequences)
# use the same 7 reps we used to estimate fragment distribution
# those reps are:
samples = c("NA06985", "NA18858", "NA20772", "NA12144", "NA20815", "NA12776", "NA20542")
cols_samples = ballgown:::ss(colnames(txcounts), pattern='\\.', slot=2)
counts = txcounts[,cols_samples %in% samples]
counts = counts[GC >= 0.25,] #remove some outliers driving loess
GC = GC[GC >= 0.25]
lcounts = log2(counts+1)
# this code creates Supplementary Figure 1 in the Polyester paper
pdf('GC_effects.pdf', height=12, width=6)
par(mfrow=c(4,2))
for(i in 1:7){
sample_id = ballgown:::ss(colnames(counts)[i], pattern='\\.', slot=2)
plot(GC, lcounts[,i] - mean(lcounts[,i]), col='#00000050', pch=19,
ylab='Difference from average count (log scale)', xlab='GC %',
main=paste0('Model ', i, ': Sample ', sample_id))
loessfit = loess(lcounts[,i] - mean(lcounts[,i]) ~ GC, span=0.3)
lines(GC[o], predict(loessfit)[o], col='deeppink', lwd=3)
legend('topright', lwd=2, col='deeppink', 'loess fit')
}
dev.off()
# This code saves the x,y coords for the 7 loess models as data sets
for(s in samples){
i = which(samples == s)
y = lcounts[,i] - mean(lcounts[,i])
x = GC
eval(parse(text=paste0('loessfit', i, '<- list(x=x, y=y)')))
save(list=paste0('loessfit',i), file=paste0('data/loessfit', i, '.rda'), compress='xz')
# fit = loess(y ~ x, span=0.3)
}