-
Notifications
You must be signed in to change notification settings - Fork 20
/
pcaStructure.R
67 lines (57 loc) · 1.98 KB
/
pcaStructure.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
inf=commandArgs(T)
#setwd("~/Dropbox/pipelines2020/auto_ok")
#inf="myresult2.ibsMat"
require(vegan)
ma = as.matrix(read.table(inf))
# reading bams, removing file name trash
bams=scan("bams.nr",what="character") # list of bam files
bams=sub(".bam","",bams)
bams=sub("/.+/","",bams)
dimnames(ma)=list(bams,bams)
# reading sequencing depths (proportion of sites with coverage >5x, for each sample - output of plotQC.R)
qc=read.table("quality.txt",header=F)
names(qc)=c("srr","q")
qc$srr=sub(".bam","",qc$srr)
row.names(qc)=qc$srr
qc=qc[bams,]
# partial PCoA, taking out the variation due to coverage
pp0=capscale(ma~1+Condition(qc$q))
# removing outlier samples by pc1
pc1=scores(pp0,display="sites",choices=1)
goods=bams[which(pc1>=quantile(pc1,0.05) & pc1<=quantile(pc1,0.95))]
ma=ma[goods,goods]
qc=qc[goods,]
#length(goods)
pp0=capscale(ma~1+Condition(qc$q))
pdf(file=paste(inf,"_pcoa_eigens.pdf",sep=""),height=3.8,width=12)
par(mfrow=c(1,3))
plot(pp0)
# function to calculate how much larder is the dropoff after MDS1+MDS2/2 compared to dropoff in the mid-MDS region
mds1.signal=function(x) {
N=length(x)
nulls=round(c(N/2-2,N/2+2),0)
slope1=(x[1]-x[2])
slope0=(x[nulls[1]]-x[nulls[2]])/4
return(slope1/slope0)
}
plot(pp0$CA$eig/sum(pp0$CA$eig))
# jackknifing the MDS signal (80% resampling)
mdss=c()
for (i in 1:499) {
choose=sample(1:nrow(ma),round(nrow(ma)*0.8,0))
ma1=ma[choose,choose]
qc1=qc[choose,]
pp1=capscale(ma1~1+Condition(qc1$q))
eigsn=pp1$CA$eig/sum(pp1$CA$eig)
good=mds1.signal(pp1$CA$eig)
bs=mds1.signal(bstick(length(pp1$CA$eig)))
# signal of structure = log10 of ratio of real to broken stick MDS1 signals
mdss=c(mdss,log(good/bs,10))
}
plot(density(mdss),main="log10 ratio of MDS1-2 signal \n to BS model signal, 80% jackknife")
abline(v=median(mdss),col="red")
mtext(paste("median = ",round(median(mdss),2)),side=3,cex=0.8)
dev.off()
save(eigsn,pp1,mdss,file=paste(inf,"_pcoa_eigens.RData",sep=""))
# min, 25%q, median, mean,75%q, max
cat(round(summary(mdss),3))