-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathash_allele_analysis_2018.R
492 lines (406 loc) · 24.8 KB
/
ash_allele_analysis_2018.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
if (.Platform$OS.type=="windows"){
setwd("C:/Users/seanmh/Dropbox/Projects/MSB_Kew_ash_Tree_Seed/")
setwd("C:/Users/shoban/Dropbox/Projects/MSB_Kew_ash_Tree_Seed/") }
if (.Platform$OS.type=="unix") {
setwd("/home/user/Dropbox/Projects/MSB_Kew_ash_Tree_Seed/")
#setwd("/media/sean/Windows7_OS/Users/seanmh/Dropbox/Projects/MSB_Kew_ash_Tree_Seed/")
}
library(adegenet); library(diveRsity)
library(parallel)
#library(foreach);
#library(doMC); registerDoMC() #OR..
#library(doParallel); cl<-makeCluster(8); registerDoParallel(cl)
source("Simulations_and_Code/src/sample_funcs.R")
colMax <- function(data) sapply(data, max, na.rm = TRUE)
thresh_freq_H<- 0.20; thresh_freq_L<- 0.05
thisgrid<-"large"
if (thisgrid=="small"){
scenarios_ran<-list.dirs(path = "./Scenarios_Nov_28_2016/10rep_10mark", full.names = TRUE,recursive=F)
SIZE_OF_GRID<-209; NUM_GRID_ROWS<-19; N_REGIONS<-15
#first_row_region<-c(1,3,5,7,9,11,13,15,17); last_row_region<-c(2,4,6,8,10,12,14,16,19)
first_row_region<- c(1,3,5,7,8,9,10,11,12,13,14,15,16,17,18)
last_row_region<- c(2,4,6,7,8,9,10,11,12,13,14,15,16,17,19)
}
if (thisgrid=="medium"){
scenarios_ran<-list.dirs(path = "./Scenarios_Nov_22_medium", full.names = TRUE,recursive=F)
SIZE_OF_GRID<-1215; NUM_GRID_ROWS<-45; N_REGIONS<-15
first_row_region<- c(1,3,6,9,12,15,18,21,24,27,30,33,36,39,42)
last_row_region<- c(2,5,8,11,14,17,20,23,26,29,32,35,38,41,45)
}
if (thisgrid=="large"){
scenarios_ran<-list.dirs(path = "./Scenarios_Nov_22_large", full.names = TRUE,recursive=F)
SIZE_OF_GRID<-4717; NUM_GRID_ROWS<-89; N_REGIONS<-18
first_row_region<- c(1,6,11,16,21,26,31,36,41,46,51,56,61,66,71,76,81,86)
last_row_region<- c(5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,89)
}
scenarios_ran<-list.dirs(path = "./Simulations_and_Code/test2018", full.names = TRUE,recursive=F)
#Set up file path where scenario is
num_scen<-length(scenarios_ran)
num_reps<-1000
#set up results
#Last 4 are MSB, NTSB, 10*MSB, 10*NTSB, TOTAL
summ_results<-array(dim=c(1485+4+1,8,num_scen,num_reps))
colnames(summ_results)<-c("total plants","total populations","G", "GLF", "GR", "RC", "LC", "LR")
type_samp<-c("random","each fr diff reg", "only N 2 rows", "center 2 rows", "only S 2 rows",
"core", "edge", "focus S", "focus N")
rownames(summ_results)<-c(rep(type_samp,each=165),"MSB","NTSB","MSB10","NTSB10","TOTAL")
######################################
#---FILE CHECK AND FILE CONVERSION---#
######################################
#for (scen in 1:length(scenarios_ran)){
# #Check for and remove genind
# gen_files<-dir(path=scenarios_ran[scen],pattern="gen")
# if (length(gen_files)!=0) file.remove(file.path(scenarios_ran[scen],gen_files))
# #convert to genind
# reps_ran_arp<-list.files(scenarios_ran[scen], pattern="arp")
# arp_file_list<-file.path(scenarios_ran[scen],reps_ran_arp,sep="")
# if (.Platform$OS.type=="unix") arp_file_list<-substr(arp_file_list,1,nchar(arp_file_list)-1)
# mclapply(arp_file_list,arp2gen,mc.cores=16)
# #foreach (nrep = 1:length(reps_ran_arp)) %dopar% { arp2gen(arp_file_list[nrep]) } #alternative MC loop
#}
##############################
#---DEFINE REGIONAL MAKEUP---#
#---AND WHERE MSB SAMPLED----#
##############################
scen<-1
region_makeup<-set.regions(scenarios_ran[scen], SIZE_OF_GRID, NUM_GRID_ROWS, N_REGIONS,
first_row_region, last_row_region)
file_MSB_all<-"Data_from_simon/sampled_so_far/2018/sampled_locations_all_2018.csv"
file_NTSB<-"Data_from_simon/sampled_so_far/2018/sampled_locations_NTSB_2018.csv"
MSB_locations<-MSB.samp(get.uk.grid(scenarios_ran[scen], SIZE_OF_GRID, NUM_GRID_ROWS), file_MSB_all)
MSB_locations<-MSB_locations[MSB_locations!=0]
MSB_plant_nums<-read.csv(file_MSB_all)[,4]*10
MSB_plant_nums[MSB_plant_nums>=250]<-240
NTSB_locations<-MSB.samp(get.uk.grid(scenarios_ran[scen], SIZE_OF_GRID, NUM_GRID_ROWS), file_NTSB)
NTSB_locations<-NTSB_locations[NTSB_locations!=0]
NTSB_plant_nums<-read.csv(file_NTSB)[,4]*10
NTSB_plant_nums[NTSB_plant_nums>=250]<-240
#####################
#---MAIN ANALYSIS---#
#####################
for (scen in 1:length(scenarios_ran)){
reps_ran_gen<-list.files(scenarios_ran[scen], pattern="gen")
for (nrep in 1:num_reps){
print(scenarios_ran[scen])
#make a genind (by individual) and genpop (by population)
temp_file_name<-file.path(scenarios_ran[scen],reps_ran_gen[nrep],sep="")
if (.Platform$OS.type=="unix") temp_file_name<-substr(temp_file_name,1,nchar(temp_file_name)-1)
UK_genind<-read.genepop(temp_file_name,ncode=3)
UK_genpop<-genind2genpop(UK_genind)
#--NUMBER OF POPULATIONS, INDIVIDUALS, REGIONS, REGIONAL MAKEUP--#
n_pops<-length(levels(UK_genind@pop))
n_total_indivs<- length(UK_genind@tab[,1])
n_ind_p_pop<-table(UK_genind@pop)
allele_freqs<-colSums(UK_genpop@tab)/(n_total_indivs*2)
#######################
#---#DETERMINE WHAT ALLELES FALL IN WHAT CATEGORIES---#
#######################
allele_cat<-get.allele.cat(UK_genpop, region_makeup, N_REGIONS, n_ind_p_pop)
glob_com<-allele_cat[[1]]; glob_lowfr<-allele_cat[[2]]; glob_rare<-allele_cat[[3]]
reg_com_int<-allele_cat[[4]]; loc_com_int<-allele_cat[[5]]; loc_rare<-allele_cat[[6]]
#######################
#--SUBSAMPLE SECTION
#######################
### LIST OF POPULATIONS SAMPLED
#sample certain number of populations, and individuals, and by region
center_pops_vect<-read.csv("Grids/center_edge/center_pops.txt",sep=",",header=F)[[1]]
edge_pops_vect<-read.csv("Grids/center_edge/edge_pops.txt",sep=",",header=F)[[1]]
type_samp<-c("random","each fr diff reg", "only N 2 rows", "center 2 rows", "only S 2 rows",
"core", "edge", "focus S", "focus N")
l_plt_smp<-length(c(2,seq(5,50, by=5)))
n_pops_to_samp<- rep(c(rep(2,l_plt_smp),rep(5,l_plt_smp),rep(10,l_plt_smp),rep(15,l_plt_smp),
rep(20,l_plt_smp),rep(25,l_plt_smp),rep(30,l_plt_smp),rep(35,l_plt_smp),
rep(40,l_plt_smp),rep(45,l_plt_smp),rep(50,l_plt_smp),rep(55,l_plt_smp),
rep(60,l_plt_smp),rep(65,l_plt_smp),rep(70,l_plt_smp)),length(type_samp)) #rep(n_pops,l_plt_smp),
N_SAMPS_P_POP<-as.list(rep(c(2,seq(5,50, by=5)),(length(type_samp)*length(unique(n_pops_to_samp)))))
POPS_TO_SAMP<-list(); this_slot<-1
for (t in 1:length(type_samp)) for (p in 1:length(unique(n_pops_to_samp))) {
if (p>6) REPLC_N=T; if (p<=6) REPLC_N=F
if (t==1) the_pops<-sample(1:n_pops, unique(n_pops_to_samp)[p])
#t=2 makes list of 1 pop per region then samples n_pops from that
if (t==2) the_pops<-sample(sapply(region_makeup,sample,3), unique(n_pops_to_samp)[p],replace=REPLC_N)
#t=3,4,5 takes the top, middle, & end of the grid, unlists, then samples n_pops from that
if (t==3) the_pops<-sample(unlist(region_makeup[1:2]),unique(n_pops_to_samp)[p],replace=REPLC_N)
if (t==4) {
temp_mid<-length(region_makeup)/2
the_pops<-sample(unlist(region_makeup[(temp_mid-1):(temp_mid+1)]),unique(n_pops_to_samp)[p])
}
if (t==5) the_pops<-sample(unlist(tail(region_makeup)[4:5]),unique(n_pops_to_samp)[p])
if (t==6) the_pops<-sample(center_pops_vect, unique(n_pops_to_samp)[p])
if (t==7) the_pops<-sample(edge_pops_vect, unique(n_pops_to_samp)[p])
if (t==8) {
num_in_focus<-floor(0.75*unique(n_pops_to_samp)[p])
the_pops<-c(sample(unlist(tail(region_makeup)[4:5]),num_in_focus),
sample(1:n_pops, (unique(n_pops_to_samp)[p]-num_in_focus)))
}
if (t==9) {
num_in_focus<-floor(0.75*unique(n_pops_to_samp)[p])
the_pops<-c(sample(unlist(region_makeup[1:2]),num_in_focus, replace=REPLC_N),
sample(1:n_pops, (unique(n_pops_to_samp)[p]-num_in_focus)))
}
for (x in 1:l_plt_smp) { POPS_TO_SAMP[[this_slot]]<-the_pops; this_slot=this_slot+1}
}
for (i in 1:length(POPS_TO_SAMP))
N_SAMPS_P_POP[[i]]<-unlist(rep(N_SAMPS_P_POP[i],length(POPS_TO_SAMP[[i]])))
#This checks the above code- just swap out "34" for larger numbers
#temp_ind<-vector(length=15)
#for (i in 1:15) temp_ind[i]<-(which(UK_grid==POPS_TO_SAMP[[34]][i],arr.ind=T)[1])
#sort(temp_ind)
#This is for the MSB samples
POPS_TO_SAMP[[this_slot]]<-MSB_locations; N_SAMPS_P_POP[[this_slot]]<-MSB_plant_nums/10
N_SAMPS_P_POP[[this_slot]][N_SAMPS_P_POP[[this_slot]][]==1]<-2; this_slot=this_slot+1
POPS_TO_SAMP[[this_slot]]<-NTSB_locations; N_SAMPS_P_POP[[this_slot]]<-NTSB_plant_nums/10
N_SAMPS_P_POP[[this_slot]][N_SAMPS_P_POP[[this_slot]][]==1]<-2; this_slot=this_slot+1
POPS_TO_SAMP[[this_slot]]<-MSB_locations; N_SAMPS_P_POP[[this_slot]]<-MSB_plant_nums
this_slot=this_slot+1
POPS_TO_SAMP[[this_slot]]<-NTSB_locations; N_SAMPS_P_POP[[this_slot]]<-NTSB_plant_nums
UK_genind_sep<-seppop(UK_genind)
for (samp in 1:length(POPS_TO_SAMP)){
alleles<-matrix(nrow=length(POPS_TO_SAMP[[samp]]),ncol=length(allele_freqs))
alleles<-sample.pop(UK_genind_sep,POPS_TO_SAMP[[samp]],N_SAMPS_P_POP[[samp]])
summ_results[samp,1,scen,nrep]<-sum(N_SAMPS_P_POP[[samp]]); summ_results[samp,2,scen,nrep]<-length(N_SAMPS_P_POP[[samp]])
#NOW SEE WHAT IS CAUGHT
#this will record each time an allele is in one of our sampled populations
for (p in 1:n_pops){
caught_glob<-colSums(alleles,na.rm=T)
caught_glob_lowfr<-colSums(alleles,na.rm=T)[glob_lowfr]
caught_glob_rare<-colSums(alleles,na.rm=T)[glob_rare]
caught_reg_com<-colSums(alleles,na.rm=T)[reg_com_int]
caught_loc_com<-colSums(alleles,na.rm=T)[loc_com_int]
caught_loc_rare<-colSums(alleles,na.rm=T)[loc_rare]
}
#as long as its not missed (0) it counts as being caught (could try other minima)
got_G<-sum(caught_glob>=1); got_GLF<-sum(caught_glob_lowfr>=1); got_GR<-sum(caught_glob_rare>=1)
got_RC<-sum(caught_reg_com>=1); got_LC<-sum(caught_loc_com>=1); got_LR<-sum(caught_loc_rare>=1)
summ_results[samp,3,scen,nrep]<-got_G; summ_results[samp,4,scen,nrep]<-got_GLF
summ_results[samp,5,scen,nrep]<-got_GR; summ_results[samp,6,scen,nrep]<-got_RC
summ_results[samp,7,scen,nrep]<-got_LC; summ_results[samp,8,scen,nrep]<-got_LR
}
summ_results[1490,3:8,scen,nrep]<-c(length(allele_freqs),length(glob_lowfr),length(glob_rare),
length(reg_com_int),length(loc_com_int),length(loc_rare))
}
#difference between the scenarios_ran
#apply(summ_results[,,1,1:10]-summ_results[,,2,1:10],c(1,2),mean)
#apply(summ_results[1089:1092,,1,1:num_reps],c(1,2),mean)
}
save(summ_results,file="summ_results_mina_2_14_18_MSB2.Rdata")
#####################################
# RESULTS #
#####################################
#FOR THE CHOSEN SCENARIO
if (.Platform$OS.type=="windows"){
setwd("C:/Users/shoban.DESKTOP-DLPV5IJ/Dropbox/Projects/MSB_Kew_ash_Tree_Seed/")
setwd("C:/Users/shoban/Dropbox/Projects/MSB_Kew_ash_Tree_Seed/") }
if (.Platform$OS.type=="unix") setwd("/home/user/Dropbox/Projects/MSB_Kew_ash_Tree_Seed/")
load(file="summ_results_2_14_18_MSB2.Rdata")
load(file="summ_results_mina_2_16_18_MSB2.Rdata")
type_samp<-c("random","each fr diff reg", "only N 2 rows", "center 2 rows", "only S 2 rows",
"core", "edge", "focus S", "focus N")
type_allele<-c("global", "global low frequency", "global rare", "regional", "local common", "local rare")
num_samp_strat<-length(summ_results[,1,1,1]); num_mom_pop_comb<-11*15
num_trees<-c(2,seq(5,50,5))
num_reps<-1000
###########################################################
# COMPARE SPATIAL STRATEGIES GRAPH #
###########################################################
#The first question is about ranking of the large scale spatial strategies relative to each other
#This uses 'matplot' to plot the different sampling strategies
total_exists<-apply(summ_results[num_samp_strat,3:8,,1:num_reps],1,mean)
pdf(file="compare_spatial.pdf",width=14,height=11)
par(mfrow=c(3,2),oma=c(4,3,4,3),mar=c(3,3,2,2))
#this will show 2 populations (lines 1:11), 20 populations (line 45) and 45 populations (line 100)
#If I wanted 5 populations sampled I would use 12
#and we will focus on allele category 3 (global) and 7 (locally common)
#FOR SUPPLEMENTAL
for (start_pop in c(1,45,100)){
#we need to get, for example, 2 populations for each of the sampling strategies (all nine of them)
#each spatial sampling strategy has 121 (or 11 times 11) slots for all tree/pop combinations
#so essentially need slots 1:11, then (1+121):(11+121) etc. etc. The next three lines get that list
start_each_samp<-seq(start_pop,num_samp_strat-5,by=num_mom_pop_comb)
for (i in 1:(length(num_trees)-1)) unique(start_each_samp<-c(start_each_samp,start_each_samp+1))
some_samps<-sort(unique(start_each_samp))
#For the two allele categories of focus, get the results for that list, make a matrix with
#the spatial strategies as columns and adding more individuals as rows, then plot this
for (i in c(3,7)) {
select_samp<-apply(summ_results[some_samps,i,,1:num_reps],1,mean,na.rm=T)
matplot(matrix(select_samp/total_exists[i-2],nrow=length(num_trees)),type="l",lwd=2,
col=c(rep("black",2),rep("red",3),rep("blue",5)), lty=c(1,2,1,2,3,1,2,3,4,5),
ylab="",xaxt="n",xlab="",cex.axis=1.8)
axis(4,labels=F); axis(1,labels=num_trees, at=1:11,cex.axis=1.8) }
mtext(side=4,line=2,paste(summ_results[start_pop,2,1,1]," populations sampled"),cex=1.4) }
mtext(side=1,line=1.3,"number of trees sampled per population",outer=T,cex=1.4)
mtext(side=2, line=1.3,"proportion of alleles preserved",outer=T,cex=1.4)
mtext(side=3," global alleles locally common alleles",outer=T,cex=2.1)
legend(6,.8,legend=type_samp, col=c(rep("black",2),rep("red",3),rep("blue",5)), lty=c(1,2,1,2,3,1,2,3,4,5), cex=1.3,bg="white", lwd=2)
dev.off()
#this as above but just the global alleles and assumes 40 populations
#FOR MAIN MANUSCRIPT
pdf(file="compare_spatial_global_40.pdf",width=9,height=7)
select_samp<-apply(summ_results[some_samps,3,,1:num_reps],1,mean,na.rm=T)
matplot(matrix(select_samp/total_exists[1],nrow=length(num_trees)),type="l",lwd=2.5,
col=c(rep("black",2),rep("red",3),rep("blue",5)), lty=c(1,2,1,2,3,1,2,3,4,5),
ylab="proportion of alleles captured",xaxt="n",xlab="number of trees sampled",
cex=1.5,cex.axis=1.5,cex.lab=1.5)
legend(6,.71,legend=type_samp, col=c(rep("black",2),rep("red",3),rep("blue",5)), lty=c(1,2,1,2,3,1,2,3,4,5), cex=1.3,bg="white", lwd=2.5)
axis(1,at=1:11,labels=(c(2,seq(5,50, by=5))), cex.axis=1.5)
dev.off()
###########################################################
# COMPARE SPATIAL STRATEGIES TABLE #
###########################################################
#This analysis is to rank the spatial strategies
#NOT REALLY USED IN PAPER- THE PLOT TELLS ALL THAT WE NEED
library("tidyr"); library("reshape2")
#melt and dcast will swap L and R, so have to fix order
total_exists<-apply(summ_results[num_samp_strat,3:8,,1:100],1,mean)
total_exists_ord<-total_exists; total_exists_ord[4:5]<-total_exists[5:6]; total_exists_ord[6]<-total_exists[4]
#first rbind the reps together and cbind a column to add sample strategies to each row
bound_results<-as.data.frame(summ_results[,,1,1])
for (i in 2:100) bound_results<-rbind(bound_results,summ_results[,,1,i])
labels_one_rep<-as.factor(c(rep(type_samp,each=num_mom_pop_comb),"MSB","NTSB","MSB10","NTSB10","TOTAL"))
bound_results<-cbind(rep(labels_one_rep,100),bound_results)
colnames(bound_results)<-c("strat","plants","pops","G","GLF","GR","RC","LC","LR")
melted_results<-gather(bound_results,allcat,alleles,G,GLF,GR,RC,LC,LR,-pops,-plants,-strat)
#first get MSB results
MSB_results<-melted_results[which(melted_results$strat=="MSB"),]; means_MSB<-dcast(MSB_results,strat~allcat,mean)
means_MSB[2:7]/total_exists_ord
#remove MSB and total for now
melted_results<-melted_results[-which(melted_results$strat=="MSB"),]; melted_results<-melted_results[-which(melted_results$strat=="MSB10"),]
melted_results<-melted_results[-which(melted_results$strat=="NTSB"),]; melted_results<-melted_results[-which(melted_results$strat=="NTSB10"),]
melted_results<-melted_results[-which(melted_results$strat=="TOTAL"),]
means_all<-dcast(melted_results,strat~allcat,mean)
ss_ranking<-function(num_plants,num_pops){
spec_mean<-dcast(melted_results[melted_results$plants==num_plants&melted_results$pops==num_pops,],strat~allcat,mean) #50/ 50
rownames(spec_mean)<-spec_mean[,1]; spec_mean<-spec_mean[,-1]
#print(rownames(t(t(spec_mean)/total_exists_ord)[order(spec_mean[,4]),])[5:9])
#identify the best by ranking
write.table(cbind(rownames(t(t(spec_mean)/total_exists_ord)[order(spec_mean[,1]),])[5:9],
t(t(spec_mean)/total_exists_ord)[order(spec_mean[,1]),][5:9]),
file="ss_ranking2.csv", append=T)
write.table( t(t(spec_mean)/total_exists_ord), file="ss_ranking.csv", append=T)
}
#do this for low, medium, high sampling, more pops, more individuals see if ranking changes
temp_plants<-c("2500","625","200","50","250","250"); temp_pops<-c("50","25","10","5","5","50")
for (i in 1:length(temp_plants)) ss_ranking(temp_plants[i],temp_pops[i])
#determine which are significantly different than each other
anov_res<-aov(alleles~strat+plants+pops+allcat,data=melted_results)
anov_res<-aov(alleles~strat,data=melted_results)
Tuk_res<-TukeyHSD(anov_res); Tuk_res$strat[order(Tuk_res$strat[,4]),]
#Interestingly the edge and core dont significantly differ from each other; nearly all the rest differ
barplot(means_all[,2]/total_exists_ord[1],names.arg=means_all[,1],las=2)
###########################################################
# MSB vs. POTENTIAL SAMPLING #
###########################################################
#Compare MSB to other samplings, see which of the strategies did better at proportion of alleles
#and/ or how many samples does it take to beat MSB, with a diff strategy (i.e. north)?
summ_results_ex<-summ_results
means_caught<-apply(summ_results_ex[1:1490,,1,1:1000],c(1,2),mean)
means_caught[1490,1:2]<-1
prop_caught<-t(t(means_caught)/means_caught[1490,])
write.csv(prop_caught,file="prop_caught.csv")
#which sampling strategies could have beaten the MSB in terms of choice of populations
which_beat_MSB<-prop_caught[which(prop_caught[,3]>prop_caught[1487,3]),]
write.csv(which_beat_MSB,"which_beat_MSB_G.csv")
which_beat_MSB<-prop_caught[which(prop_caught[,7]>prop_caught[1487,7]),]
write.csv(which_beat_MSB,"which_beat_MSB_LC.csv")
#to get as much as they did, if only sampling in the south, they would have had to sample
#very hard to capture all the alleles!
pdf(file="local_vs_global_accum.pdf")
plot(prop_caught[1:1093,3],prop_caught[1:1093,7],xlim=c(0,1),ylim=c(0,1), xlab="locally common alleles", ylab="all alleles")
abline(0,1,col="green",lwd=2)
dev.off()
prop_caught[as.numeric(which(prop_caught[,3]>prop_caught[,7])),1:2]
#Get global alleles "faster" for small collections, and local alleles faster with big collections-
#this means it is easier to get all the local common alleles because the accumulation of global ones slows one
###########################################################
# COMPARE TREES VS. POP'NS #
###########################################################
#The question is it better to sample more trees or more populations can partly be answered with a graph
#This makes two plots, one for number of trees on the X axis and one for number of populations on the X axis
#With additional lines being the other variable (populations and trees)
#The following loop will do this for every allele type
for (A in 3:8){
pdf(paste(A,"trees_vs_popns.pdf"), height=5,width=8)
all_mean<-apply(summ_results[,,1,1:1000],c(1,2),mean)
n_pops_samp<- c(2,seq(5,70,by=5)); n_trees_samp<- c(2,seq(5,50,by=5))
all_mean_glob<-matrix(all_mean[1:num_mom_pop_comb,A],nrow=11,dimnames=list(n_trees_samp,n_pops_samp))/all_mean[num_samp_strat,A]
#plots of adding more trees and more populations
par(mfrow=c(1,2),mar=c(5,5,2,1))
plot(all_mean_glob[1,]~colnames(all_mean_glob),type="l",ylim=c(0,1),xlim=c(-4,50),xlab="number of populations",ylab="proportion of genetic variation")
for (t in 1:11) lines(all_mean_glob[t,]~colnames(all_mean_glob))
for (pop_index in c(1:4,6,11)) text(-3,all_mean_glob[pop_index,1],n_pops_samp[pop_index],cex=1.1)
axis(4, at= c(0,.2,.4,.6,.8,1), labels=F)
#plot(all_mean_glob[1,]~colnames(all_mean_glob),type="l",ylim=c(0.4,.9),xlab="number of populations",ylab="proportion of genetic variation")
#for (t in 2:10) lines(all_mean_glob[t,]~colnames(all_mean_glob))
par(mar=c(5,2,2,4))
plot(all_mean_glob[,1]~rownames(all_mean_glob),type="l",ylim=c(0,1),xlim=c(-5,50),xlab="number of trees",yaxt="n")
for (t in 1:11) lines(all_mean_glob[,t]~rownames(all_mean_glob))
for (mom_index in c(1:4,6,11)) text(-3,all_mean_glob[1,mom_index],n_trees_samp[mom_index])
axis(4, at= c(0,.2,.4,.6,.8,1), labels=T); axis(2, at= c(0,.2,.4,.6,.8,1), labels=F)
dev.off()
}
#So the first black bar is 5 trees/ 35 populations, the third red bar is 10 populations/ 35 trees
#Second black bar is 10 trees/ 35 populations, the fourth red bar is 15 populations/ 35 trees
#########################################################################
# GET THE DIMINISHING RETURNS POINT (PLATEAU 1%) #
#########################################################################
#ALSO CALL IT THE STOPPING POINT
#this looks at more trees or more pops
#it will take in the all_mean (all alleles means) matrix
#first for trees it will go through the all_mean matrix and calculate the difference between row r+1 and r
#then will divide this by the number of trees sampled from r+1 to r e.g. divide by 5 or 10 trees
#then this is stored in tree_diff
#then we determine for every sampling group (2 to 50 trees, 11 types) we find the first diff less then thresh
#we do skip the first minimum sampling (2) because it is the diff from the last sampling (50)
#to look at other allele categories just replace the 3]<thresh below with other categories
thresh<-0.001
#first get the proportions
all_mean[,3:8]<-t(t(all_mean[,3:8])/(all_mean[1489,3:8]))
#add a column that is number of plants sampled
all_mean<-cbind(all_mean,all_mean[,1]/all_mean[,2])
diff<-all_mean[1:1489,]
#calculate the difference from the next closest sampling
for (i in 1:length(diff[,1])) for (c in 3:8) diff[i,c]<-(all_mean[i+1,c]-all_mean[i,c])/(all_mean[i+1,9]-all_mean[i,9])
#mean(diff[diff[,3]<0.005,9]) #this is wrong
#Note that all_mean is sorted by number of populations so we can look at difference as we add more trees
#The 99 is 9 sampling spatial strategies * 11 tree possibilities
b<-0; plateau_tree<-vector(length=99)
#Go through each set of 11, identify which of those is less than thresh, take the minimum of that
for (i in 0:98) plateau_tree[i+1]<-diff[1+i*10+i+min(which(diff[(1+i*10+i+1):(11+i*10+i),3]<thresh)),9]
mean(plateau_tree,na.rm=T)
#0.001 tree = 28.1; 0.005 = 11.9; 0.01 = 7.22
#Reorder all_mean by number of trees
all_mean_temp<-all_mean
all_mean_temp<-all_mean_temp[order(rownames(all_mean_temp),all_mean_temp[,9]),]
diff<-all_mean_temp[1:1489,]
#calculate the difference from the next closest sampling
for (i in 1:length(diff[,1])) for (c in 3:8) diff[i,c]<-(all_mean_temp[i+1,c]-all_mean_temp[i,c])/(all_mean[i+1,1]-all_mean[i,1])
#so we can look at difference as we add more populations
#The 135 is 9 sampling spatial strategies * 15 tree possibilities
b<-0; plateau_pop<-vector(length=135)
for (i in 0:134) plateau_pop[i+1]<-diff[1+i*10+i+min(which(diff[(1+i*10+i+1):(11+i*10+i),3]<thresh)),2]
mean(plateau_pop,na.rm=T)
#0.005 = 17.1 populations, 0.001 = 20.4 populations
##########
#JUNKYARD
##########
#separate data into single populations
#UK_genind_sep<-lapply(seppop(UK_genind), function(x) x[sample(1:nrow(x$tab), 10)])
#this will look at what is captured in a given population
#sum(as.vector(colSums(UK_genind_sep[[39]]@tab)[glob_com])==0)
#sum(as.vector(colSums(UK_genind_sep[[39]]@tab)[glob_lowfr])==0)
#sum(as.vector(colSums(UK_genind_sep[[39]]@tab)[glob_rare])==0)
#but really I want what is captured in all sampled populations
#pooled_data<-repool(UK_genind_sep)
#sum(as.vector(colSums(pooled_data_north@tab)[glob_com])==0)
#this doesn't work to repool, because it makes new allele counts
#colSums lists all alleles and the number of populations (N) having 0 copies of that allele
#we then pull out the indices (which) of alleles for which N = number of populations - 1
#(only one pop'n does not have 0 copies)
these<-as.vector(which(colSums(UK_genpop@tab<12)==(n_pops-1)))
#then take each of these alleles and sort the counts per population- how many counts are in that pop'n
sapply(these, function(x) sort(UK_genpop@tab[,x]))
big_enough<-table_counts[39,]>=30
these<-as.vector(which(colSums(UK_genpop@tab<3)==(n_pops-1)))
sapply(these, function(x) sort(UK_genpop@tab[,x]))
#CYCLE THROUGH THIS FROM 0 TO ABOUT 14, record the population and the allele, its count in that pop, and count in next pop
sum(as.vector(colSums(north_genpop@tab)[glob_lowfr])==0)