-
Notifications
You must be signed in to change notification settings - Fork 0
/
differentiation_index.R
174 lines (152 loc) · 9.25 KB
/
differentiation_index.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#########################################
## Differentiation Index (DI) ##
## meassure the directionality of ##
## allelic frequency differentiation ##
#########################################
# written by Hernán E. Morales
# paper: Morales, H. E., Faria, R., Johannesson, K., Larsson, T., Panova, M., Westram, A. M., & Butlin, R. (in press) Genomic architecture of parallel ecological divergence: beyond a single environmental contrast. Science Advances
# preprint: Morales, H. E., Faria, R., Johannesson, K., Larsson, T., Panova, M., Westram, A. M., & Butlin, R. (2018). Genomic architecture of parallel ecological divergence: beyond a single environmental contrast. bioRxiv, 447854.
# Changelog
# INPUT: 4 arguments
# The script takes two files (arguments 1 and 2)
# 1) The first, is the allelic frequencies file with SNPs in rows and pools in columns
# Allelic frequencies are from the reference allele
# Frequencies should laready be filtered by quality and minor allele frequency (downstream)
# The file must contain information about every SNP:
# Linkage Group or chromosome
# Position in cM within the LG or chrom
# The position on LG or chrom can be changed for contig or scaffold if desired to apply CSS to a smaller genomic scale (i.e. one CSS per contig/scaffold)
# example file for Low vs High pools (see Morales et al)
# LG cM Fr_Low Fr_High SPs_High SPs_Low SPn_High SPn_Low SFreqTablen5_C_High SFreqTablen5_C_Low SFreqTablen5_FreqTable_High SFreqTablen5_FreqTable_Low SFreqTablen3_C_High SFreqTablen3_C_Low SFreqTablen3_FreqTable_High SFreqTablen3_FreqTable_Low Ukw_High Ukw_Low Uke_High Uke_Low
# 1: 1 0.657 0.6363636 0.75675676 0.4255319 0.75000000 0.7529412 0.6511628 0.7708333 0.5348837 0.2758621 0.3333333 0.80882353 0.63461538 0.5652174 0.3896104 0.5058824 0.6708861 0.5588235 0.5660377
# 2: 1 0.657 0.6666667 0.75652174 0.3958333 0.73076923 0.7790698 0.6595745 0.7307692 0.5581395 0.2941176 0.3333333 0.82608696 0.67272727 0.3809524 0.3780488 0.5308642 0.6585366 0.5514019 0.6160714
# 3: 1 0.657 0.1891892 0.06818182 0.1102362 0.14754098 0.1844660 0.3095238 0.1774194 0.3296703 0.2500000 0.2321429 0.05882353 0.07843137 0.1923077 0.1666667 0.3982301 0.2087912 0.1487603 0.1848739
# 4: 1 0.657 0.5304348 0.49618321 0.5632184 0.92857143 0.4122807 0.5000000 0.3278689 0.4337349 0.7959184 0.6833333 0.31250000 0.38095238 0.5000000 0.6428571 0.8888889 0.7727273 0.6564885 0.5816327
# ---
# 9997: 1 12.850 0.3504274 0.39230769 0.8349515 0.73737374 0.5754717 0.8974359 0.8679245 0.8571429 0.7755102 0.7021277 0.92000000 0.93750000 0.9565217 0.7717391 0.8469388 0.7941176 0.7591241 0.7090909
# 9998: 1 12.850 0.4000000 0.53488372 0.8800000 0.74038462 0.5128205 0.8611111 0.9107143 0.8873239 0.7826087 0.7272727 0.93750000 0.82051282 0.9615385 0.8085106 0.7289720 0.7826087 0.8181818 0.8073394
# 9999: 1 12.850 0.3238095 0.20161290 0.4421053 0.15686275 0.1698113 0.2682927 0.7857143 0.7164179 0.6585366 0.4705882 0.42105263 0.55813953 0.2500000 0.4421053 0.3203883 0.4179104 0.4587156 0.4392523
# 2) The second file, contains the information about the pools
# This is needed to differentiate the differentiation index in each pool and to assign the reference pool to confer directionality
# example file for the frequency file above
# code shore country
# 1: Fr_Low Low France
# 2: Fr_High High France
# 3: SPn_High High Spain
# 4: SPn_Low Low Spain
# ---
# 16: Uke_High High UK
# 17: Ukw_Low Low UK
# 18: Ukw_High High UK
# 3) Reference group (country in our case): the group of reference used as reference to confer directionality to the Differentiation Index. Here this is assigned by defining the name of country
# 4) Prefix for output file names
# REQUIRED
# R-libraries data.table
# OUTPUT
# A single table with DI values (see at the bottom of script for example)
args <- commandArgs(trailingOnly = TRUE)
## Default setting when no arguments passed
if(length(args) < 4) {
args <- c("--help")
}
## Help section
if("--help" %in% args) {
cat("
Arguments:
--arg1= allelic frequencies file
--arg2= sample infomation file
--arg3= reference group
--arg4= output file prefix
--help - print this text
Example:
Rscript differentiation_index.R FREQ_FILE.txt.gz SampleInfo_FILE.txt.gz Spain out_DI \n\n")
q(save= "no")
}
##################
# Load arguments #
##################
FILE<-args[[1]]
SampleInfo<-args[[2]]
REFcountry<-args[[3]]
OUTprefix<- args[[4]]
##################
# Load libraries #
##################
library(data.table)
##################
# Script #
##################
# read frequency file in termination ".gz" is recognised as compressed file
if(grepl(".gz", FILE)){
FreqTable<-fread(paste("zcat<",FILE),header=T)}else{FreqTable<-fread(FILE,header=T)
}
# read file
SampleInfo<-fread(SampleInfo,h=T)
# define sample sets
ALL_pools<-as.character(SampleInfo$code)
HIGH_pools<-sort(as.character(SampleInfo[shore=="High"]$code))
LOFreqTable_pools<-sort(ALL_pools[!ALL_pools%in%HIGH_pools])
# define reference sets
high_REF<-as.character(SampleInfo[country==REFcountry&shore=="High"]$code)
low_REF<-as.character(SampleInfo[country==REFcountry&shore=="Low"]$code)
# make a new matrix to store results
DI_table<-data.frame(LG=NA,cM=NA,REF_mean=NA,matrix(data = NA,nrow = 1,ncol = length(HIGH_pools)))
names(DI_table)[4:ncol(DI_table)]<-HIGH_pools
DI_table<-DI_table[0,]
# Hybrid Index = HI
i=1
LG<-unique(FreqTable$LG)
# loop over LGs
for(i in LG){
# susbset to LG
FreqTable_LG<-FreqTable[LG==i]
# loop over map positions
cM_position<-unique(FreqTable_LG$cM)
for(j in cM_position){
# susbset to a map position
FreqTable_LG_cM<-FreqTable_LG[cM==j]
# take average for high shore level reference pools
HIGH_pools_table_average<-rowMeans(FreqTable_LG_cM[,high_REF,with=F])
# take average for low shore level reference pools
LOFreqTable_pools_table_average<-rowMeans(FreqTable_LG_cM[,low_REF,with=F])
# get allelic frequency differential
DIFF<-HIGH_pools_table_average-LOFreqTable_pools_table_average
# ask which positions have negative differences
POS<-which(DIFF<0)
# then for those positions take the frequency of the other pool (i.e. 1-p)
if(length(POS)>0){HIGH_pools_table_average[POS]<-1-HIGH_pools_table_average[POS]}
if(length(POS)>0){LOFreqTable_pools_table_average[POS]<-1-LOFreqTable_pools_table_average[POS]}
HIGH_pools_table_average<-rowMeans(FreqTable_LG_cM[,HIGH_pools,with=F])
LOFreqTable_pools_table_average<-rowMeans(FreqTable_LG_cM[,LOFreqTable_pools,with=F])
if(length(POS)>0){HIGH_pools_table_average[POS]<-1-HIGH_pools_table_average[POS]}
if(length(POS)>0){LOFreqTable_pools_table_average[POS]<-1-LOFreqTable_pools_table_average[POS]}
DI_table_x<-data.frame(LG=i,cM=j,REF_mean=(mean(HIGH_pools_table_average)-mean(LOFreqTable_pools_table_average)),
matrix(data = NA,nrow = 1,ncol = length(HIGH_pools)))
names(DI_table_x)[4:ncol(DI_table_x)]<-HIGH_pools
# then for each of the high pools, do the flipping and add its average DI to the table
HIGH_pools_table<-FreqTable_LG_cM[,HIGH_pools,with=F]
LOFreqTable_pools_table<-FreqTable_LG_cM[,LOFreqTable_pools,with=F]
for(z in 1:length(HIGH_pools)){
HIGH_pool_s<-HIGH_pools[z]
LOFreqTable_pool_s<-LOFreqTable_pools[z]
REPLACE<-as.numeric(unlist(1-HIGH_pools_table[POS,z,with=FALSE]))
HIGH_pools_table[POS,names(HIGH_pools_table)[z] := REPLACE]
REPLACE<-as.numeric(unlist(1-LOFreqTable_pools_table[POS,z,with=FALSE]))
LOFreqTable_pools_table[POS,names(LOFreqTable_pools_table)[z] := REPLACE]
COL2<-which(names(DI_table_x)==HIGH_pool_s)
DI_table_x[,COL2]<- (mean(unlist(HIGH_pools_table[,z,with=F]))-mean(unlist(LOFreqTable_pools_table[,z,with=F])))
}
# merge each newly generated line with main table
DI_table<-rbind(DI_table,DI_table_x)
}
}
gz1 <- gzfile(paste(OUTprefix,"_DI_table_REF_",REFcountry,".txt.gz",sep=""), "w")
write.table(DI_table, gz1,row.names = F,quote = F,sep="\t")
close(gz1)
# example of output, the names of the pools are labelled "High" but they should be simply the name of the pool. This extra labell is a left-over from their original names in the Frequency Table
# LG cM REF_mean Fr_High SPn_High SPs_High SWn3_C_High SWn3_W_High SWn5_C_High SWn5_W_High Uke_High Ukw_High
# 1: 1 0.6570 0.04633239 0.0592410839 0.06822184 0.19828067 -3.116543e-04 3.608055e-02 0.0238851079 0.003072025 0.042658124 -0.0141362554
# 2: 1 0.9655 0.02852787 0.0210745941 0.12068976 0.08160229 2.194858e-02 3.484261e-05 0.0216379972 0.000129408 -0.006754697 -0.0036119589
# 3: 1 2.0905 0.02479449 0.0076523380 0.13540482 0.08602227 1.828682e-02 -1.036285e-02 0.0050609639 -0.006713134 -0.013195334 0.0009945203
# 4: 1 2.6565 0.02343077 -0.0078660603 0.10943387 0.06815485 1.894425e-02 6.462131e-03 0.0004925125 0.006616356 -0.007488625 0.0161276361
# ----