-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathTEKaKs.R
285 lines (210 loc) · 9.84 KB
/
TEKaKs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
##%######################################################%##
# #
#### This script computes pairwise Ka/Ks and ####
#### overal molecular distance between homologous TEs ####
# #
##%######################################################%##
# this script is run at stages 7 and 13 of the pipeline
source("HTvFunctions.R")
require(seqinr)
args <- commandArgs(trailingOnly = TRUE)
# the arguments are
# - the path to the tabular file of HSPs between TE sequences, with columns query, subject, qStart, qEnd, sStart, sEnd
TEhitFile <- args[1]
# - the path to a tabular file of HPSs between these sequences and proteins.
# These HSPs must not overlap on a given TE (see step 7)
# This file has fields: TE copy name, start and end coordinates of HSP on the TE (start < end),
# the start coordinate of the HSP on the protein, and a logical telling whether the HSP is
# in reverse orientation in respect to the TE
# All values in query and subject columns in the first file must be found in the copy column of this file
blastxFile <- args[2]
# - the path to the fasta file containing the TE sequences.
fastaFile <- args[3]
# -folder where results file will go.
outputFolder <- args[4]
# - the number of CPUs to use
nCPUs <- as.integer(args[5])
dir.create(outputFolder, showWarnings = F)
# STEP ONE, preparation of the TE-TE copy alignments ---------------------------------------------------------------
# We import HSP files
TEhits <- fread(TEhitFile)
blastx <- fread(blastxFile)
# we only retain blastx HSPs that involve sequences in the TE-TE hits
blastx <- blastx[copy %chin% TEhits[,c(query, subject)]]
# to keep track of hits, we give them an integer identifier in a column
TEhits[, hit := 1:.N]
# We select HSPs for which there is enough protein regions per copy
# we first compute the total length of HSPs per copy
covPerCopy <- blastx[, sum(end - start + 1L), by = copy]
# the query and subject TEs must have an alignment on proteins longer than 30 bp
# "hits" below are simply row indices of TEhits table
hits <- TEhits[, which(
query %chin% covPerCopy[V1 > 30L, copy] & subject %chin% covPerCopy[V1 > 30L, copy]
)]
# we split the work into several batches (jobs) for parallelism
# The size of a batch of HSPs (4000) must not be too big due to memory constrains.
nJobs <- max(nCPUs, ceiling(length(hits) / 4000))
# we split the HSPs into the batches
hits <- splitEqual(sample(hits), n = nJobs)
# Note that we randomize HPSs (rows) to reduce differences in job duration
# we import the TE sequences
seqs <- readDNAStringSet(fastaFile)
# we modify names to match copy names of the TEhits table (the fasta
# file has longer sequences names that also comprise the host species)
names(seqs) <- copyName(names(seqs))
# we extract the parts of sequences involved in TE hits
qSeqs <- TEhits[, subSeq(seqs[query], qStart, qEnd)]
sSeqs <- TEhits[, subSeq(seqs[subject], sStart, sEnd)]
# we splits these subsequences into batches corresponding to the hit batches to be processed in parallel
qSeqs <- lapply(hits, function(batch) qSeqs[batch])
sSeqs <- lapply(hits, function(batch) sSeqs[batch])
# and we do the same for the hits themselves (we only retain coordinates of hits)
TEhits <- lapply(hits, function(batch) {
TEhits[batch, .(query, subject, qStart, qEnd, sStart, sEnd)]
})
rm(seqs, covPerCopy) # we reclaim some RAM
# STEP TWO, TE-TE pairwise sequence alignment and Ka/Ks computation ----------------------------------------------------------------
# below is the core function that pairwise aligns TEs and computes Ka/Ks on a batch of hits (jobs).
KaKsForJob <- function(job) { # the only argument is the job number
# we align pairs of copies (this uses the Biostrings package)
aln <- alignWithEndGaps(
seq1 = qSeqs[[job]],
seq2 = sSeqs[[job]]
)
# we retrieve the table of coordinates of hits corresponding to this job
TEhitBatch <- TEhits[[job]]
# we determine position of bases within codons --------------------------------
# We split alignments into a table of individual nucleotides and positions
# see function definition in HTvFunctions.R
nuc <- splitAlignment(
aln = aln,
coords = TEhitBatch[, .(query, subject, qStart, qEnd, sStart, sEnd)]
)
# in this script, we also measure some overall molecular distance between copies,
# which we will use later.
# For this, we need to make DNAbins (used by ape)
sequencePairs <- Map(
f = list,
pattern = split(nuc$base1, f = nuc$aln),
subject = split(nuc$base2, f = nuc$aln)
)
sequencePairs <- lapply(sequencePairs, as.DNAbin)
rawDistance <- sapply(sequencePairs, dist.dna, model = "raw")
# Kimura 80 is the model on which the Ka Ks estimate is based on
K80distance <- sapply(sequencePairs, dist.dna, model = "K80")
# to determine the position of bases within codons, we determine
# the blastx HSP that covers each aligned position, for each copy.
# for this we add a new integer column simply denoting the row index of the HSP in the blastx table
# alignment positions outside these HSPs will get NA values
# we begin with the "query" TE
nuc[, hsp1 := assignToRegion(
bed = blastx[, .(copy, start, end)],
pos = .(seq1, pos1)
)]
# then we process the subject TE copy
nuc[, hsp2 := assignToRegion(
bed = blastx[, .(copy, start, end)],
pos = .(seq2, pos2)
)]
# we can already discard positions outside protein HSPs (no evidence that they are in ORFs)
nuc <- nuc[!is.na(hsp1) & !is.na(hsp2)]
# we now determine the position in codon ="frame" (1 to 3 or -1 to -3) of each position
# for this, we need to know the the strand of the protein with respect to the copy
# we thus add two logical column that tell whether the blastx HSPs are in reversed (for query and subject)
nuc[, c("rev1", "rev2") := .(
blastx[hsp1, rev],
blastx[hsp2, rev]
)]
# we determine the frame of the "query" base (base1)
nuc[, frame1 := ifelse(
test = !rev1,
yes = (pos1 - blastx[hsp1, protStart]) %% 3L + 1L,
no = -((blastx[hsp1, protStart] - pos1) %% 3L + 1L)
)]
# then for the subject base (base2)
nuc[, frame2 := ifelse(
test = !rev2,
yes = (pos2 - blastx[hsp2, protStart]) %% 3L + 1L,
no = -((blastx[hsp2, protStart] - pos2) %% 3L + 1L)
)]
# we reverse the frame of the query bases if it is itself reversed in the TE-TE hit
nuc[TEhitBatch[aln, qStart > qEnd], frame1 := -frame1]
# same for the subject
nuc[TEhitBatch[aln, sStart > sEnd], frame2 := -frame2]
# we remove aligned position where the frames (position in codons) don't match
nuc <- nuc[frame1 == frame2]
# as well as columns that are no longer necessary (to free some ram)
nuc[, c("hsp1", "hsp2", "rev1", "rev2", "frame2") := NULL]
# we discard incomplete codons from alignments -----------------------
# Codons will be identified by integers (first one, second one, etc.)
# we first determine that a new codon starts when...
shift <- nuc[, c(
T, # it is the very first base of the table or
# the absolute difference in "frame" between successive bases is not 1, or
abs(diff(frame1)) != 1L |
# the absolute difference in coordinates in the query copy is not 1, or
abs(diff(pos1)) != 1L |
# same for the subject copy
abs(diff(pos2)) != 1L |
# or if we shift to a different alignment
diff(aln) != 0
)]
# each aligned position gets an identifier that differs between codons
codon <- cumsum(shift)
# we retain only complete codons (found 3 times)
nuc <- nuc[codon %in% which(tabulate(codon) == 3L)]
# we prepare, then do, the Ka Ks computations -----------------------------
# we discard alignments that are too short
nuc <- nuc[aln %in% which(tabulate(aln) > 30L)]
# we count the number of substitutions per alignment
nSubstitutions <- nuc[,.(nMut = sum(base1 != base2)), by = aln]
# we concatenate the individual bases back into sequences, in a new table
flattenedSequences <- nuc[, .(
query = stri_flatten(base1),
subject = stri_flatten(base2),
frame = frame1[1] # we need this column below
),
by = aln
]
# we reverse-complement the sequence that are not from the coding strand (frame is negative)
flattenedSequences[frame < 0L, c("query", "subject") :=
.(revCom(query), revCom(subject))]
# we produce seqinr alignment objects that are required for kaks()
alns <- Map(
f = function(seq1, seq2) {
seqinrAlignment(c(seq1, seq2))
},
flattenedSequences$query,
flattenedSequences$subject
)
# and compute Ka and Ks with seqinr
KaKs <- lapply(alns, kaks)
# we stack the results into a data.table
KaKs <- rbindlist(lapply(
X = KaKs,
FUN = as.data.table
))
# we add columns for HSP identifiers and alignment lengths
alnID <- flattenedSequences$aln
KaKs = data.table(
hit = hits[[job]][alnID],
KaKs,
length = nchar(flattenedSequences$query),
nMut = nSubstitutions[match(alnID, aln), nMut],
K80distance = K80distance[alnID],
rawDistance = rawDistance[alnID]
)
# we write results for safety (we write the combined results at the end)
writeT(KaKs, stri_c(outputFolder, "/KaKs.", job, ".txt"))
KaKs
}
# we apply the above function for the different jobs in parallel
res <- mclapply(
X = 1:length(TEhits),
FUN = KaKsForJob,
mc.cores = nCPUs,
mc.preschedule = F
)
writeT(data = rbindlist(res), stri_c(outputFolder, "/allKaKs.txt"))
# if we have reached this step, we may remove intermediate files
unlink(stri_c(outputFolder, "/KaKs.*.txt"))