-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path13-TEKaKsWithinGenomes.R
233 lines (167 loc) · 8.91 KB
/
13-TEKaKsWithinGenomes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
## %######################################################%##
# #
#### This stage establishes ka ks rate for pairs ####
#### of TEs copies that ####
#### diverged by transposition within genomes ####
# #
## %######################################################%##
# the principle is to compute Ka and Ks values for related copies within a genomes
# These copies belong to the same "community" of hits, so are unlikely to represent
# different htt events. They would have diverged by transposition within the genome
# This script uses
# - the table of HTT hits between TEs, to obtain the communities to which these hits belong
# - the self-blastn output of TE copies against themselves, generated at stage 8
source("HTvFunctions.R")
# STEP ONE, we select relevant blast hits of TEs to compute ka ks rates.--------------------------------
# We do it per superfamily (for big super families, several blast searches were performed in
# parallel so there are multiple output files)
# we retrieve blastn hits of copies against themselves (generated by
# TEselfBlastn.R at stage 08-prepareClustering.R).
# As these files are very big, we will select only the hits we need.
# We filter the hits separately from Ka Ks computation because it takes some time
# Should be need to redo the Ka Ks computations, we won't have to filter hits again
blastFiles <- list.files(
path = "TEs/clustering/blastn/done",
pattern = ".out",
full.names = T
)
# some output file may be empty (no hits), which could cause an issue during import
blastFiles <- blastFiles[file.size(blastFiles) > 0]
# we extract superfamily names from file names
superF <- splitToColumns(basename(blastFiles), "_", 1)
# we split files names per superfamily
blastFiles <- split(blastFiles, superF)
# we import the copy integer identifiers as these were
# used in the blasts, instead of very long copy names
ids <- ids <- fread("TEs/clustering/selectedCopiesKs05occ200.IDs.txt")
# we retrieve the host species of copies, as we will only compare copies within the same genome
ids[, c("sp", "name") := .(extractSpeciesNames(copy), copyName(copy))]
# We also import the tree to give integer ids (tip numbers) to species, for speed
tree <- read.tree("additional_files/timetree.nwk")
# we make the correspondance between the copy id (index of
# the vector below, as the id is an integer) and species id
spForCopy <- ids[match(1:max(id), id), chmatch(sp, tree$tip.label)]
# we import the HTT hits (including hitgroups that were not retained by
# our criteria in stage 11, just in case we change our filters afterwards)
httHits <- fread("HTThitsAssessed.txt")
# we extract the information we need.
# We need to know the "communities", which are clusters of related copies
# that should have diverged only by transposition events, avoiding potential HTT
copiesInCom <- httHits[, union(q, s), by = com]
# we also make a vector for the correspondance between a copy (its index) and a community (its value)
comForCopy <- copiesInCom[match(1:max(ids$id), V1), com]
dir.create("TEs/TEevolution/selectedHits/", recursive = T) # where selected hits will go
# we will now select the hits we need process self blastn output --------------------------------------------------------------
# the hits must involve copies from the same community and species (genome)
# we do it per super family as there are no hit between super families (no blast)
selectHits <- function(superF) {
import <- function(file) {
hits <- fread(
input = file,
header = F,
drop = c(3, 9),
col.names = c("q", "s", "length", "qStart", "qEnd", "sStart", "sEnd"),
# since the function will be launched in parallel, we don't need several threads
nThread = 1
)
# we can already select hits that are long enough for Ka Ks computations
hits <- hits[length >= 300L]
# and those between copies from the same species
hits <- hits[spForCopy[q] == spForCopy[s] &
# and same community
comForCopy[q] == comForCopy[s], ]
cat(".") # progress indicator (the files are big, this takes time)
hits[,-"length"]
}
# we apply the above function in parallele using 20 processes
# and rbind filtered blast hits from the same super family
rbindlist(mclapply(
X = blastFiles[[superF]],
FUN = import,
mc.cores = 20,
mc.preschedule = F
))
}
# we apply the function for all super families
# which are names of the blastFile list
intraGenomeHits <- rbindlist(lapply(names(blastFiles), selectHits))
# we write the selected hits do disk as a single file
writeT(
data = intraGenomeHits,
path = "TEs/TEevolution/selectedHits/all.out"
)
# STEP TWO, we filter TE hits that cover a protein region of at least 300 bp-----------------------------
# this is to avoid unnecessary ka/ks computations on codon alignments that will be too short
# we import the a blastx file generated earlier (stage 6), listing
# alignment coordinates of copies against repeat proteins
# this one doe not have overlapping HSPs
blastx <- fread("TEKs/blastxOCC2000EXcov1.out")
# we replace copy names by their integer ids, since ids are used in the self blastn files
blastx[, id := ids[chmatch(blastx$copy, name), id]]
# we only retain blastx results for copies involved in our selected hits
# this will speed things up a bit
blastx <- blastx[id %in% httHits[, c(q, s)]]
# as there are several HSPs (alignments) per TE, we get the minimal start
# and maximal end of these, to approximate the protein region covered by a TE
blastxCoords <- blastx[, .(start = min(start), end = max(end)), by = id]
# we add columns of TE hit coordinates where starts always < ends
intraGenomeHits[, c("qSt", "qEn", "sSt", "sEn") := data.table(
pmin(qStart, qEnd),
pmax(qStart, qEnd),
pmin(sStart, sEnd),
pmax(sStart, sEnd)
)]
# we add protein ranges for query and subject in blastn hits
intraGenomeHits[, c("qSprot", "qEprot") := blastxCoords[match(q, id), .(start, end)]]
intraGenomeHits[, c("sSprot", "sEprot") := blastxCoords[match(s, id), .(start, end)]]
# we obtain the region of the TE query that aligns on the TE subject and also aligns on proteins
intraGenomeHits[, c("qSprot", "qEprot") := data.table(intersection(qSt, qEn, qSprot, qEprot, T))]
# same for the subject TE
intraGenomeHits[, c("sSprot", "sEprot") := data.table(intersection(sSt, sEn, sSprot, sEprot, T))]
# we convert the above into query coordinates
intraGenomeHits[, c("qsS", "qsE") := .(qSt + sSprot - sSt, qEn + sEprot - sEn)]
# we estimate the length of the protein part of copies that align with each other
intraGenomeHits[, inter := pmin(qEprot, qsE) - pmax(qSprot, qsS) + 1L]
# selects hits for which this region is at least 300 bp, and the columns we need
# we also retrieve full TE names as these will be needed for Ka Ks computation,
# because these names are those in the fasta file of copy sequences
intraGenomeHits <- intraGenomeHits[inter >= 300L, data.table(q, s,
query = ids[match(q, id), name],
subject = ids[match(s, id), name],
qStart, qEnd, sStart, sEnd
)]
# we rbind these selected hits with the htt hits, although we have already
# computed Ka and Ks rates for the latter. This is because we will use the
# TEKaKs.R script to compute overall distance between copies, which we did
# not do at stage 7 (TEKaKs.R has been modified since)
selectedHits <- rbind(
data.table(intraGenomeHits, htt = F), # we add a logical column to differentiate the two types of hits
httHits[,.(q, s, query, subject, qStart, qEnd, sStart, sEnd, htt = T)]
)
# we prepare the files needed for the TEKaKs.R script
TEhitFile <- "TEs/TEevolution/selectedHits/selected.out"
blastxFile <- "TEs/TEevolution/selectedHits/blastx.out"
# for these tables, we remove columns that are not used in TEKaKs.R
writeT(selectedHits[, -c("q", "s", "htt")], TEhitFile)
writeT(blastx[, -"id"], blastxFile)
# STEP THREE, we compute Ka and Ks rates -------------------------------------------------------------
outFolder <- "TEs/TEevolution/Ks"
dir.create(outFolder)
nCPUs <- 30
system(paste(
"Rscript TEKaKs.R",
TEhitFile, blastxFile,
"TEs/clustering/selectedCopiesKs05occ200.fas", # the fasta file of copy sequences
outFolder, nCPUs
))
# We imports results generated by TEKaKs.R to merge results with the TEhit table
KaKs <- fread(stri_c(outFolder, "/allKaKs.txt"))
selectedHits[, hit := 1:.N] # required for the merged below, as the kaks results have a hit identifier
KaKs <- merge(selectedHits, KaKs, by = "hit")
# we add columns for superfamily names and community ids based on copy ids
ids[, superF := stri_extract_last(copy, regex = "[^#]+")]
KaKs[, c("superF", "com") := .(
ids[match(q, id), superF],
comForCopy[q]
)]
writeT(Kaks[,-"hit"], "TEs/TEevolution/TEKaKsAndDistance.txt")