-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path09-hitClusteringRound1.R
313 lines (221 loc) · 12 KB
/
09-hitClusteringRound1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
## %######################################################%##
# #
#### This stage clusters TE-TE hits iteratively ####
#### to delinate HTT events ("hit groups") ####
# #
## %######################################################%##
# This stage is the first step where clusters hits from the same pair of "young"
# clades: clades of less than 40 My, in which we did not even look for HTT and
# did not measure any BUSCO Ks. Then we will aggregate the clusters that may
# reflect the same HTT. Clustering and aggregation will be based on criterion 1:
# pID of TEs between clade is lower than pID within clades
source("HTvFunctions.R")
# this script uses:
# - the tabular file of selected TE-TE hit ("HTT hits") from stage 07-TEKsAndHTTfilter.R
httHits <- fread("occ200Ks05.txt")
# - the self-blastn output generated by the previous stage (used later)
# the output will be a tabular file of HTT hits with a new column attributing a community number to each
# STEP ONE, we swap query and subject species in the TE-TE hit table -------------------------------------------------------
# An HTT Involves two clades,
# We compare TE copies within clade to evaluate criterion 1. To do this, it is
# easier to swap sp1 and sp2 so that sp1 always corresponds to the same clade
# (and sp2 to the other) among the two sister clades that diverged from an MRCA.
# So we will compare "queries" (copies from the left-hand clade of the table) with each others
# and "subjects" (copies from the right-hand clade) with each others (not queries
# to subjects, that is already done with the pID in the HTT hits) so for each hit
# between species coalescing to an MRCA, we find its clade (the one among the two
# that diverged from the mrca)
tree <- read.tree("additional_files/timetree.nwk")
# the different MRCAs of the two species involved in each HTT hits
mrcas <- httHits[, unique(mrca)]
# this 2-column table gives the children (subclades, in column V2) of each node (MRCA, column V1)
edges <- data.table(tree$edge)
# these nodes are the subclades of each mrca in the httHits
subClades <- edges[V1 %in% mrcas, V2]
# we retrieve the species of these subclades
subClades <- tipsForNodes(tree, subClades, names = T)
# and the parent node (mrca) of each subclade
subClades[, MRCA := edges[match(node, V2), V1]]
# we combine tip (species) name and MRCA number, to match these combinations in the httHits
subClades[, comb := stri_c(tip, MRCA, sep = " ")]
# We assign subclade numbers (node numbers of the tree) to species in each hit,
httHits[, c("cladeA", "cladeB") := .(
subClades[chmatch(stri_c(sp1, mrca, sep = " "), comb), node],
subClades[chmatch(stri_c(sp2, mrca, sep = " "), comb), node]
)]
# and we ensure that the subclade with lower number is always on the left.
# This requires swapping many columns of the table:
httHits[cladeA > cladeB, c("query", "subject", "sp1", "sp2", "f1", "f2", "qStart", "qEnd", "sStart", "sEnd", "cladeA", "cladeB")
:= .(subject, query, sp2, sp1, f2, f1, sStart, sEnd, qStart, qEnd, cladeB, cladeA)]
# saved to disk
writeT(httHits, "occ200_subClades.txt")
# STEP TWO, we filter the self-blastn hits of TE copies ----------------------------------------------------------------
# to get within-clades pID of TE hits, we use the self-blastn outputs, which are
# huge (hundreds of GBs total). So we filter them to select only the hits we
# need: those between copies of the same (sub)clade, to evaluate criterion 1
# (within-clade pID lower than between-clade pID). We already have between-clade
# pIDs, since these are in the httHits table.
# We filter the blast outputs to generate more manageable files, in case we need to redo
# the clustering several times. The hits will not have to be filtered again
# for this step, we need copy integer identifiers as these were used in the self blast to save space
# we retrieve copy integer IDs
copyIDs <- fread("TEs/clustering/selectedCopiesKs05occ200.IDs.txt")
copyIDs[, name := copyName(copy)]
# we add new columns for integer ids corresponding to both copies in each hit
httHits[, c("q", "s") := .(copyIDs[chmatch(query, name), id], copyIDs[chmatch(subject, name), id])]
# since we will compare only copies within superFamilies and MRCA (the combination of which we call a "group"), we extract them from the table
# we start with the "queries", which are copies from the left-hand clade ("clade A")
queries <- httHits[, unique(q), by = .(superF, mrca)]
# we split these copies for each superfamily and mrca, which we will compare as these are from the same clade
cladeAcopies <- split(queries$V1, queries[, paste(superF, mrca)])
# we will sort groups by decreasing number of copies in queries (for better CPU usage)
nQueryCopies <- lengths(cladeAcopies)
cladeAcopies <- cladeAcopies[order(nQueryCopies, decreasing = T)]
# we do the same for subjects TE copies (from clade B)
subjects <- httHits[, unique(s), by = .(superF, mrca)]
cladeBcopies <- split(subjects$V1, subjects[, paste(superF, mrca)])
cladeBcopies <- cladeBcopies[order(nQueryCopies, decreasing = T)]
# as we will extract self-blast hits per super family, we re-split the TE copies (subjects and queries) lists by super Families
# prior, we extract super family names from copy names and replaces slashes with periods as they will be used to generate file names
superF <- gsub("/", ".", splitToColumns(names(cladeAcopies), " ", 1), fixed = T)
cladeAcopies <- split(cladeAcopies, superF)
cladeBcopies <- split(cladeBcopies, superF)
# and we sort these list by decreasing numbers of copies (we prefer starting with the large ones)
# we obtain the number of copy in each element of the list
nq <- sapply(cladeAcopies, function(x) {
length(unique(unlist(x)))
})
cladeAcopies <- cladeAcopies[order(nq, decreasing = T)]
cladeBcopies <- cladeBcopies[order(nq, decreasing = T)]
# we list the self blast result files generated in set 8-prepareClustering.R
blastFiles <- list.files(
path = "TEs/clustering/blastn/done",
pattern = "",
full.names = T
)
# which we order by decreasing size
blastFiles <- blastFiles[order(file.size(blastFiles), decreasing = T)]
# splits blast files per super families (large superfamilies have several blast outputs)
blastFiles <- split(blastFiles, splitToColumns(basename(blastFiles), "_", 1))
# where the selected hits will go
filteredHitFolder <- "TEs/clustering/selectedHits/"
dir.create(filteredHitFolder)
# this function extracts the relevant TE-TE hits for a given super family
getHits <- function(superF) {
files <- blastFiles[[superF]]
import <- function(file) {
hits <- fread(
input = file,
sep = "\t",
header = F,
# the function will be used in parallel, so we don't need more than 1 CPU here
nThread = 1,
#we only need these columns for the clustering (we don't use the score, but we import it just in case)
select = c(1:3, 9),
col.names = c("query","subject","pID","score")
)
# now select hits between copies of the same clade
selectedHits <- Map(
function(cladeAcopies, caldeBcopies) {
# each hit must be between two copies of clade A
# or between two copies of clade B
hits[(query %in% cladeAcopies & subject %in% cladeAcopies) |
(query %in% caldeBcopies & subject %in% caldeBcopies), ]
},
cladeAcopies[[superF]], cladeBcopies[[superF]]
)
selectedHits <- rbindlist(selectedHits)
# some rare hits may have been selected several times, we remove duplicates
selectedHits <- selectedHits[!duplicated(data.table(query, subject)), ]
# some progress indicator as files can be quite big
cat(".")
selectedHits
}
# we apply the function for each blast file of the super family, in parallel with 20 CPUs
hits <- mclapply(
X = files,
FUN = import,
mc.cores = min(20, length(files)),
mc.preschedule = F
)
# writes a single file of selected hits
writeT(
data = rbindlist(hits),
path = stri_c(filteredHitFolder, superF, ".out"),
)
# progress report
print(paste("done", superF))
NULL
}
# we do it iteratively for each super family.
m <- lapply(names(cladeAcopies), getHits)
# STEP THREE, clustering of hits according to criterion 1----------------------------------------------------
# we cluster hits between species of the same pair of "young" clades that
# diverged within the last 40 My. Within these clades, we did not even search for
# HTT (no blast) and we didn't even measure Ks at busco genes. These clusters of hits will
# be the initial ones (the "communities") that we will aggregate in a second round
# so we assign species involved hits to the young clades
# the young clades of <40 My
clades <- cladesOfAge(tree, 40, withTips = T)
# assigning species to these clades
httHits[, c("C1", "C2") := .(clades[chmatch(sp1, tip), node], clades[chmatch(sp2, tip), node])]
# To cluster hits, we prepare a table with only the columns we need.
# clustering of hits will be done within each "group" (hits involving a given pair of young clades in a super family).
# Hits will be identified by their row indices. We again replace slashes in super family names with periods
httHits[, c("hit", "group") := .(1:.N, stri_c(stri_replace(superF, ".", fixed = "/"), C1, C2, sep = "_"))]
# We get the column we need for the clustering
conv <- httHits[, data.table(hit,
# we convert blast pIDs to integers for efficiency in the clustering
pID = as.integer(pID * 1000L),
group,
query = q,
subject = s
)]
# we split these hits according to "groups"
hitList <- split(conv, conv$group)
hitList <- hitList[order(-sapply(hitList, nrow))]
# if there is just one hit in a group, no clustering needs to be done so we can remove the group
hitList <- hitList[sapply(hitList, nrow) > 1L]
# where results will go
dir.create("TEs/clustering/round1/")
# hitList will be used in the clustering jobs. Below we plan the jobs:
saveRDS(hitList, file = "TEs/clustering/round1/hitsForFirstClusteringKs05occ200.RDS")
groupList <- data.table(
group = names(hitList),
# number of hits to cluster per "group"
nHits = sapply(hitList, nrow),
superF = splitToColumns(names(hitList), "_", 1)
)
# we use the number of hits and groups per super family.
# We'll do the clustering by super families, so that a
# filtered self-blastn output (which concerns a super family)
# needs only be imported once for all the groups within the super family
perSuperF <- groupList[, .(nHits = sum(nHits), .N), by = superF]
# we attribute batches (job numbers) to super families.
# several super families with low number of hits are processed in the same job
perSuperF[, batch := 0L]
perSuperF[nHits > 30000, batch := 1:.N]
# this table will be used to manage jobs in iterativeFirstClustering.R
writeT(perSuperF, "TEs/clustering/round1/batchesKs05occ200.txt")
# example of job number 6 using 5 cpus:
system('sbatch --mail-type=BEGIN,END,FAIL --cpus-per-task=5 --mem=100G --wrap="Rscript iterativeFirstClustering.R 6 5"')
# STEP FOUR, we collect the results of the clustering ---------------------------------------------------------------------
# there is one clustering output per job, we list them and import them
files <- list.files(
path = "TEs/clustering/round1",
pattern = "allGroups.txt",
full.names = T
)
groups <- rbindlist(lapply(files, fread, header = T, sep = "\t"))
# we generate unique integer community ids (accross all groups)
groups[, ucomm := toInteger(string = stri_c(comm,
group,
sep = " "
))]
# we add these identifiers to the HTT hit table
httHits[groups$hit, com := groups$ucomm]
# the com column is NA for hits were not even clustered because there are just one per
# super family and clade pair, so we give them unique community numbers.
httHits[is.na(com), com := 1:.N + max(groups$ucomm)]
# write the new HTT hit table to disk
writeT(httHits, "occ200_comm.txt")