-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathiterativeSecondClustering.R
219 lines (173 loc) · 8.87 KB
/
iterativeSecondClustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
## %######################################################%##
# #
#### This scripts applies criterion ####
#### 1 to hits from different ####
#### communities, to potentially merge them ####
#### into hit groups (htt events) ####
# #
## %######################################################%##
# This script is launched from 10-hitClusteringRound2.R and uses files generated by that script.
# It shares a lot of code with iterativeFirstClustering.R, but there are subtleties.
# Here, we return statistics for every pair of communities, in particular the number of pairs
# of hits passing criterion 1.
source("HTvFunctions.R")

# the only argument is the number of CPUs to use
args <- commandArgs(trailingOnly = TRUE)
nCPUs <- as.integer(args[1])

# we import the list of hit tables to cluster
hitList <- readRDS("TEs/clustering/round2/hitsFor2ndClustering.RDS")

# STEP ONE, list self-blastn files of copies for each family -------------------

# we retrieve super family names from the list of hits
superF <- splitToColumns(names(hitList), "_", 1)

# number of hits per super family, sorted in decreasing order so that
# super families with more hits are processed first
nHits <- sapply(hitList, nrow)
nHits <- sort(tapply(nHits, superF, sum), decreasing = TRUE)

blastFiles <- list.files(
  path = "TEs/clustering/selectedHits",
  full.names = TRUE
)

# we name the blastFiles elements with super family names (extracted from the
# base file names), which simply requires removing the ".out" file extension
names(blastFiles) <- gsub(
  pattern = ".out",
  replacement = "",
  x = basename(blastFiles),
  fixed = TRUE
)
# STEP TWO, we "confront" every two communities of hits, we do this per super family------------------------------------------------------------
communityPairStats <- function(superFam) {
  # For one TE super family, "confronts" every pair of hit communities:
  # for each community pair, counts the pairs of hits that do / do not pass
  # clustering criterion 1 (some intra-clade identity exceeding the
  # inter-clade identity of a hit). Writes the per-pair statistics to disk
  # and returns them as a data.table with columns com1, com2, allDiff, diff,
  # valid, tot.
  # Reads file-level globals: blastFiles, hitList, superF, nCPUs.

  # we identify the self-blastn output we need, and import it
  blastFile <- blastFiles[superFam]
  blast <- fread(
    input = blastFile,
    sep = "\t",
    header = FALSE,
    drop = 4,
    col.names = c("query", "subject", "pID")
  )

  # we convert pIDs to integers as we did for the htt hits (to save memory)
  blast[, pID := as.integer(pID * 1000L)]

  # here we don't create a matrix of pIDs where subjects and queries are row
  # and column numbers: such a matrix would be too big for this round.
  # Instead we create a unique subject-query identifier, relying on the fact
  # that copy ids are integers both lower than 10^6
  blast[, copyPair := 10^6 * query + subject]

  # we select the groups of hits (one per mrca) for this super family
  groups <- hitList[superF == superFam]

  # the function below processes the hits of a given group; its batches are
  # launched in parallel
  processHitsOfGroup <- function(group) {
    # we copy the hits so that setorder() below does not reorder the
    # caller's table by reference
    hits <- copy(group)

    # we sort these hits per community to ensure that community pairs are
    # always formed in the same "direction" (com1 vs com2, never the opposite)
    setorder(hits, com)

    # the ids of copies involved in the hits to cluster
    uCopies <- hits[, unique(c(query, subject))]

    # we select the TE hits involving these copies. We only need the
    # subject-query pair identifier and the blast pID.
    # Bug fix: the filter previously used "V2", a column name that no longer
    # exists after col.names renamed it; the second column is "subject".
    selectedHits <- blast[query %in% uCopies & subject %in% uCopies, .(copyPair, pID)]

    # we add a copyPair that does not actually exist (with 0 pID) at the end
    # of this table. This trick helps the nomatch lookups below
    selectedHits <- rbind(selectedHits, data.table(copyPair = 0, pID = 0L))

    # as before, we anticipate the number of hit pairs we will have to
    # compare, to create batches that analyse about 2^28 pairs at once
    nHits <- nrow(hits)
    nBatches <- ceiling(nHits^2 / 2^28)

    # we split the hits into several batches
    hitBatches <- splitEqual(1:(nHits - 1L), n = nBatches)

    # this function "connects" hits according to the criterion, similar to
    # what we did in the first round
    criterion_1 <- function(hitBatch) {
      # we again make all possible pairs of hits for this batch
      # (hit1 < hit2, so each unordered pair is evaluated exactly once)
      pairs <- data.table(
        hit1 = rep(hitBatch, nHits - hitBatch),
        hit2 = unlist(lapply(
          X = hitBatch[1]:max(hitBatch) + 1L,
          FUN = function(hit) hit:nHits
        ))
      )

      # we add columns for the ids of copies involved in the 2 hits
      pairs[, c(
        "q1", # query ids (copies from clade A) for hit1
        "s1", # subject ids (copies from clade B)
        "com1", # community id
        "inter1", # the inter-clade identity (representing the HTT)
        "q2", "s2", "com2", "inter2" # and the equivalent for hit2
      )
      := data.table(
        hits[hit1, .(query, subject, com, pID)],
        hits[hit2, .(query, subject, com, pID)]
      )]

      # we create query-subject pair identifiers as we did for the self-blastn
      # output. We must ensure that the left copy id is always lower than the
      # right one (as is the case for the copyPair identifiers in the
      # self-blast files), so we swap the ids where it is not the case
      pairs[q1 > q2, c("q1", "q2") := .(q2, q1)]
      pairs[s1 > s2, c("s1", "s2") := .(s2, s1)]

      # we create the same pair identifiers we created for the blast results
      pairs[, c("qPair", "sPair") := .(
        10^6 * q1 + q2,
        10^6 * s1 + s2
      )]

      # so we can retrieve within-clade sequence identities.
      # The nomatch argument below makes it so that the pID of the last row
      # in selectedHits (which has pID 0) is returned where there is no match
      # (no hit). This speeds things up a little
      pairs[, c("intra1", "intra2") := .(
        selectedHits[match(qPair, copyPair, nomatch = .N), pID],
        selectedHits[match(sPair, copyPair, nomatch = .N), pID]
      )]

      # for hits of a copy with itself (not in the blast outputs), we define
      # 100% identity (equivalent to what we did for the matrix diagonal in
      # the other script)
      pairs[q1 == q2, intra1 := 100000L]
      pairs[s1 == s2, intra2 := 100000L]

      # as before, the 2 hits are connected if the best intra-clade identity
      # is higher than one inter-clade identity (that of the hits)
      pairs[, maxIntra := pmax(intra1, intra2)]
      pairs[, connected := inter1 < maxIntra |
        inter2 < maxIntra]

      cat("*") # progress indicator

      # we return statistics for each community pair
      # (note that a community pair may involve the same community twice,
      # which can help to assess the degree to which hits within a community
      # are connected)
      pairs[, .(
        allDiff = sum(!connected), # number of pairs of hits not passing the criterion
        diff = sum(maxIntra > 0L & !connected), # the same, but for hits whose copies have some degree of intra-clade identity
        valid = sum(maxIntra > 0L), # the number of pairs of hits whose copies have some degree of intra-clade identity
        tot = .N # the total number of pairs of hits evaluated
      ),
      by = .(com1, com2) # all this is done for each community pair
      ]
    }

    # we apply the function in parallel for the different batches
    statsPerCom <- rbindlist(mclapply(
      X = hitBatches,
      FUN = criterion_1,
      mc.cores = min(nCPUs, length(hitBatches)),
      mc.preschedule = FALSE
    ))

    # as the statistics returned by criterion_1() were obtained for separate
    # batches, we need to sum them for the whole group
    statsPerCom[, .(
      allDiff = sum(allDiff),
      diff = sum(diff),
      valid = sum(valid),
      tot = sum(tot)
    ),
    by = .(com1, com2)
    ]
  }

  # applies the above function to the different groups successively
  stats <- rbindlist(lapply(
    X = groups,
    FUN = processHitsOfGroup
  ))

  # writes to disk for safety; this table is also returned by the function
  writeT(
    data = stats,
    path = stri_c("TEs/clustering/round2/", superFam, ".all.txt")
  )
  print(paste("done", superFam))
  stats
}
# we finally apply the above for all super families
# (nHits names correspond to super family names)
superFamilies <- names(nHits)
allStats <- vector("list", length(superFamilies))
for (i in seq_along(superFamilies)) {
  allStats[[i]] <- communityPairStats(superFamilies[i])
}
writeT(rbindlist(allStats), "TEs/clustering/round2/all.txt")