-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathfindHighestSimilarity.R
191 lines (149 loc) · 7.1 KB
/
findHighestSimilarity.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
##%######################################################%##
#                                                          #
####  For each copy of each hit group, this script      ####
####  finds the copy that has the highest sequence      ####
####  similarity (pID) in every other hit group         ####
####  (assuming homology between copies was detected    ####
####  by blastn).                                       ####
#                                                          #
##%######################################################%##

# This script is launched by stage 12-HTTeventCount.R.
source("HTvFunctions.R")

# The only command-line argument is the number of CPUs to use.
cliArgs <- commandArgs(trailingOnly = TRUE)
nCPUs <- as.integer(cliArgs[1])
# The output will be a table that reports on the homology of copies
# between hit groups; results go in this directory.
# recursive/showWarnings make the script safe to rerun and robust to
# missing parent directories.
dir.create("TEs/clustering/networks", recursive = TRUE, showWarnings = FALSE)

# We import the list of hits (data tables with query id, subject id,
# pID, hit group id) in each super family.
# This list was generated by 12-HTTeventCount.R.
hitList <- readRDS("TEs/clustering/forHighestSimilarity.RDS")

# To find homologous copies, we use the self-blastn files generated at
# stage 8; we list them here.
# NOTE: `pattern` is a regex — the anchored "\\.out$" matches only files
# ending in ".out" (the unescaped ".out" would also match e.g. "Xout").
blastFiles <- list.files(
    path = "TEs/clustering/blastn/done",
    pattern = "\\.out$",
    full.names = TRUE
)

# We split these file names by superfamily (large superfamilies have
# several blast files). For this we extract superfamily names from the
# file names (the part before the first "_").
superF <- splitToColumns(basename(blastFiles), "_", 1)
blastFiles <- split(blastFiles, superF)
# This is the core function that finds similarities between copies of
# different hit groups of a super family.
# Argument: superF, the superfamily name (character scalar).
# Returns: a data.table with columns copy, pID, explanatoryHitGroup,
# focalHitGroup — one row per (copy, other hit group) best match.
# Side effect: writes the same table to
# TEs/clustering/networks/<superF>.similarities.txt for safety.
findSimilarities <- function(superF) {

    # We get the HTT hits of the super family being processed.
    hits <- hitList[[superF]]

    # All the copies involved in these hits (queries and subjects).
    copies <- hits[, unique(c(q, s))]

    # This function imports blast files of copies against themselves.
    import <- function(file) { # file is the path to a blast output
        selfBlast <- fread(
            input = file,
            sep = "\t",
            header = FALSE,
            select = c(1:3),
            col.names = c("query", "subject", "pID"),
            nThread = 2
        )

        # We remove hits not involving copies in the HTT hits.
        selfBlast <- selfBlast[query %in% copies & subject %in% copies]

        # And convert pID to integer for speed and RAM usage
        # (3 decimal places preserved as thousandths).
        selfBlast[, pID := as.integer(pID * 1000)]
        selfBlast
    }

    # We apply the function in parallel for the blast files of the
    # super family, using 5 processes.
    selfBlast <- mclapply(
        X = blastFiles[[superF]],
        FUN = import,
        mc.cores = 5,
        mc.preschedule = FALSE
    )
    selfBlast <- rbindlist(selfBlast) # we rbind the hits in a single table

    # We will need to quickly retrieve all the copies that are
    # homologous to a given one (including itself). For this, we first
    # duplicate the hits by creating reciprocal ones and add self hits
    # with 100% pIDs (100000 in thousandths).
    selfBlast <- rbind(
        selfBlast, # the original hits
        selfBlast[, .(query = subject, subject = query, pID)], # the hits in "reverse"
        data.table(query = copies, subject = copies, pID = 100000L) # and the self hits
    )

    # subjectsForQuery[[x]] returns all copies that have some homology
    # with copy x (x is an integer).
    subjectsForQuery <- reList(split(selfBlast$subject, selfBlast$query))

    # We will also need to quickly retrieve the pID score of all hits
    # involving a query; the principle is the same as above.
    idForQuery <- reList(split(selfBlast$pID, selfBlast$query))

    # As well as the hit group(s) in which each copy is found, following
    # the same principle. But for this, we first have to obtain all ids
    # of copies involved in each hit group.
    copyPerHitGroup <- hits[, .(copy = unique(c(q, s))), by = hitGroup]
    hitGroupsForCopy <- reList(split(copyPerHitGroup$hitGroup, copyPerHitGroup$copy))
    copyPerHitGroup <- split(copyPerHitGroup$copy, copyPerHitGroup$hitGroup)

    # We count the number of subjects per query, used later,
    nSubjects <- lengths(subjectsForQuery)

    # as well as the number of hit groups where each copy is found.
    nHitGroups <- lengths(hitGroupsForCopy)

    rm(selfBlast) # to reclaim RAM, as there are lots of hits

    # For the copies of a given hit group ("focal" hit group), the
    # function below retrieves the best similarity each copy has with
    # every other hit group.
    findSimilaritiesForHitGroup <- function(copies, focalHitGroup) {
        # focalHitGroup is just the integer id of the hit group.

        # We first retrieve all copies that are homologous to those in
        # the hit group (i.e., 'subject' of the blast),
        subject <- unlist(subjectsForQuery[copies])

        # with the pIDs associated with these homologies.
        pID <- unlist(idForQuery[copies])

        # We place them in a data table (which requires replicating
        # copies that are homologous to several "subjects").
        pairs <- data.table(copy = rep(copies, nSubjects[copies]), subject, pID)

        # And we get the hit groups to which these subjects belong.
        explanatoryHitGroup <- unlist(hitGroupsForCopy[pairs$subject])

        # We place all this information in a table. For this, we need
        # the number of hitGroups in which the subjects are found.
        len <- nHitGroups[pairs$subject]

        # We generate the table (replacing the previous one we no
        # longer need).
        pairs <- pairs[, data.table(
            copy = rep(copy, len),
            pID = rep(pID, len),
            explanatoryHitGroup
        )]

        # We don't keep rows involving subjects that belong to the
        # focal hit group.
        pairs <- pairs[explanatoryHitGroup != focalHitGroup]

        # For each copy from focalHitGroup, we retain only the most
        # similar subject per explanatoryHitGroup (highest pID first,
        # then first occurrence wins).
        setorder(pairs, -pID)
        pairs <- pairs[!duplicated(data.table(copy, explanatoryHitGroup))]

        # We now add the focalHitGroup id to the table.
        # Doing it earlier would have increased the memory footprint.
        pairs[, focalHitGroup := focalHitGroup]

        cat(".") # progress indicator
        pairs
    }

    # We apply the above function to all hit groups of the super family.
    similarities <- Map(
        f = findSimilaritiesForHitGroup,
        copies = copyPerHitGroup,
        focalHitGroup = as.integer(names(copyPerHitGroup))
    )

    # We bind the per-hit-group tables once; the result is both written
    # to disk (for safety) and returned.
    # BUG FIX: the original returned rbindlist(hitGroupPairs), but no
    # object named hitGroupPairs exists — the function would error after
    # doing all the work. The intended object is `similarities`.
    allSimilarities <- rbindlist(similarities)

    writeT(
        data = allSimilarities,
        path = stri_c("TEs/clustering/networks/", superF, ".similarities.txt")
    )

    allSimilarities
}
# We apply the function to all super families in parallel; each element
# of `res` is the per-superfamily similarity table returned by
# findSimilarities.
res <- mclapply(
    X = names(hitList),
    FUN = findSimilarities,
    mc.cores = nCPUs,
    mc.preschedule = FALSE
)

# Concatenate all superfamilies into the combined output table.
# (The path is a constant, so no string-building call is needed.)
writeT(
    data = rbindlist(res),
    path = "TEs/clustering/networks/all.hitGroupSimilarities.txt"
)