## %######################################################%##
# #
#### This stage prepares the clustering of ####
#### TE hits to count HTT events. ####
# #
## %######################################################%##
# In this stage, we estimate the sequence identity between all TE copies within
# all super families with an all-vs-all blastn (referred to as "self blast"
# afterwards). This will be used to determine whether two hits represent the
# same HTT (the identity at the hits being lower than the identity between
# copies within each clade, referred to as "criterion 1" in the paper), and
# also for the analysis of TE evolution within genomes.
# This script uses:
# - the TE-TE hits tabular file we retained in the previous stage
# - the fasta file of TE copies involved in the hits, from the previous stage
# The output is the blastn hits (tabular format) between all TE copies within each super family.
source("HTvFunctions.R")
# where output files will go
dir.create("TEs/clustering")
# selected TE-TE hits from the previous stage
httHits <- fread("occ2000ds05.txt")
# STEP ONE, we reduce the number of hits ---------------------------------------------------------
# there are too many hits for this clustering, so we only retain 200 hits
# per single-linkage cluster, as hits from the same cluster are very likely
# to represent the same HTT
# we favor hits that involve the longest protein-coding regions
setorder(httHits, -length.aa)
httHits <- httHits[occ <= 200L, ]
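# (occ is assumed here to be the within-cluster rank of each hit computed in
# the previous stage, so that the filter above keeps at most 200 hits per
# single-linkage cluster)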
# and write them to file
writeT(httHits, "occ200ds05.txt")
# STEP TWO, we extract TE copies sequences in the retained hits ----------------------------------
# we add species and classification information to copy names, as specified in the fasta file of TE copies, so we can extract them
copies <- httHits[, union(
    x = stri_c(sp1, ":", query, ":", f1, "#", superF),
    y = stri_c(sp2, ":", subject, ":", f2, "#", superF)
)]
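# the composed names follow the pattern <species>:<copy>:<family>#<superFamily>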
# we write them to a temporary file used by seqtk
write(copies, "temp.bed")
# and we extract these sequences into a new fasta file
seqtk(
    fas = "TEKs/selectedCopiesAA300cl2000.fas",
    bed = "temp.bed",
    out = "TEs/clustering/selectedCopiesKs05occ200.fas"
)
file.remove("temp.bed")
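# for reference, a minimal sketch of what the seqtk() helper sourced from
# HTvFunctions.R presumably does (an assumption, shown under a hypothetical
# name so as not to mask the real helper): the real "seqtk subseq" command
# extracts the sequences whose names are listed, one per line, in its
# second argument
seqtkSubseqSketch <- function(fas, bed, out) {
    system(paste("seqtk subseq", fas, bed, ">", out))
}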
# STEP THREE, we prepare and launch the self blastn of the copies extracted above, within each super family ------------------------------------------
# we rename copies with integers, to reduce the size of the blastn output files and improve speed in later stages
seqs <- readDNAStringSet("TEs/clustering/selectedCopiesKs05occ200.fas")
copies <- names(seqs)
# we extract super family names and replace slashes with periods, as file names will contain super family names
superF <- gsub("/", ".", stri_extract_last(copies, regex = "[^#]+"), fixed = T)
# we assign integer ids to copies, which we will use instead of the long copy names
copyID <- data.table(copy = copies, id = 1:length(copies))
# we write this correspondence to disk as it will be reused many times afterwards
writeT(copyID, "TEs/clustering/selectedCopiesKs05occ200.IDs.txt")
# we replace the copy names with these integers in the sequence set
names(seqs) <- copyID[chmatch(names(seqs), copy), id]
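# (chmatch() is data.table's fast match() for character vectors: the line
# above looks up each sequence name in copyID and retrieves its integer id)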
dir.create("TEs/clustering/blastn/db", recursive = T)
# we split copy sequences by super families, as blastn searches will be done within super families
all.db <- split(seqs, superF)
# we generate fasta file names and write one fasta per super family
fasNames <- stri_c("TEs/clustering/blastn/db/", names(all.db), ".fas")
m <- Map(writeXStringSet, all.db, fasNames)
# we build blast databases
m <- lapply(fasNames, function(x) {
    system(paste("makeblastdb -dbtype nucl -in", x))
})
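# (makeblastdb writes its index files next to each input fasta, so the
# databases can later be referenced by the .fas paths)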
# we will split query files for better CPU usage (the num_threads option of blastn isn't very efficient, so we leave it at 1)
# and for better management of jobs + outputs (which are very big)
# where split query fastas will go
dir.create("TEs/clustering/blastn/split")
# temporary blastn output files will go there
dir.create("TEs/clustering/blastn/out")
# and output files of completed searches will go there
dir.create("TEs/clustering/blastn/done")
# the number of sequences per super family, which we use to decide how finely we split the queries
nD <- as.numeric(lengths(all.db))
# the criterion is the number of query sequences times the number of db
# sequences, which is nD squared since query and db are the same for a self blast
nComp <- data.table(
    query = names(all.db),
    nc = nD * nD
)
# we split each query proportionally to nc, so that no query is split into more than 200 parts (the largest super family gets exactly 200)
nComp[, nt := round(nc / (max(nc) / 200))]
nComp[nt == 0, nt := 1L]
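# worked example: if the largest super family has nc = 1e8, the divisor is
# 1e8 / 200 = 5e5, so a family with nc = 2e5 gets round(0.4) = 0 parts,
# bumped to 1 by the line above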
# for each sequence, we create an integer index corresponding to its sub-query (at most 200 per super family)
indices <- unlist(Map(function(x, y) {
    rep(1:x, length.out = y)
}, nComp$nt, nD))
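# small worked example of the round-robin assignment above:
# rep(1:3, length.out = 7) returns 1 2 3 1 2 3 1, spreading 7 sequences as
# evenly as possible over 3 sub-queries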
# this index is used to assign each sequence to a sub-query file (part)
queryParts <- stri_c(
    "TEs/clustering/blastn/split/",
    rep(nComp$query, nD),
    "_",
    indices,
    ".fas"
)
# so we split the copy sequences into these sub-queries
splitSequences <- split(unlist(all.db, use.names = F), queryParts)
# and save them to fastas
l <- Map(writeXStringSet, splitSequences, names(splitSequences))
# we derive the paths of the blast databases corresponding to the split queries
dbs <- gsub(
    pattern = "split",
    replacement = "db",
    x = stri_extract_last(
        str = names(splitSequences),
        regex = "[A-Z]+.[^_]+"
    )
)
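# example of the extraction above: on
# "TEs/clustering/blastn/split/LINE.L1_3.fas", the regex greedily matches
# "TEs/clustering/blastn/split/LINE.L1" (everything up to the "_" preceding
# the part index), and the gsub() then turns "split" into "db", recovering
# the database path "TEs/clustering/blastn/db/LINE.L1"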
# we create a table listing the blast searches to do
searches <- data.table(query = names(splitSequences), db = stri_c(dbs, ".fas"))
searches[, out := gsub("split", "out", query)]
searches[, out := gsub(".fas", ".out", out)]
# we assign the searches to batches of about 30, each batch to be launched from a given R instance on a node (using sarray would have been better)
searches[, batch := rep(1:(.N / 30), length.out = .N)]
writeT(searches, "copiesToSelfBlast.txt")
# we now run the script for the self-blastn searches. This is done in batches.
# These use a lot of RAM and take quite a bit of time, as we don't limit the number of hits
# example for batch 1
system('sbatch --mail-type=BEGIN,END,FAIL --cpus-per-task=4 --mem=120G --wrap="Rscript TEselfBlastn.R 1 4"')
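# a sketch for submitting every batch (assuming the same resources suit all
# of them; not run by default):
# for (b in unique(searches$batch)) {
#     system(stri_c(
#         'sbatch --mail-type=BEGIN,END,FAIL --cpus-per-task=4 --mem=120G ',
#         '--wrap="Rscript TEselfBlastn.R ', b, ' 4"'
#     ))
# }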