-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path03-findDubiousTEs.R
287 lines (214 loc) · 9.59 KB
/
03-findDubiousTEs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
## %######################################################%##
# #
#### This stage finds TE consensuses (families) ####
#### that may cover non-TE genes ####
# to exclude them as aterfactual #
# #
## %######################################################%##
# We perform a blastx search of all TE consensus sequences generated
# by RepeatModeler against the non-redundant “nr” protein database of
# NCBI, using Diamond
# We do a similar search against a database of known TE proteins.
# We discard a TE consensus (hence family) if (i) it has homology to an nr
# protein over at least 90 amino acids in a region that did not show homology
# with a RepBase protein and if (ii) this nr protein does not show a homology of
# at least 35% over >=100 amino acids with a RepBase protein. Homology between
# proteins is determined by a Diamond blastp search of nr proteins fulfilling
# criterion (i) against TE proteins. For all searches, we ignore alignments of
# e-value < 10−3.
# the script uses
# - the fasta files of repeatmodeler family consensuses
# - the repeat masker database of repeated proteins
# the output is a text file listing names of family consensuses to exclude
source("HTvFunctions.R")
# folder where intermediate and results file will be save
workFolder <- ("TEs/findDubious/")
dir.create(workFolder)
# STEP ONE, we blastx consensus of repeat elements against the ncbi non-redundant database of proteins (with diamond) -----------------------------------
# we thus collect all the species TE consensuses, and put them in a single fasta file
# (we prefer doing a single blast for all species, and we need to add species names to the sequences to keep track of them)
consensusFiles <- list.files(
pattern = "families.fa$",
full.names = T,
recursive = T
)
# we determine the species for each consensus fasta. As for the previous stage, species names must be present in file paths
species <- extractSpeciesNames(consensusFiles)
# this function imports a single fasta file
import <- function(file, sp) {
seqs <- readDNAStringSet(filepath = file)
# we add the species name to each sequence name
setNames(seqs, stri_c(sp, "-", names(seqs)))
}
# we import fasta sequence in parallel with 10 CPUs
seqs <- mcMap(
f = import,
file = consensusFiles,
sp = species,
mc.cores = 10,
mc.preschedule = F
)
# we combine the sequences into a single DNAStringSet
seqs <- do.call(c, seqs)
setwd(workFolder)
# write all consensus sequences to a single fasta
writeXStringSet(seqs, "allTEconsensuses.fas")
# we obtain the nr database (not recommended if you already have it somewhere)
download.file("ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz", "nr.gz")
# we make a diamond database fro it
system("diamond makedb --in nr.gz -d nr -p 10")
# file name of the output of the blastx search
out <- "consensusesOn_nr.out"
# column names of the output (used throughout)
fields <- c("qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "stitle")
# we launch the diamond blastx search:
system(
command = paste(
"diamond blastx -p 1 --more-sensitive -q allTEconsensuses.fas -d nr.dmnd --quiet -k 20 -f 6",
paste(fields, collapse = " "),
"-o",
out
)
)
# processing the hits -----------------------------------------------------
blast_nr <- fread(
input = out,
header = F,
sep = "\t",
col.names = fields
)
# we replace subject id by subject title (description)
blast_nr[, c("sseqid", "stitle") := .(stitle, NULL)]
# remove hits of low evalue
blast_nr <- blast_nr[evalue <= 0.001]
setorder(blast_nr, qseqid, qstart, -qend)
# remove hits nested within others (in the query), as these hits are of lower quality (shorter)
blast_nr[, set := regionSets(data.frame(qseqid, qstart, qend))]
blast_nr <- blast_nr[!duplicated(set)]
# we combine relatively adjacent hits (with a lot of margin). This function also renames columns
combined <- combineHits(
blast = blast_nr,
maxDist = 300,
maxOverlap = 10000,
maxDiff = 300,
blastX = T
)
# puts best hits on top, per query
setorder(combined, query, -score)
# attributes a number to every hit related to the same query (starting at 1)
combined[, hit := occurrences(query)]
# STEP TWO, we blast TE consensuses against repeat_pep proteins -----------------------------
# we use RepeatPeps.lib = the repeat modeler classification fasta file "RepeatPeps.lib"
# it is part of the RepeatModeler package, not installed with this script
# this file should be put in current working directory
# we make a diamond database from it
system("diamond makedb --in RepeatPeps.lib -d repeatPeps")
out <- "consensusesOnRepeatPeps.out"
# we launch the diamond search
system(
paste(
"diamond blastx -p 1 --more-sensitive -q allTEconsensuses.fas -d repeatPeps.dmnd --quiet -k 20 -f 6",
paste(fields, collapse = " "),
"-o",
out
)
)
# we process the hits in the same fashion as the previous step: -------------------------------------------------
blast_repBase <- fread(
input = out,
header = F,
sep = "\t",
col.names = fields
)
blast_repBase <- blast_repBase[evalue <= 0.001]
setorder(blast_repBase, qseqid, qstart, -qend)
blast_repBase[, set := regionSets(data.frame(qseqid, qstart, qend))]
blast_repBase <- blast_repBase[!duplicated(set)]
# replaces subject name by this generic term
blast_repBase[, sseqid := "repProtein"]
# here we combine all hsps of each consensus, regardless of the protein
combinedRep <- combineHits(
blast = blast_repBase,
maxDist = 100000,
maxOverlap = 100000,
maxDiff = 100000,
blastX = T
)
# STEP THREE ----------------------------------------------------------------------------------------------------
# to find consensuses that may not be TEs, we compare their hits on repbase proteins to hits on nr.
# But since there may be several hit on NR per consensus, we do it on a per-hit (not per-consensus) basis.
# The first hit per consensus (query) is processed first, then the second, etc.
# the function below confronts the hit on repbase to the hit on nr for each consensus:
dubious <- function(occ) {
merged <- merge(
x = combined[hit == occ],
y = combinedRep,
by = "query",
suffixes = c("", ".r"),
all = T
)
# we compute the length of the part that aligns on the nr protein before it aligns on the rep protein
merged[, leading := pmin(qStart.r, qEnd.r) - pmin(qStart, qEnd)]
# the length pf the part that aligns on the nr protein after it aligns on the rep protein
merged[, trailing := pmax(qStart, qEnd) - pmax(qStart.r, qEnd.r)]
# we retain pairs of hits where there is no alignment on repbase,
m <- merged[is.na(subject.r) |
# or where there are parts aligning on nr protein not covered by alignment on rep proteins (of at least 90 bp)
leading > 90 | trailing > 90]
m
}
# we apply the above fonctions to hits involving every consensus
dubiousConsensusHits <- lapply(unique(combined$hit), dubious)
dubiousConsensusHits <- rbindlist(dubiousConsensusHits)
# we only retain hits on nr proteins that are not annotated as repeat proteins :
dubiousConsensusHits <- dubiousConsensusHits[!grepl(
pattern = "transpo|retro|reverse|integrase|gag|pol-|pol |rna-dependent|polyprot|mobile|jockey|jerky|setmar|copia|recombinase|crypton|mariner|tcmar|tc1|gypsy|helitron|harbi|piggy|maveri|polinton|academ|ltr|cmc|envelop",
# the keyword list above has been determined after carefull inspection of protein names in the hits
x = subject,
ignore.case = T
)
# and with alignment length >100
& abs(qStart - qEnd + 1) > 100, ]
# we put accession numbers of nr proteins in this new column
dubiousConsensusHits[, acc := splitToColumns(subject, " ", 1)]
# STEP FOUR, we blastp nr proteins similar to TE consensuses against repeat proteins -----------------------------------
# we put their names in a file for seqtk
# (we use all proteins, not just those in the dubiousConsensusHits table. We can afford it)
writeLines(unique(blast_nr$sseqid), "hitted_nrProteins.bed")
# we extract the protein sequences
seqtk(nr.gz, bed, "hitted_nrProteins.fas")
# the futre output of the blastp
out <- "nrProtsOnRepeatPeps.out"
# we launch the search
system(
paste(
"diamond blastp -p 1 -q hitted_nrProteins.fas --more-sensitive -d repeatPeps.dmnd --quiet -k 20 -f 6",
paste(fields, collapse = " "),
"-o",
out
)
)
# processing the hits --------------------------------------
blastp <- fread(out,
header = F,
sep = "\t",
col.names = fields
)
blastp <- blastp[evalue < 0.001]
# again, combining HSPs on the same query
combinedP <- combineHits(
blast = blastp,
maxDist = 1000,
maxOverlap = 100000,
maxDiff = 1000
)
# we get accession number of nr proteins that have an acceptable hit with a rebpase proteins.
# We will consider them as legit TE proteins
repProtAccessions <- combinedP[abs(qEnd - qStart) + 1 >= 100 & identity >= 35,
unique(splitToColumns(query, " ", 1))]
# so we can extract consensuses that hit to other proteins than these
dubiousFamilies <- dubiousConsensusHits[!acc %in% repProtAccessions, unique(query)]
# we write these family names to disk.
write(dubiousFamilies, "familiesToIgnore.txt")
# We still blast these TEs to find HTT.
# This leaves us the possibility to remove these TEs afterwards