-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path06-filterTEhits.R
220 lines (156 loc) · 8.5 KB
/
06-filterTEhits.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
## %######################################################%##
# #
#### This stage filters the hits ####
#### between TEs of different species ####
# #
## %######################################################%##
# because these hits are so numerous, we filter the blastn outputs
# generated at stage 4 before attempting to stack them in a unique table.
# this scipt uses
# - the blastn output files generated at stage 4
# - Ks values of BUSCO genes obtained at stage 5
# the outputs (not listing intermediate files) is a tabular files of filtered TE-TE blastn hit
source("HTvFunctions.R")
# STEP ONE ----------------------------------------------------------------------------------
# we filter-out all hits whose percentage identity (not Ks) is lower than
# 1 - the 0.5% quantile of Ks of the corresponding clade pair.
# Hits not passing this filter would have been very unlikely to pass downstream filters based on TE Ks
# this stage also also removes any hits involving a TE that belongs to a family
# whose consensus may involve a non-TE gene (see stage 03-findDubiousTEs.R)
# we import the file of core gene Ks generated in stage 05-coreGeneKs.R
Ks <- fread("Ks200AAnoRedundancy.txt")
# we compute the 0.5% Ks quantile for every clade
# we will add this quantile to for every species pair that was blasted
quant <- Ks[, .(q = quantile(Ks, 0.005)), by = clade]
# to assign species to the clade, we obtain the mrca for each pair of species
tree <- read.tree("additional_files/timetree.nwk")
mrcaMat <- mrca(tree)
# we import the table of pairs of species whose TEs were compared by blast, generated in 4-blastTEs.R
pairs <- fread("pairsToBlastn.txt", select = c(1, 2, 8))
# we can now add a column indicating the minimum pID to retain blast hits
pairs[, q := 100 - 100 * quant[match(mrcaMat[cbind(sp1, sp2)], clade), q]]
writeT(pairs, "pairsToFilter.txt")
# we launch the jobs that filters the hits based on the above table, with 10 CPUs
system('sbatch --cpus-per-task=10 --mem=30G --wrap="Rscript filterTEblastnHits.R 10"')
# we import filtered hits generated by the script above.
# these have been collapsed in one tabular file (which is quite big)
TEhits <- fread(
input = "all.quantile005score200.blastn.out",
header = F,
sep = "\t",
col.names = c(
"query", # the TE copy of the first species (sp1)
"subject", # that of the other species (sp2)
"sp1", # the host species of the query
"sp2", # and of the subject
"f1", # the TE family of the query
"superF1", # and its super family
"f2", "superF2", # same as above, for the subject
"pID", "length", "qStart", "qEnd", "sStart", "sEnd", "score" # and standard attributes of the hsp
)
)
# we report the proportion of hits between TEs of the same super family
TEhits[,mean(superF1 == superF2)]
# STEP TWO, we blast TE copies in retained hits against repeat proteins -------------------------------------------
# this is to prepare the computation of Ka/Ks between TEs (at later stages)
# but it is also used to select hits involving an ORF of sufficient length
# we export TE copies involved in hits to blast against rep proteins
# copies are identified by different fields,
copies <- TEhits[, data.table(
sp = c(sp1, sp2), # the host species
seq = c(query, subject), # the copy name (contig + coordinates and orientation)
fam = c(f1, f2), # the TE family
superF = c(superF1, superF2) # and super family
)]
# copies are typically involved in several hits, so we remove duplicates
copies <- copies[!duplicated(seq)]
# we paste the field and separate them with colons, except the super family name
pasted <- copies[, stri_c(sp, ":", seq, ":", fam, "#", superF)]
copiesPerSpecies <- split(pasted, copies$sp)
# we write the selected copy names to files
# copy names are saved in separate files for each species,
# as the blastx will be run in parallel
dir.create("TEs/selectedCopies")
m <- Map(
f = write,
x = copiesPerSpecies,
file = stri_c("TEs/selectedCopies/", names(copiesPerSpecies), ".txt")
)
# we blast copies against the diamond repeatPep database, as we use this to compute Ks between TEs
system('sbatch --cpus-per-task=10 --mem=50G --wrap="Rscript successiveBlastx.R 10"')
# we now use the blastx information to select hits involving protein regions
# we import the combined blastx results generated above
blastx <- fread(
input = "TEs/blastx/all.copies.successiveBlastx.out",
header = T,
sep = "\t",
col.names = c(
"query", "subject", "pID", "length", "mismatch", "gapopen",
"qStart", "qEnd", "sStart", "sEnd", "evalue", "score", "ex"
)
)
# we report the proportion of TE that hit a protein of the same super family
# ex was generated by expectedSubject() defined in HTvFunctions.R
blastx[,mean(ex == 2L)]
# STEP THREE, we select TE-TE hits involving TE protein regions >= 300 bp, -------------------------------------------------------------------
# this is to avoid computing unnecessary Ks (since we require kS computed on ≥300 bp)
# we generate copy names from query names in the blastx file
# (i.e., excluding species and family info), to match the TE-TE hits
blastx[, copy := copyName(query)]
# we compute the first and last position of all alignments on proteins for each copy,
# which we infer as the protein region of each copy (even if it may encompass several ORFs)
# this is only done for proteins that are of the same super family as the copy (ex == 2)
perCopy <- blastx[ex == 2L,
.(
start = min(c(qStart, qEnd)),
end = max(c(qStart, qEnd))
),
by = copy
]
# to select TE-TE hits that cover a protein region that is long enough,
# we need to get the blastn coordinates where starts always < ends
TEhits[, c("qSt", "qEn", "sSt", "sEn") := data.table(
pmin(qStart, qEnd),
pmax(qStart, qEnd),
pmin(sStart, sEnd),
pmax(sStart, sEnd)
)]
# we add portein ranges for query and subject in blastn hits
TEhits[, c("qS", "qE") := perCopy[chmatch(query, copy), .(start, end)]]
TEhits[, c("sS", "sE") := perCopy[chmatch(subject, copy), .(start, end)]]
# we prevent bugs that are caused by NAs in intersection()
TEhits[is.na(qS), c("qS", "qE") := 0L]
TEhits[is.na(sS), c("sS", "sE") := 0L]
# we get the region of the query that aligns on the subjects and also aligns on proteins
TEhits[, c("qS", "qE") := data.table(intersection(qSt, qEn, qS, qE, T))]
TEhits[, c("sS", "sE") := data.table(intersection(sSt, sEn, sS, sE, T))] # same, for the subject
# converts the above into query coordinates
TEhits[, c("qsS", "qsE") := .(qSt + sS - sSt, qEn + sE - sEn)]
# so we can compute the lenght of the TE alignment that also aligns on proteins
TEhits[, inter := pmin(qE, qsE) - pmax(qS, qsS) + 1L]
TEhits[is.na(inter) | inter < 0L, inter := 0L]
# we remove columns we no longer need
TEhits[, c("qSt", "qEn", "sSt", "sEn", "qS", "qE", "sS", "sE", "qsS", "qsE") := NULL]
# we select hits between TEs assigned to the same super families
# and with sufficient protein region (300 bp)
TEhitsCoveringProteins <- TEhits[superF1 == superF2 & inter >= 300L]
# now we only need one superfamily name per hit
setnames(TEhitsCoveringProteins, "superF1", "superF")
TEhitsCoveringProteins[, superF2 := NULL]
writeT(TEhitsCoveringProteins, "all300aaSameSuperF.txt")
# there are still too many hits to compute Ks on.
# We select among redundant hits that may represent the same HTT. ---------------------------------------
# This is based on the fact that many hits share a copy (see Method section).
# we place the best hits on top (longest protein region then higher score)
setorder(TEhitsCoveringProteins, -inter, -score)
# we add an integer column denoting the single-linkage cluster of copies, based on hits
TEhitsCoveringProteins[, cl := clusterFromPairs(query, subject)]
# we add an integer column indicating number of times we have seen a cluster for each a species pair.
# Within a species pair, hits of the same cluster would represent the same transfer and are thus redundant.
TEhitsCoveringProteins[, occ := occurences(data.table(cl, sp1, sp2))]
# we select at most 2000 hits between copies from a given cluster for any species pair
sel <- TEhitsCoveringProteins[occ <= 2000L, ]
# we also add an integer column for the mrca of each species pair, for later use
sel[, mrca := mrcaMat[cbind(sp1, sp2)]]
# and write the filtered hits to disk
writeT(sel, "allOCC2000.txt")