-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path07-TEKsAndHTTfilter.R
193 lines (132 loc) · 6.95 KB
/
07-TEKsAndHTTfilter.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
## %######################################################%##
# #
#### This stage computes Ks between TEs for filtered ####
#### TE-TE hits, to select hits that represent HTT. ####
# #
## %######################################################%##
source("HTvFunctions.R")
# this script uses
# - the TE-TE blast hits we selected in the previous stage
selectedHits <- fread("allOCC2000.txt")
# - the blastx hits between TE copies and proteins also generated in the previous stage
# (imported later)
# the output is (intermediate files not listed)
# - a fasta file of TE copies involved in the hits (as it will be used in further stages)
# - the table of filtered TE-TE hits with new columns for Ka and Ks values between copies
# STEP ONE, we extract TE copy sequences --------------------------------------------
# they were actually not extracted in the blastx we preformed prior, as we piped directly from seqtk to diamond
# we generate TE copy names
copies <- selectedHits[, union(
x = stri_c(sp1, ":", query, ":", f1, "#", superF),
y = stri_c(sp2, ":", subject, ":", f2, "#", superF)
)]
sp <- extractSpeciesNames(copies)
# we will write copy names for each species so we can use seqtk to extract them from fastas -------
# so we split copies per species as
copiesPerSpecies <- split(copies, sp)
# where the copy sequences will go
dir.create("TEKs/selectedCopies", recursive = T)
# the file of TE copy names that seqtk will use
fileNames <- stri_c("TEKs/selectedCopies/", names(copiesPerSpecies), ".txt")
# we write these to disk
m <- Map(
f = write,
x = copiesPerSpecies,
file = fileNames
)
# ----------
# we import copy sequences -------------
# we list gzip fasta files of TE copies (generated in stage 2)
fasNames <- stri_c("TEs/copies/", names(copiesPerSpecies), ".TEs.fasta.gz")
# we import selected copies from TE fastas in parallel
seqs <- mcMap(
f = seqtk,
fas = fasNames,
bed = fileNames,
mc.cores = 20,
mc.preschedule = F
)
# we stack these sequence in a single DNAStringset
allSequences <- unlist(seqs, use.names = F)
# and write it to a single fasta
write(allSequences, file = "TEKs/selectedCopiesAA300cl2000.fas")
# STEP TWO, we prepare and launch computations of Ks value of TE hits. -------------------------------------------
# This is based on blastx results of copies against repeat proteins (used by repeat modeler for TE classification)
# for this, we import "raw" results from the blastx performed prior
# which differ from blastx results used to select hits, as those had successive HSPs combined.
blastx <- fread(
input = "TEs/blastx/allRaw.out",
header = F,
sep = "\t",
col.names = c(
"copy", "prot", "pID", "length",
"mismatches", "gapopen", "qStart", "qEnd",
"sStart", "sEnd", "eValue", "score", "qlen"
),
)
# we extract copy names and other info
copyFields <- blastx[, stri_split(copy, fixed = ":", simplify = T)]
# we reconstruct copy names as they were in the selectedHits table
copies <- stri_c(copyFields[, 2], copyFields[, 3], copyFields[, 4], copyFields[, 5], sep = ":")
# we select hits between copies present in the blastn hits
f <- copies %chin% selectedHits[, c(query, subject)]
blastx <- blastx[f]
# we also replace copy names with shorter names matching the TE-TE hits
blastx[, copy := copies[f]]
copyFields <- copyFields[f, ]
# as sub-regions of copies were blasted, we obtains their start coordinates (field 7, generated by seqtk)
starts <- as.integer(splitToColumns(copyFields[, 7], "-", 1))
starts[is.na(starts)] <- 1L
# so we can convert blastx coordinates in original copy coordinates
blastx[, c("qStart", "qEnd") := .(qStart + starts - 1L, qEnd + starts - 1L)]
# we add a column that tells the consistency between the TEs and protein super families
blastx[, ex := expectedProtein(copy, prot)]
# we write the file for safety, but it is not used afterwards
writeT(blastx, "TEKs/blastxOCC2000EX.out")
# we select hits between TEs and rep protein of the same super family, and with sufficient evalue
blastx <- blastx[eValue <= 0.001 & ex == 2L]
#we create new start and end coordinates so that the former is always lower
blastx[, c("start", "end") := .(pmin(qStart, qEnd), pmax(qStart, qEnd))]
# we select regions of TE copies covered by just one hsp, hence for which there is no visible conflict between reading frames.
cov <- blastx[, genomeCov(data.table(copy, start, end), successive = F)]
noOverlappedHSP <- cov[cov == 1L]
# we retrieve the hit covering each of these regions (the row index of the blastx table)
# the new column we add is the row index of the hit in the blastx table
noOverlappedHSP[, hit := assignToRegion(blastx[, data.table(copy, start, end)],
data.table(copy, start))]
# we retrieve the start position of the hit on the protein, and whether the hit was in
# reverse orientation. This is necessary to determine positions in codons later
noOverlappedHSP[, c("protStart", "rev") := blastx[hit, .(qStart, qStart > qEnd)]]
# we remove columns we no longer need
noOverlappedHSP[, c("cov", "hit") := NULL]
writeT(noOverlappedHSP, "TEKs/blastxOCC2000EXcov1.out")
# Ka Ks computations performed via this script :
system("Rscript TEKaKs.R allOCC2000.txt TEKs/blastxOCC2000EXcov1.out TEKs/selectedCopiesAA300cl2000.fas TEKs/ 30")
# STEP THREE, we remove hits that may result from VT based on Ks --------------------------------------------
# we gather results and merge them with the table of selected hits
TEKs <- fread("TEKs/allKaKs.txt")
# we add an integer identifier to the hits, as it was used in TEKaKs.R
selectedHits[, hit := 1:.N]
#this allows merging the Ka Ks results with our table of hits
selectedHits <- merge(
x = selectedHits,
y = TEKs,
by = "hit",
all = T,
suffixes = c("", ".aa")
)
# we write the table for safety, this file is not used afterwards
writeT(selectedHits, "allOCC2000Ks.txt")
# file of filtered BUSCO Ks (200 AA alignments and one score per BUSCO gene per pair of clades) generated in stage 05-coreGeneKs.R
Ks <- fread("Ks200AAnoRedundancy.txt")
# 0.5% quantiles of Ks, per pair of sister clades
KsQuantile <- Ks[, .(q05 = quantile(Ks, 0.005)), by = clade]
# gets the relevant quantile for every hit, that corresponding to the clade pair
qKs <- KsQuantile[match(selectedHits$mrca, clade), q05]
# we select hits that should constitute HTT, according to our criteria
# the ks value + twice the standard deviation must be lower than 99.5% of the core gene Ks
# this value must be computed on at least 100 codons and be lower then 0.5
httHits <- selectedHits[ks + 2 * vks < qKs & length.aa >= 100L & ks < 0.5, ]
# (The removal of all hits between species that diverged in the last 120 My was done quite late in the pipeline.
# it was not considered as needed at this stage. Doing it here would save some computational time, but won't change the results)
writeT(httHits, "occ200Ks05.txt")