-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathsuccessiveBlastx.R
114 lines (86 loc) · 4.09 KB
/
successiveBlastx.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
## %######################################################%##
# #
#### This script blasts TE copies against the database ####
#### of repeat proteins to find protein regions in TEs. ####
# #
## %######################################################%##
# this script is executed at stage 7 of the pipeline
# As a single copy may encompass several proteins (in particular, for
# retrotransposons), we blast copies once to retain only the best hit per copy,
# then we extract TE parts that are not in HSPs, and blast them again. Five
# "rounds" of blast searches are performed. We do that rather than retrieving
# many hits per TE in a single blast because this gives not guaranty to find all
# the proteins of the TE (most of the hits would be likely to involve the same TE
# region that aligns on homologous proteins of repbase).
# but doing so require blasting sub-part of copies, and some particular care
# about blast coordinates , which will have to be converted to coordinates
# in the reference of the original copy
source("HTvFunctions.R")
# the only argument is the number of CPUs to use
args <- commandArgs(trailingOnly = TRUE)
nCPUs <- as.integer(args[1])
setwd("TEs")
# we prepare output folders. The first one is that of blast output for ongoing searches
outFolder <- "blastx/out/"
# and this one is where results will be moved upon completion of the search
doneFolder <- "blastx/done/"
dir.create(doneFolder, recursive = T)
dir.create(outFolder)
# we import bed files of TE copy (or copy parts) to extract f
# rom TE fastas (with seqtk), generated by 07-TEKsAndHTTfilter.R
beds <- list.files("selectedCopies", pattern = ".txt", full.names = T)
species <- unique(extractSpeciesNames(beds))
# generates output file names of finished searches
done <- stri_c(doneFolder, species, ".copies.successiveBlastX.out")
# in case the job needs to be relaunched, avoids redoing searches that were already done
f <- !file.exists(done)
species <- species[f]
done <- done[f]
# database or repeat proteins generated at stage 03-findDubiousTEs.R
db <- ("findDubious/repeatPeps.dmnd")
# below, the function that does the successive blastx searches for a given species. maxRounds will be 5
successiveBlastx <- function(sp, done, maxRounds) {
#we generate file names of TE fasta file and the file listing TE copy names
fas <- stri_c("copies/", sp, ".TEs.fasta.gz")
bed <- stri_c("selectedCopies/", sp, ".txt")
# this empty data table will contains the combined hits of the successive blasts
combined <- data.table()
for (round in 1:maxRounds) {
# generates output file name of current round
out <- stri_c(outFolder, sp, ".", round, ".out")
# this extracts TEs (or TE parts) with seqtk and pipes them to diamond blastx
blast <- seqtkDiamond(fas, bed, db, out)
if (nrow(blast) == 0) {
break
}
# we combine the current hits with any previous ones and process the output (see processBlastX() function)
combined <- rbind(combined, processBlastX(blast))
if (round < maxRounds) {
# extract TE parts not involved in hits (a bed-like table)
parts <- unalignedParts(combined)
if (nrow(parts) == 0) {
break
}
# creates a bed file for these parts
bed <- stri_c("selectedCopies/", sp, ".", round, ".bed")
writeT(parts, bed, col.names = F)
# and repeats
}
}
# writes output to the done/ folder once finished
writeT(combined, done, col.names = F)
return(NULL)
}
# the function is applied in parallel for several species
m <- mcMap(successiveBlastx,
sp = species,
done = done,
maxRounds = 5,
mc.cores = nCPUs,
mc.preschedule = F
)
# we concatenate the output
# these are for the combined outputs of the different rounds
system("cat blastx/done/*.out > blastx/all.copies.successiveBlastx.out")
# and these are the "raw" uncombined outputs
system("cat blastx/out/*.out > blastx/allRaw.out")