07-TEKsAndHTTfilter.R


## %######################################################%##
#                                                          #
####  This stage computes Ks between TEs for filtered   ####
####   TE-TE hits, to select hits that represent HTT.   ####
#                                                          #
## %######################################################%##

source("HTvFunctions.R")

# this script uses
# - the TE-TE blast hits we selected in the previous stage
selectedHits <- fread("allOCC2000.txt")

# - the blastx hits between TE copies and proteins also generated in the previous stage
# (imported later)

# the output is (intermediate files not listed)
# - a fasta file of TE copies involved in the hits (as it will be used in further stages)
# - the table of filtered TE-TE hits with new columns for Ka and Ks values between copies


# STEP ONE, we extract TE copy sequences --------------------------------------------
# they were actually not extracted in the blastx we preformed prior, as we piped directly from seqtk to diamond


# we generate TE copy names
copies <- selectedHits[, union(
    x = stri_c(sp1, ":", query, ":", f1, "#", superF),
    y = stri_c(sp2, ":", subject, ":", f2, "#", superF)
)]

sp <- extractSpeciesNames(copies)


# we will write copy names for each species so we can use seqtk to extract them from fastas -------
# so we split copies per species as 
copiesPerSpecies <- split(copies, sp)

# where the copy sequences will go
dir.create("TEKs/selectedCopies", recursive = T)

# the file of TE copy names that seqtk will use
fileNames <- stri_c("TEKs/selectedCopies/", names(copiesPerSpecies), ".txt")

# we write these to disk
m <- Map(
    f = write,
    x = copiesPerSpecies,
    file = fileNames
)
# ----------


# we import copy sequences -------------
# we list gzip fasta files of TE copies (generated in stage 2)
fasNames <- stri_c("TEs/copies/", names(copiesPerSpecies), ".TEs.fasta.gz")

# we import selected copies from TE fastas in parallel
seqs <- mcMap(
    f = seqtk,
    fas = fasNames,
    bed = fileNames,
    mc.cores = 20,
    mc.preschedule = F
)

# we stack these sequence in a single DNAStringset
allSequences <- unlist(seqs, use.names = F)

# and write it to a single fasta
write(allSequences, file = "TEKs/selectedCopiesAA300cl2000.fas")


# STEP TWO, we prepare and launch computations of Ks value of TE hits. -------------------------------------------
# This is based on blastx results of copies against repeat proteins (used by repeat modeler for TE classification)


# for this, we import "raw" results from the blastx performed prior
# which differ from blastx results used to select hits, as those had successive HSPs combined.
blastx <- fread(
    input = "TEs/blastx/allRaw.out", 
    header = F, 
    sep = "\t",
    col.names = c(
        "copy", "prot", "pID", "length", 
        "mismatches", "gapopen", "qStart", "qEnd", 
        "sStart", "sEnd", "eValue", "score", "qlen"
        ),
)

# we extract copy names and other info
copyFields <- blastx[, stri_split(copy, fixed = ":", simplify = T)]

# we reconstruct copy names as they were in the selectedHits table
copies <- stri_c(copyFields[, 2], copyFields[, 3], copyFields[, 4], copyFields[, 5], sep = ":")

# we select hits between copies present in the blastn hits
f <- copies %chin% selectedHits[, c(query, subject)]
blastx <- blastx[f]

# we also replace copy names with shorter names matching the TE-TE hits
blastx[, copy := copies[f]]
copyFields <- copyFields[f, ]

# as sub-regions of copies were blasted, we obtains their start coordinates (field 7, generated by seqtk)
starts <- as.integer(splitToColumns(copyFields[, 7], "-", 1))
starts[is.na(starts)] <- 1L

# so we can convert blastx coordinates in original copy coordinates
blastx[, c("qStart", "qEnd") := .(qStart + starts - 1L, qEnd + starts - 1L)]

# we add a column that tells the consistency between the TEs and protein super families
blastx[, ex := expectedProtein(copy, prot)]


# we write the file for safety, but it is not used afterwards
writeT(blastx, "TEKs/blastxOCC2000EX.out")


# we select hits between TEs and rep protein of the same super family, and with sufficient evalue
blastx <- blastx[eValue <= 0.001 & ex == 2L]

#we create new start and end coordinates so that the former is always lower
blastx[, c("start", "end") := .(pmin(qStart, qEnd), pmax(qStart, qEnd))]

# we select regions of TE copies covered by just one hsp, hence for which there is no visible conflict between reading frames.
cov <- blastx[, genomeCov(data.table(copy, start, end), successive = F)]
noOverlappedHSP <- cov[cov == 1L]

# we retrieve the hit covering each of these regions (the row index of the blastx table)
# the new column we add is the row index of the hit in the blastx table
noOverlappedHSP[, hit := assignToRegion(blastx[, data.table(copy, start, end)], 
                                        data.table(copy, start))]

# we retrieve the start position of the hit on the protein, and whether the hit was in
# reverse orientation. This is necessary to determine positions in codons later
noOverlappedHSP[, c("protStart", "rev") := blastx[hit, .(qStart, qStart > qEnd)]]

# we remove columns we no longer need
noOverlappedHSP[, c("cov", "hit") := NULL]

writeT(noOverlappedHSP, "TEKs/blastxOCC2000EXcov1.out")

# Ka Ks computations performed via this script :
system("Rscript TEKaKs.R allOCC2000.txt TEKs/blastxOCC2000EXcov1.out TEKs/selectedCopiesAA300cl2000.fas TEKs/ 30")


# STEP THREE, we remove hits that may result from VT based on Ks --------------------------------------------

# we gather results and merge them with the table of selected hits
TEKs <- fread("TEKs/allKaKs.txt")

# we add an integer identifier to the hits, as it was used in TEKaKs.R 
selectedHits[, hit := 1:.N]

#this allows merging the Ka Ks results with our table of hits
selectedHits <- merge(
    x = selectedHits,
    y = TEKs,
    by = "hit",
    all = T,
    suffixes = c("", ".aa")
)


# we write the table for safety, this file is not used afterwards
writeT(selectedHits, "allOCC2000Ks.txt")


# file of filtered BUSCO Ks (200 AA alignments and one score per BUSCO gene per pair of clades) generated in stage 05-coreGeneKs.R
Ks <- fread("Ks200AAnoRedundancy.txt")

# 0.5% quantiles of Ks, per pair of sister clades
KsQuantile <- Ks[, .(q05 = quantile(Ks, 0.005)), by = clade]

# gets the relevant quantile for every hit, that corresponding to the clade pair
qKs <- KsQuantile[match(selectedHits$mrca, clade), q05]

# we select hits that should constitute HTT, according to our criteria 
# the ks value + twice the standard deviation must be lower than 99.5% of the core gene Ks
#  this value must be computed on at least 100 codons and be lower then 0.5
httHits <- selectedHits[ks + 2 * vks < qKs & length.aa >= 100L & ks < 0.5, ]

# (The removal of all hits between species that diverged in the last 120 My was done quite late in the pipeline.
# it was not considered as needed at this stage. Doing it here would save some computational time, but won't change the results)


writeT(httHits, "occ200Ks05.txt")