06-filterTEhits.R



## %######################################################%##
#                                                          #
####            This stage filters the hits             ####
####          between TEs of different species          ####
#                                                          #
## %######################################################%##

# because these hits are so numerous, we filter the blastn outputs
# generated at stage 4 before attempting to stack them in a unique table.

# this scipt uses
# - the blastn output files generated at stage 4 
# - Ks values of BUSCO genes obtained at stage 5

# the outputs (not listing intermediate files) is a tabular files of filtered TE-TE blastn hit

source("HTvFunctions.R")

# STEP ONE ----------------------------------------------------------------------------------
# we filter-out all hits whose percentage identity (not Ks) is lower than
# 1 - the 0.5% quantile of Ks of the corresponding clade pair.
# Hits not passing this filter would have been very unlikely to pass downstream filters based on TE Ks
# this stage also also removes any hits involving a TE that belongs to a family
# whose consensus may involve a non-TE gene (see stage 03-findDubiousTEs.R)

# we import the file of core gene Ks generated in stage 05-coreGeneKs.R
Ks <- fread("Ks200AAnoRedundancy.txt")

# we compute the 0.5% Ks quantile for every clade
# we will add this quantile to for every species pair that was blasted
quant <- Ks[, .(q = quantile(Ks, 0.005)), by = clade]

# to assign species to the clade, we obtain the mrca for each pair of species
tree <- read.tree("additional_files/timetree.nwk")

mrcaMat <- mrca(tree)


# we import the table of pairs of species whose TEs were compared by blast, generated in 4-blastTEs.R
pairs <- fread("pairsToBlastn.txt", select = c(1, 2, 8))

# we can now add a column indicating the minimum pID to retain blast hits
pairs[, q := 100 - 100 * quant[match(mrcaMat[cbind(sp1, sp2)], clade), q]]

writeT(pairs, "pairsToFilter.txt")

# we launch the jobs that filters the hits based on the above table, with 10 CPUs
system('sbatch --cpus-per-task=10 --mem=30G --wrap="Rscript filterTEblastnHits.R 10"')


# we import filtered hits generated by the script above.
# these have been collapsed in one tabular file (which is quite big)
TEhits <- fread(
    input = "all.quantile005score200.blastn.out",
    header = F,
    sep = "\t",
    col.names = c(
        "query",         # the TE copy of the first species (sp1) 
        "subject",       # that of the other species (sp2)
        "sp1",           # the host species of the query
        "sp2",           # and of the subject
        "f1",            # the TE family of the query
        "superF1",       # and its super family
        "f2", "superF2", # same as above, for the subject
        "pID", "length", "qStart", "qEnd", "sStart", "sEnd", "score"  # and standard attributes of the hsp
    )
) 

# we report the proportion of hits between TEs of the same super family
TEhits[,mean(superF1 == superF2)]


# STEP TWO, we blast TE copies in retained hits against repeat proteins -------------------------------------------
# this is to prepare the computation of Ka/Ks between TEs (at later stages)
# but it is also used to select hits involving an ORF of sufficient length

# we export TE copies involved in hits to blast against rep proteins
# copies are identified by different fields,
copies <- TEhits[, data.table(
    sp = c(sp1, sp2),        # the host species
    seq = c(query, subject), # the copy name (contig + coordinates and orientation)
    fam = c(f1, f2),         # the TE family
    superF = c(superF1, superF2)  # and super family
)] 

# copies are typically involved in several hits, so we remove duplicates
copies <- copies[!duplicated(seq)]

# we paste the field and separate them with colons, except the super family name
pasted <- copies[, stri_c(sp, ":", seq, ":", fam, "#", superF)]
copiesPerSpecies <- split(pasted, copies$sp)

# we write the selected copy names to files
# copy names are saved in separate files for each species,
# as the blastx will be run in parallel
dir.create("TEs/selectedCopies")

m <- Map(
    f = write,
    x = copiesPerSpecies,
    file = stri_c("TEs/selectedCopies/", names(copiesPerSpecies), ".txt")
)

# we blast copies against the diamond repeatPep database, as we use this to compute Ks between TEs

system('sbatch --cpus-per-task=10 --mem=50G --wrap="Rscript successiveBlastx.R 10"')


# we now use the blastx information to select hits involving protein regions 

# we import the combined blastx results generated above
blastx <- fread(
    input = "TEs/blastx/all.copies.successiveBlastx.out",
    header = T,
    sep = "\t",
    col.names = c(
        "query", "subject", "pID", "length", "mismatch", "gapopen",
        "qStart", "qEnd", "sStart", "sEnd", "evalue", "score", "ex"
    )
)

# we report the proportion of TE that hit a protein of the same super family
# ex was generated by expectedSubject() defined in HTvFunctions.R
blastx[,mean(ex == 2L)]


# STEP THREE, we select TE-TE hits involving TE protein regions  >= 300 bp,  -------------------------------------------------------------------
# this is to avoid computing unnecessary Ks (since we require kS computed on ≥300 bp)

# we generate copy names from query names in the blastx file
# (i.e., excluding species and family info), to match the TE-TE hits
blastx[, copy := copyName(query)]

# we compute the first and last position of all alignments on proteins for each copy,
# which we infer as the protein region of each copy (even if it may encompass several ORFs)
# this is only done for proteins that are of the same super family as the copy (ex == 2)

perCopy <- blastx[ex == 2L, 
    .(
        start = min(c(qStart, qEnd)),
        end = max(c(qStart, qEnd))
    ),
    by = copy
]


# to select TE-TE hits that cover a protein region that is long enough,
# we need to get the blastn coordinates where starts always < ends
TEhits[, c("qSt", "qEn", "sSt", "sEn") := data.table(
    pmin(qStart, qEnd),
    pmax(qStart, qEnd),
    pmin(sStart, sEnd),
    pmax(sStart, sEnd)
    )]

# we add portein ranges for query and subject in blastn hits
TEhits[, c("qS", "qE") := perCopy[chmatch(query, copy), .(start, end)]]
TEhits[, c("sS", "sE") := perCopy[chmatch(subject, copy), .(start, end)]]

# we prevent bugs that are caused by NAs in intersection()
TEhits[is.na(qS), c("qS", "qE") := 0L]
TEhits[is.na(sS), c("sS", "sE") := 0L]

# we get the region of the query that aligns on the subjects and also aligns on proteins
TEhits[, c("qS", "qE") := data.table(intersection(qSt, qEn, qS, qE, T))]

TEhits[, c("sS", "sE") := data.table(intersection(sSt, sEn, sS, sE, T))] # same, for the subject

# converts the above into query coordinates
TEhits[, c("qsS", "qsE") := .(qSt + sS - sSt, qEn + sE - sEn)]

# so we can compute the lenght of the TE alignment that also aligns on proteins
TEhits[, inter := pmin(qE, qsE) - pmax(qS, qsS) + 1L]
TEhits[is.na(inter) | inter < 0L, inter := 0L]

# we remove columns we no longer need
TEhits[, c("qSt", "qEn", "sSt", "sEn", "qS", "qE", "sS", "sE", "qsS", "qsE") := NULL]


# we select hits between TEs assigned to the same super families 
# and with sufficient protein region (300 bp)
TEhitsCoveringProteins <- TEhits[superF1 == superF2 & inter >= 300L] 

# now we only need one superfamily name per hit
setnames(TEhitsCoveringProteins, "superF1", "superF")
TEhitsCoveringProteins[, superF2 := NULL]

writeT(TEhitsCoveringProteins, "all300aaSameSuperF.txt")


# there are still too many hits to compute Ks on.
# We select among redundant hits that may represent the same HTT. ---------------------------------------
# This is based on the fact that many hits share a copy (see Method section).

# we place the best hits on top (longest protein region then higher score)
setorder(TEhitsCoveringProteins, -inter, -score)

# we add an integer column denoting the single-linkage cluster of copies, based on hits
TEhitsCoveringProteins[, cl := clusterFromPairs(query, subject)]

# we add an integer column indicating number of times we have seen a cluster for each a species pair.
# Within a species pair, hits of the same cluster would represent the same transfer and are thus redundant.
TEhitsCoveringProteins[, occ := occurences(data.table(cl, sp1, sp2))]

# we select at most 2000 hits between copies from a given cluster for any species pair
sel <- TEhitsCoveringProteins[occ <= 2000L, ]

# we also add an integer column for the mrca of each species pair, for later use
sel[, mrca := mrcaMat[cbind(sp1, sp2)]]

# and write the filtered hits to disk
writeT(sel, "allOCC2000.txt")