13-TEKaKsWithinGenomes.R

## %######################################################%##
#                                                          #
####    This stage establishes ka ks rate for pairs     ####
####                 of TEs copies that                 ####
####      diverged by transposition within genomes      ####
#                                                          #
## %######################################################%##


# the principle is to compute Ka and Ks values for related copies within a genomes
# These copies belong to the same "community" of hits, so are unlikely to represent
# different htt events. They would have diverged by transposition within the genome

# This script uses 
# - the table of HTT hits between TEs, to obtain the communities to which these hits belong
# - the self-blastn output of TE copies against themselves, generated at stage 8


source("HTvFunctions.R")


# STEP ONE, we select relevant blast hits of TEs to compute ka ks rates.--------------------------------

# We do it per superfamily (for big super families, several blast searches were performed in
# parallel so there are multiple output files)

# we retrieve blastn hits of copies against themselves (generated by
# TEselfBlastn.R at stage 08-prepareClustering.R). 
# As these files are very big, we will select only the hits we need. 
# We filter the hits separately from Ka Ks computation because it takes some time
# Should be need to redo the Ka Ks computations, we won't have to filter hits again

blastFiles <- list.files(
    path = "TEs/clustering/blastn/done",
    pattern = ".out",
    full.names = T
)

# some output file may be empty (no hits), which could cause an issue during import
blastFiles <- blastFiles[file.size(blastFiles) > 0]

# we extract superfamily names from file names
superF <- splitToColumns(basename(blastFiles), "_", 1)

# we split files names per superfamily
blastFiles <- split(blastFiles, superF)

# we import the copy integer identifiers as these were 
# used in the blasts, instead of very long copy names
ids <- ids <- fread("TEs/clustering/selectedCopiesKs05occ200.IDs.txt")

# we retrieve the host species of copies, as we will only compare copies within the same genome
ids[, c("sp", "name") := .(extractSpeciesNames(copy), copyName(copy))]

# We also import the tree to give integer ids (tip numbers) to species, for speed
tree <- read.tree("additional_files/timetree.nwk")

# we make the correspondance between the copy id (index of
# the vector below, as the id is an integer) and species id
spForCopy <- ids[match(1:max(id), id), chmatch(sp, tree$tip.label)]

# we import the HTT hits (including hitgroups that were not retained by
# our criteria in stage 11, just in case we change our filters afterwards)
httHits <- fread("HTThitsAssessed.txt")

# we extract the information we need. 
# We need to know the "communities", which are clusters of related copies
# that should have diverged only by transposition events, avoiding potential HTT

copiesInCom <- httHits[, union(q, s), by = com]

# we also make a vector for the correspondance between a copy (its index) and a community (its value)
comForCopy <- copiesInCom[match(1:max(ids$id), V1), com]

dir.create("TEs/TEevolution/selectedHits/", recursive = T) # where selected hits will go


# we will now select the hits we need process self blastn output --------------------------------------------------------------
# the hits must involve copies from the same community and species (genome)
# we do it per super family as there are no hit between super families (no blast)

selectHits <- function(superF) {
      import <- function(file) {
        hits <- fread(
            input = file,
            header = F,
            drop = c(3, 9),
            col.names = c("q", "s", "length", "qStart", "qEnd", "sStart", "sEnd"),
            
            # since the function will be launched in parallel, we don't need several threads
            nThread = 1
        ) 

        # we can already select hits that are long enough for Ka Ks computations
        hits <- hits[length >= 300L]

        # and those  between copies from the same species
        hits <- hits[spForCopy[q] == spForCopy[s] &
                     # and same community
                     comForCopy[q] == comForCopy[s], ] 
        
        cat(".") # progress indicator (the files are big, this takes time)
        hits[,-"length"]
    }

    # we apply the above function in parallele using 20 processes
    # and rbind filtered blast hits from the same super family
    rbindlist(mclapply(
        X = blastFiles[[superF]],
        FUN = import,
        mc.cores = 20,
        mc.preschedule = F
    ))
}


# we apply the function for all super families
# which are names of the blastFile list
intraGenomeHits <- rbindlist(lapply(names(blastFiles), selectHits))

# we write the selected hits do disk as a single file
writeT(
    data = intraGenomeHits,
    path = "TEs/TEevolution/selectedHits/all.out"
)


# STEP TWO, we filter TE hits that cover a protein region of at least 300 bp-----------------------------
# this is to avoid unnecessary ka/ks computations on codon alignments that will be too short

# we import the a blastx file generated earlier (stage 6), listing
# alignment coordinates of copies against repeat proteins
# this one doe not have overlapping HSPs
blastx <- fread("TEKs/blastxOCC2000EXcov1.out")

# we replace copy names by their integer ids, since ids are used in the self blastn files
blastx[, id := ids[chmatch(blastx$copy, name), id]]

# we only retain blastx results for copies involved in our selected hits
# this will speed things up a bit

blastx <- blastx[id %in% httHits[, c(q, s)]]

# as there are several HSPs (alignments) per TE, we get the minimal start
# and maximal end of these, to approximate the protein region covered by a TE
blastxCoords <- blastx[, .(start = min(start), end = max(end)), by = id]

# we add columns of TE hit coordinates where starts always < ends
intraGenomeHits[, c("qSt", "qEn", "sSt", "sEn") := data.table(
  pmin(qStart, qEnd),
  pmax(qStart, qEnd),
  pmin(sStart, sEnd),
  pmax(sStart, sEnd)
)]

# we add protein ranges for query and subject in blastn hits
intraGenomeHits[, c("qSprot", "qEprot") := blastxCoords[match(q, id), .(start, end)]]
intraGenomeHits[, c("sSprot", "sEprot") := blastxCoords[match(s, id), .(start, end)]]

# we obtain the region of the TE query that aligns on the TE subject and also aligns on proteins
intraGenomeHits[, c("qSprot", "qEprot") := data.table(intersection(qSt, qEn, qSprot, qEprot, T))]

# same for the subject TE
intraGenomeHits[, c("sSprot", "sEprot") := data.table(intersection(sSt, sEn, sSprot, sEprot, T))]

# we convert the above into query coordinates
intraGenomeHits[, c("qsS", "qsE") := .(qSt + sSprot - sSt, qEn + sEprot - sEn)]

# we estimate the length of the protein part of copies that align with each other
intraGenomeHits[, inter := pmin(qEprot, qsE) - pmax(qSprot, qsS) + 1L]

# selects hits for which this region is at least 300 bp, and the columns we need
# we also retrieve full TE names as these will be needed for Ka Ks computation, 
# because these names are those in the fasta file of copy sequences
intraGenomeHits <- intraGenomeHits[inter >= 300L, data.table(q, s,
    query = ids[match(q, id), name],
    subject = ids[match(s, id), name],
    qStart, qEnd, sStart, sEnd
)]


# we rbind these selected hits with the htt hits, although we have already
# computed Ka and Ks rates for the latter. This is because we will use the 
# TEKaKs.R script to compute overall distance between copies, which we did 
# not do at stage 7 (TEKaKs.R has been modified since)

selectedHits <- rbind(
  data.table(intraGenomeHits, htt = F), # we add a logical column to differentiate the two types of hits
  httHits[,.(q, s, query, subject, qStart, qEnd, sStart, sEnd, htt = T)]
)

# we prepare the files needed for the TEKaKs.R script
TEhitFile <- "TEs/TEevolution/selectedHits/selected.out"
blastxFile <- "TEs/TEevolution/selectedHits/blastx.out"

# for these tables, we remove columns that are not used in TEKaKs.R
writeT(selectedHits[, -c("q", "s", "htt")], TEhitFile)
writeT(blastx[, -"id"], blastxFile)


# STEP THREE, we compute Ka and Ks rates -------------------------------------------------------------

outFolder <- "TEs/TEevolution/Ks"
dir.create(outFolder)
nCPUs <- 30

system(paste(
    "Rscript TEKaKs.R",
    TEhitFile, blastxFile,
    "TEs/clustering/selectedCopiesKs05occ200.fas", # the fasta file of copy sequences
    outFolder, nCPUs
))


# We imports results generated by TEKaKs.R to merge results with the TEhit table
KaKs <- fread(stri_c(outFolder, "/allKaKs.txt"))

selectedHits[, hit := 1:.N] # required for the merged below, as the kaks results have a hit identifier
KaKs <- merge(selectedHits, KaKs, by = "hit")

# we add columns for superfamily names and community ids based on copy ids
ids[, superF := stri_extract_last(copy, regex = "[^#]+")]
KaKs[, c("superF", "com") := .(
    ids[match(q, id), superF],
    comForCopy[q]
)]


writeT(Kaks[,-"hit"], "TEs/TEevolution/TEKaKsAndDistance.txt")