# 11-hitGroupEvaluation.R



## %######################################################%##
#                                                          #
####              This stage evaluates the              ####
####          reliability of hit groups, i.e.           ####
####       whether they reflect DNA contamination       ####
####     or non-horizontal transfer between species     ####
#                                                          #
## %######################################################%##


# A reliable hit group (putative HTT) must involve enough TE copies per clade (to
# reduce the risk that it actually reflects contamination). But since we filtered
# a lot of hits (hence copies) in previous stages, we evaluate whether copies
# involved in HTT have close relatives in their respective genomes in order to
# retrieve similar copies. So we blast copies against the TEs of their host
# genomes.

source("HTvFunctions.R")


# this script uses the tabular file of htt hits with hit group identifiers
# generated by 10-hitClusteringRound2.R
httHits <- fread("oc200HitGroup.txt")

# the output will be a tabular file listing statistics that we then use to
# evaluate the reliability of hit groups


# STEP ONE, we gather the TE copies present in hit groups ---------------------------------------------------------------
# this will be our criterion to evaluate the risk of DNA contamination
# As many hits have been discarded in early stages of the analysis,
# we retrieve TE copies that should belong to the various hit groups
# by blasting the TE copies that are in the retained hits against
# all TEs of their respective species

# for this, we import the sequences of copies involved in the putative HTTs
fas <- readDNAStringSet("TEs/clustering/selectedCopiesKs05occ200.fas")

# we will save fasta files for separate species, to make one blast for each.
# So we split the sequences according to species
sp <- extractSpeciesNames(names(fas))
copiesPerSp <- split(fas, sp)

# where the results will go. showWarnings = FALSE lets the script be re-run
# without a spurious "already exists" warning, and recursive = TRUE creates
# any missing parent folder
dir.create(
    "TEs/clustering/testConta",
    showWarnings = FALSE,
    recursive = TRUE
)

# we generate the names of the fasta files of copy sequences (one per species)
fasFiles <- stri_c(
    "TEs/clustering/testConta/",
    names(copiesPerSp), # these are the species names
    ".selectedCopies.fas"
)

# we write the sequences into these files. Map() is only called for its side
# effect, so we discard the list it returns instead of letting it be printed
invisible(Map(
    f = writeXStringSet,
    x = copiesPerSp,
    filepath = fasFiles
))

# we submit (via sbatch) the script that blasts these copies against all copies
# of their source genome. 15 CPUs are requested; the trailing "15" passed to the
# script is presumably the number of CPUs it should use.
# NOTE(review): the script name has no ".R" extension here — confirm this
# matches the actual file name.
system(command = paste(
    "sbatch",
    "--mail-type=BEGIN,END,FAIL",
    "--cpus-per-task=15",
    "--mem=100G",
    '--wrap="Rscript blastAgainstAllTEs 15"'
))

# we analyze blast outputs ---------------------------------------------------------------

# we import the concatenated results of the intraspecific blasts performed
# above, keeping only fields 1 to 4 and 12 of the tabular output
blast <- fread(
    input = "TEs/clustering/testConta/blastn/all.out",
    select = c(1:4, 12),
    col.names = c("httCopy", "speciesCopy", "pID", "length", "score")
)

# In these outputs, the query ("httCopy") is a copy involved in the candidate
# HTT hits and the subject ("speciesCopy") is a TE of the same species.
# The subject may not be one of the copies of the HTT hits
# (which was the whole point of the blast).

# we remove hits of low quality (since none of the retained HTT hits had
# pID < 75%), then self hits
blast <- blast[pID > 75 & length >= 200L][httCopy != speciesCopy]

# adds a column for species names (presumably the first ":"-separated field
# of the copy name, see splitToColumns() in HTvFunctions.R)
blast[, sp := splitToColumns(httCopy, ":", 1)]

# we may now remove species names from copy names
blast[, `:=`(
    httCopy = copyName(httCopy),
    speciesCopy = copyName(speciesCopy)
)]

# we will also need the copy integer IDs
copyIDs <- fread("TEs/clustering/selectedCopiesKs05occ200.IDs.txt")
copyIDs[, copy := copyName(copy)]

# so as to add the integer identifier of the query of each blast hit
# (NA if the query name is not found among the copy IDs)
blast[, q := copyIDs$id[chmatch(httCopy, copyIDs$copy)]]

# we list all hits (identified by their row number) involving a given copy

# in the list below, hits for copy x are accessed by hitsForCopy[[x]]
# seq_len() is used rather than 1:nrow(), as the latter yields c(1, 0)
# (instead of an empty vector) should the blast table have no row
hitsForCopy <- reList(split(seq_len(nrow(blast)), blast$q))

# we get all the copy ids per hit group (a copy may be a query or a subject of htt hits)
copiesInHitGroups <- httHits[, .(copy = unique(c(q, s))), by = hitGroup]

# We retrieve full copy names (useful later)
copiesInHitGroups[, name := copyIDs[match(copiesInHitGroups$copy, id), copy]]

# we get all intraspecific hits (row numbers of the blast table)
# involving each copy of each hit group
hitsInHitGroups <- copiesInHitGroups[, .(hit = unlist(hitsForCopy[copy])), by = hitGroup]

# we add useful hit information in new columns
hitsInHitGroups[, c("query", "subject", "pID", "score") :=
    blast[hit, .(httCopy, speciesCopy, pID, score)]]

# within each hit group, the most similar hits come first
setorder(hitsInHitGroups, hitGroup, -pID)

# We now decide whether a subject hit by some query (a copy involved in htt)
# can be assigned to the hit group. We will compare the pID of each hit between
# a subject and a query to that of the hits involving the query within the hit
# group. The latter hits are supposed to reflect HTT, while the former should
# reflect within-genome transposition.

# For this, we compute the mean pID and score per copy per hit group.
# A copy may be in the "query" or in the "subject" column of the htt hits,
# so we summarise both columns and stack the results
bestIDs <- rbindlist(list(
    httHits[,
        .(meanID = mean(pID), meanScore = mean(score)),
        by = .(hitGroup, copy = query)
    ],
    httHits[,
        .(meanID = mean(pID), meanScore = mean(score)),
        by = .(hitGroup, copy = subject)
    ]
))

# we create a group-copy pair identifier
bestIDs[, pair := stri_c(hitGroup, "_", copy)]

# so as to retrieve, for each intraspecific hit, the mean pID and score of the
# htt hits involving its query in the hit group
hitsInHitGroups[, c("meanID", "meanScore") := {
    # row of bestIDs holding the (hit group, query) pair of each hit
    pairRows <- chmatch(stri_c(hitGroup, "_", query), bestIDs$pair)
    .(bestIDs$meanID[pairRows], bestIDs$meanScore[pairRows])
}]

# computes the proportion of copies of a hit group that are more similar
# to a subject than they are to TEs in the other clade (supposed HTT)
perSubject <- hitsInHitGroups[,
    .(portion = mean(pID > meanID)),
    by = .(hitGroup, subject)
]

# and we retain subjects that are on average more similar to
# same-genome copies than these are to TEs of the other clade
# we will consider that they belong to the hit group
perSubject <- perSubject[portion > 0.5]

# we can now concatenate these "new" copies with the previous ones
# a logical column "new" will differentiate these copies
# (TRUE/FALSE are spelled out, as T and F are ordinary variables that can be reassigned)
copiesInHitGroups <- rbind(
    copiesInHitGroups[, .(hitGroup, copy = name, new = FALSE)],
    perSubject[, .(hitGroup, copy = subject, new = TRUE)]
)

# and remove "new" copies that were in fact already present in hit groups.
# unique() keeps the first row of each (hitGroup, copy) pair, i.e. the
# original copy (new = FALSE) since these were placed first above
copiesInHitGroups <- unique(copiesInHitGroups, by = c("hitGroup", "copy"))

# we also retrieve the species to which copies belong,
# as we will need to know the clade they belong to
# (since our contamination filter imposes a certain number of copies per clade per hit group)

# for this we generate a table that makes the correspondence between copy
# (unnamed column "V1") and species
spForCopy <- blast[, unique(c(httCopy, speciesCopy)), by = sp]

# NOTE(review): spForCopy only covers copies found in the filtered blast table.
# A copy absent from it gets sp = NA, hence inClade1 = FALSE below, and would
# be counted in the "right" clade — confirm that this cannot happen.
copiesInHitGroups[, sp := spForCopy[chmatch(copy, V1), sp]]

# to determine the clade to which the copy belongs, we make a
# list of species of the "left" clade (clade A) in each hit group.
# The elements of this list are named after the hit group identifiers
sp1perHitGroup <- httHits[, unique(sp1), by = hitGroup]
sp1perHitGroup <- split(sp1perHitGroup$V1, sp1perHitGroup$hitGroup)

setorder(copiesInHitGroups, hitGroup)

# we add a logical column that is TRUE for copies belonging to the "left" clade of a hit group
# and obviously, it is FALSE for copies belonging to the "right" clade "clade B".
# The species of a hit group are looked up by NAME (as.character()) and not by
# position: an integer identifier used in [[ ]] is a position, which only
# matches the right element if identifiers run from 1 to the number of hit groups.
# Assigning with := by group also avoids relying on the row order of the table.
copiesInHitGroups[,
    inClade1 := sp %chin% sp1perHitGroup[[as.character(hitGroup)]],
    by = hitGroup
]


# STEP 2, we evaluate if hit groups should be excluded -----------------------------------------------------------

# to exclude VT, we evaluate the distribution of Ks of TEs in a hit group, in
# light of the thresholds we imposed (the 0.5% quantile of the BUSCO Ks
# distribution, or 0.5)

# for this, we need the species phylogeny
tree <- read.tree(file = "additional_files/timetree.nwk")

# we import the table of filtered BUSCO Ks (200 AA alignments
# and one score per BUSCO gene per pair of clades).
# The file is decompressed on the fly by the shell command
Ks <- fread(input = "gunzip -c Ks200AAnoRedundancy.txt.gz")

# we collect information related to Ks for each clade, ignoring Ks values of
# 10 or more (this includes columns we don't use afterward)
KsPerClade <- Ks[Ks < 10][,
    .(
        min = min(Ks),
        q05 = quantile(Ks, probs = 0.005), # the 0.5% quantile
        meanKs = mean(Ks),
        divTime = divTime[1]               # first divergence time listed for the clade
    ),
    by = clade
]

# we ensure that hitGroupStats generated below is sorted by hit group
setorder(httHits, hitGroup, -pID)

# we generate statistics per hit group to evaluate them: mainly figures about
# the Ks distribution and the number of copies per clade
hitGroupStats <- httHits[,
    data.frame(
        maxkS = max(ks),      # the max Ks between copies of the hit group
        ksMode(ks + 2 * vks), # the modal class of the Ks distribution (see HTvFunctions.R)
        N = .N,               # number of hits per hit group
        nq = uniqueN(q),      # number of "query" copies, i.e. from the left clade
        ns = uniqueN(s)       # same for copies from the right clade
    ),
    by = .(hitGroup, mrca, superF)
]

# we add 2 columns to this table:
# nQ is as nq above, but also includes the copies we retrieved with our blast
# nS is the equivalent column for the other clade
hitGroupStats <- merge(
    hitGroupStats,
    copiesInHitGroups[,
        .(nQ = sum(inClade1), nS = sum(!inClade1)),
        by = hitGroup
    ],
    by = "hitGroup"
)

# and other important columns, for which we need the row of KsPerClade
# corresponding to the clade (mrca) of each hit group
cladeRows <- hitGroupStats[, match(mrca, KsPerClade$clade)]

hitGroupStats[, `:=`(
    qKs = KsPerClade$q05[cladeRows],    # the 0.5% quantile of BUSCO gene Ks for the clade
    mKs = KsPerClade$meanKs[cladeRows], # the mean BUSCO Ks for the clade (which I don't use as a criterion later)
    age = nodeDepth(tree, mrca)         # the age of the clade
)]

# we save these for the next stage
writeT(hitGroupStats, "TEs/clustering/hitGroupStats.txt")


# we check if ka/ks of TEs correlates with the divergence of clades involved. ---------------------------------------------------------
# It should not (a priori), especially for DNA transposons, unless there is some
# spurious vertical transfer for which ka/ks should be closer to 1, while ka/ks for TEs
# transferred by HTT should be < 1 due to selection during HTT

# we make classes of divergence times for graphical visualisation
# (20 regularly spaced breaks between the lowest and highest clade ages)
ageBreaks <- hitGroupStats[, seq(min(age), max(age), length.out = 20)]

httHits[, age := nodeDepth(tree, mrca)] # we add the divergence time of the species involved in each hit

# so that we can assign hits to the classes of divergence, by adding a new column
# (include.lowest = TRUE so that hits of the lowest age fall in the first class)
httHits[, ageClass := .bincode(age, ageBreaks, include.lowest = TRUE)]

# we now obtain mean ka/ks (weighted by alignment length) per divergence time
# class and TE class (DNA or RNA), ignoring hits with ka/ks of 3 or more
perAge <- httHits[ka / ks < 3, .(
    kaks = weighted.mean(ka / ks, length.aa),
    age = mean(age)
),
by = .(DNA = grepl("DNA", superF), ageClass)
]

# makes the plot for DNA transposons (there was no obvious trends for class I TEs) : figure S1 of the paper
pdf("FigureS1.pdf")

# plot() is only called for its side effect: its (NULL) result is discarded
# rather than stored in an unused variable
invisible(perAge[DNA == TRUE, plot(
    x = age,
    y = kaks,
    ylim = c(0, max(kaks)),
    xlab = "Time since divergence (My)",
    ylab = "Ka/Ks"
)])

# the red line marks the 120 My minimal clade age imposed on retained hit groups
abline(v = 120, col = "red")
dev.off()

# we add a logical column that tells whether a hit group should be retained
hitGroupStats[, retained := {
    # it must involve at least 5 copies in the left clade and in the right clade
    # (including the copies retrieved by the intraspecific blast)
    enoughCopies <- nQ >= 5L & nS >= 5L

    # among which, at least 2 copies per clade must be involved in the htt hits
    enoughHttCopies <- nq >= 2L & ns >= 2L

    # additional criteria related to the Ks distribution (see methods)
    ksCriterion <- nMode > nLast + 20L | maxkS < qKs - 0.2

    # and the clade must be older than 120 (My)
    enoughCopies & enoughHttCopies & ksCriterion & age > 120
}]

# we save the statistics again, now with the "retained" column
writeT(hitGroupStats, "TEs/clustering/hitGroupStats.txt")