09-hitClusteringRound1.R



## %######################################################%##
#                                                           #
####     This stage clusters TE-TE hits iteratively      ####
####       to delinate HTT events ("hit groups")         ####
#                                                           #
## %######################################################%##


# This stage is the first step where clusters hits from the same pair of "young"
# clades: clades of less than 40 My, in which we did not even look for HTT and
# did not measure any BUSCO Ks. Then we will aggregate the clusters that may
# reflect the same HTT. Clustering and aggregation will be based on criterion 1:
# pID of TEs between clade is lower than pID within clades

source("HTvFunctions.R")


# this script uses:
# - the tabular file of selected TE-TE hit ("HTT hits") from stage 07-TEKsAndHTTfilter.R
httHits <- fread("occ200Ks05.txt")

# - the self-blastn output generated by the previous stage (used later)

# the output will be a tabular file of HTT hits with a new column attributing a community number to each


# STEP ONE, we swap query and subject species in the TE-TE hit table -------------------------------------------------------

# An HTT Involves two clades,
# We compare TE copies within clade to evaluate criterion 1. To do this, it is
# easier to swap sp1 and sp2 so that sp1 always corresponds to the same clade
# (and sp2 to the other) among the two sister clades that diverged from an MRCA.
# So we will compare "queries" (copies from the left-hand clade of the table) with each others
# and "subjects" (copies from the right-hand clade) with each others (not queries
# to subjects, that is already done with the pID in the HTT hits) so for each hit
# between species coalescing to an MRCA, we find its clade (the one among the two
# that diverged from the mrca)

tree <- read.tree("additional_files/timetree.nwk")

# the different MRCAs of the two species involved in each HTT hits
mrcas <- httHits[, unique(mrca)]

# this 2-column table gives the children (subclades, in column V2) of each node (MRCA, column V1)
edges <- data.table(tree$edge)

# these nodes are the subclades of each mrca in the httHits
subClades <- edges[V1 %in% mrcas, V2]

# we retrieve the species of these subclades
subClades <- tipsForNodes(tree, subClades, names = T)

# and the parent node (mrca) of each subclade
subClades[, MRCA := edges[match(node, V2), V1]]

# we combine tip (species) name and MRCA number, to match these combinations in the httHits
subClades[, comb := stri_c(tip, MRCA, sep = " ")]

# We assign subclade numbers (node numbers of the tree) to species in each hit, 
httHits[, c("cladeA", "cladeB") := .(
    subClades[chmatch(stri_c(sp1, mrca, sep = " "), comb), node],
    subClades[chmatch(stri_c(sp2, mrca, sep = " "), comb), node]
    )]

# and we ensure that the subclade with lower number is always on the left.
# This requires swapping many columns of the table:
httHits[cladeA > cladeB, c("query", "subject", "sp1", "sp2", "f1", "f2", "qStart", "qEnd", "sStart", "sEnd", "cladeA", "cladeB")
:= .(subject, query, sp2, sp1, f2, f1, sStart, sEnd, qStart, qEnd, cladeB, cladeA)]


# saved to disk
writeT(httHits, "occ200_subClades.txt")


# STEP TWO, we filter the self-blastn hits of TE copies ----------------------------------------------------------------
# to get within-clades pID of TE hits, we use the self-blastn outputs, which are
# huge (hundreds of GBs total). So we filter them to select only the hits we
# need: those between copies of the same (sub)clade, to evaluate criterion 1
# (within-clade pID lower than between-clade pID). We already have between-clade
# pIDs, since these are in the httHits table.

# We filter the blast outputs to generate more manageable files, in case we need to redo
# the clustering several times. The hits will not have to be filtered again

# for this step, we  need copy integer identifiers as these were used in the self blast to save space
# we retrieve copy integer IDs
copyIDs <- fread("TEs/clustering/selectedCopiesKs05occ200.IDs.txt")
copyIDs[, name := copyName(copy)]

# we add new columns for integer ids corresponding to both copies in each hit
httHits[, c("q", "s") := .(copyIDs[chmatch(query, name), id], copyIDs[chmatch(subject, name), id])]

# since we will compare only copies within superFamilies and MRCA (the combination of which we call a "group"), we extract them from the table
# we start with the "queries", which are copies from the left-hand clade ("clade A")
queries <- httHits[, unique(q), by = .(superF, mrca)]

# we split these copies for each superfamily and mrca, which we will compare as these are from the same clade
cladeAcopies <- split(queries$V1, queries[, paste(superF, mrca)])

# we will sort groups by decreasing number of copies in queries (for better CPU usage)
nQueryCopies <- lengths(cladeAcopies)
cladeAcopies <- cladeAcopies[order(nQueryCopies, decreasing = T)]

# we do the same for subjects TE copies (from clade B)
subjects <- httHits[, unique(s), by = .(superF, mrca)]
cladeBcopies <- split(subjects$V1, subjects[, paste(superF, mrca)])
cladeBcopies <- cladeBcopies[order(nQueryCopies, decreasing = T)]

# as we will extract self-blast hits per super family, we re-split the TE copies (subjects and queries) lists by super Families
# prior, we extract super family names from copy names and replaces slashes with periods as they will be used to generate file names
superF <- gsub("/", ".", splitToColumns(names(cladeAcopies), " ", 1), fixed = T)
cladeAcopies <- split(cladeAcopies, superF)
cladeBcopies <- split(cladeBcopies, superF)

# and we sort these list by decreasing numbers of copies (we prefer starting with the large ones)
 # we obtain the number of copy in each element of the list
nq <- sapply(cladeAcopies, function(x) {
    length(unique(unlist(x)))
})

cladeAcopies <- cladeAcopies[order(nq, decreasing = T)]
cladeBcopies <- cladeBcopies[order(nq, decreasing = T)]

# we list the self blast result files generated in set 8-prepareClustering.R
blastFiles <- list.files(
    path = "TEs/clustering/blastn/done",
    pattern = "",
    full.names = T
)


# which we order by decreasing size
blastFiles <- blastFiles[order(file.size(blastFiles), decreasing = T)]

# splits blast files per super families (large superfamilies have several blast outputs)
blastFiles <- split(blastFiles, splitToColumns(basename(blastFiles), "_", 1))


# where the selected hits will go
filteredHitFolder <- "TEs/clustering/selectedHits/"
dir.create(filteredHitFolder)


# this function extracts the relevant TE-TE hits for a given super family
getHits <- function(superF) {
    files <- blastFiles[[superF]]
    
    import <- function(file) {
        hits <- fread(
            input = file,
            sep = "\t",
            header = F,

            # the function will be used in parallel, so we don't need more than 1 CPU here
            nThread = 1,
            
            #we only need these columns for the clustering (we don't use the score, but we import it just in case)
            select = c(1:3, 9),
            col.names = c("query","subject","pID","score")
        )

        # now select hits between copies of the same clade
        selectedHits <- Map(
            function(cladeAcopies, caldeBcopies) {

                # each hit must be between two copies of clade A
                # or between two copies of clade B
                hits[(query %in% cladeAcopies & subject %in% cladeAcopies) | 
                     (query %in% caldeBcopies & subject %in% caldeBcopies), ]
            },
            
            cladeAcopies[[superF]], cladeBcopies[[superF]]
        )
        selectedHits <- rbindlist(selectedHits)

        # some rare hits may have been selected several times, we remove duplicates
        selectedHits <- selectedHits[!duplicated(data.table(query, subject)), ]

        # some progress indicator as files can be quite big
        cat(".")
        selectedHits
    }

    # we apply the function for each blast file of the super family, in parallel with 20 CPUs
    hits <- mclapply(
        X = files,
        FUN = import,
        mc.cores = min(20, length(files)),
        mc.preschedule = F
    )

    # writes a single file of selected hits
    writeT(
        data = rbindlist(hits),
        path = stri_c(filteredHitFolder, superF, ".out"),
    )


    # progress report
    print(paste("done", superF))
    NULL
}

# we do it iteratively for each super family.
m <- lapply(names(cladeAcopies), getHits)


# STEP THREE, clustering of hits according to criterion 1----------------------------------------------------
# we cluster hits between species of the same pair of "young" clades that
# diverged within the last 40 My. Within these clades, we did not even search for
# HTT (no blast) and we didn't even measure Ks at busco genes. These clusters of hits will
# be the initial ones (the "communities") that we will aggregate in a second round
# so we assign species involved hits to the young clades


# the young clades of <40 My
clades <- cladesOfAge(tree, 40, withTips = T)

# assigning species to these clades
httHits[, c("C1", "C2") := .(clades[chmatch(sp1, tip), node], clades[chmatch(sp2, tip), node])]

# To cluster hits, we prepare a table with only the columns we need.
# clustering of hits will be done within each "group" (hits involving a given pair of young clades in a super family).
# Hits will be identified by their row indices. We again replace slashes in super family names with periods
httHits[, c("hit", "group") := .(1:.N, stri_c(stri_replace(superF, ".", fixed = "/"), C1, C2, sep = "_"))]

# We get the column we need for the clustering
conv <- httHits[, data.table(hit,

    # we convert blast pIDs to integers for efficiency in the clustering
    pID = as.integer(pID * 1000L),
    group,
    query = q,
    subject = s
)]


# we split these hits according to "groups"
hitList <- split(conv, conv$group)
hitList <- hitList[order(-sapply(hitList, nrow))]

# if there is just one hit in a group, no clustering needs to be done so we can remove the group
hitList <- hitList[sapply(hitList, nrow) > 1L]


# where results will go
dir.create("TEs/clustering/round1/")

# hitList will be used in the clustering jobs. Below we plan the jobs:
saveRDS(hitList, file = "TEs/clustering/round1/hitsForFirstClusteringKs05occ200.RDS")

groupList <- data.table(
    group = names(hitList),

    # number of hits to cluster per "group"
    nHits = sapply(hitList, nrow),
    superF = splitToColumns(names(hitList), "_", 1)
)


# we use the number of hits and groups per super family.
# We'll do the clustering by super families, so that a
# filtered self-blastn output (which concerns a super family)
# needs only be imported once for all the groups within the super family
perSuperF <- groupList[, .(nHits = sum(nHits), .N), by = superF]

# we attribute batches (job numbers) to super families.
# several super families with low number of hits are processed in the same job
perSuperF[, batch := 0L]
perSuperF[nHits > 30000, batch := 1:.N]

# this table will be used to manage jobs in iterativeFirstClustering.R
writeT(perSuperF, "TEs/clustering/round1/batchesKs05occ200.txt")

# example of job number 6 using 5 cpus:
system('sbatch --mail-type=BEGIN,END,FAIL --cpus-per-task=5 --mem=100G --wrap="Rscript iterativeFirstClustering.R 6 5"')


# STEP FOUR, we collect the results of the clustering ---------------------------------------------------------------------
# there is one clustering output per job, we list them and import them
files <- list.files(
    path = "TEs/clustering/round1",
    pattern = "allGroups.txt",
    full.names = T
)

groups <- rbindlist(lapply(files, fread, header = T, sep = "\t"))

# we generate unique integer community ids (accross all groups)
groups[, ucomm := toInteger(string = stri_c(comm,
    group,
    sep = " "
))]


# we add these identifiers to the HTT hit table
httHits[groups$hit, com := groups$ucomm]

# the com column is NA for hits were not even clustered because there are just one per
# super family and clade pair, so we give them unique community numbers.
httHits[is.na(com), com := 1:.N + max(groups$ucomm)]


# write the new HTT hit table to disk
writeT(httHits, "occ200_comm.txt")