-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathiterativeSecondClustering.R
219 lines (173 loc) · 8.87 KB
/
iterativeSecondClustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
## %######################################################%##
# #
#### This scripts applies criterion ####
#### 1 to hits from different ####
#### communities, to potentially merge them ####
#### into hit groups (htt events) ####
# #
## %######################################################%##
# This script is launched from 10-hitClusteringRound2.R and uses files generated by that script.
# It shares a lot of code with iterativeFirstClustering.R, but there are subtleties.
# Here, we return statistics for every pair of communities, in particular the number of pairs
# of hits passing criterion 1.
source("HTvFunctions.R")

# the only argument is the number of CPUs to use
args <- commandArgs(trailingOnly = TRUE)
nCPUs <- as.integer(args[1])

# we import the list of hit tables to cluster
hitList <- readRDS("TEs/clustering/round2/hitsFor2ndClustering.RDS")

# STEP ONE, list self-blastn files of copies for each family -------------------

# we retrieve super family names from the list of hits
superF <- splitToColumns(names(hitList), "_", 1)

# number of hits per super family, sorted in decreasing order so that
# super families with more hits are processed first
nHits <- sapply(hitList, nrow)
nHits <- sort(tapply(nHits, superF, sum), decreasing = TRUE)

blastFiles <- list.files(
  path = "TEs/clustering/selectedHits",
  full.names = TRUE
)

# we name the blastFiles elements with super family names (extracted from the
# base file names), which simply requires removing the ".out" file extension
names(blastFiles) <- gsub(
  pattern = ".out",
  replacement = "",
  x = basename(blastFiles),
  fixed = TRUE
)
# STEP TWO, we "confront" every two communities of hits, we do this per super family------------------------------------------------------------
communityPairStats <- function(superFam) {
  # For one TE super family, "confronts" every pair of hit communities:
  # for each community pair, counts the pairs of hits that do / do not pass
  # clustering criterion 1 (some intra-clade identity exceeding the
  # inter-clade identity of a hit). Writes the per-pair statistics to disk
  # and returns them as a data.table with columns com1, com2, allDiff, diff,
  # valid, tot.
  # Reads file-level globals: blastFiles, hitList, superF, nCPUs.

  # we identify the self-blastn output we need, and import it
  blastFile <- blastFiles[superFam]
  blast <- fread(
    input = blastFile,
    sep = "\t",
    header = FALSE,
    drop = 4,
    col.names = c("query", "subject", "pID")
  )

  # we convert pIDs to integers as we did for the htt hits (to save memory)
  blast[, pID := as.integer(pID * 1000L)]

  # here we don't create a matrix of pIDs where subjects and queries are row
  # and column numbers: such a matrix would be too big for this round.
  # Instead we create a unique subject-query identifier, relying on the fact
  # that copy ids are integers both lower than 10^6
  blast[, copyPair := 10^6 * query + subject]

  # we select the groups of hits (one per mrca) for this super family
  groups <- hitList[superF == superFam]

  # the function below processes the hits of a given group; its batches are
  # launched in parallel
  processHitsOfGroup <- function(group) {
    # we copy the hits so that setorder() below does not reorder the
    # caller's table by reference
    hits <- copy(group)

    # we sort these hits per community to ensure that community pairs are
    # always formed in the same "direction" (com1 vs com2, never the opposite)
    setorder(hits, com)

    # the ids of copies involved in the hits to cluster
    uCopies <- hits[, unique(c(query, subject))]

    # we select the TE hits involving these copies. We only need the
    # subject-query pair identifier and the blast pID.
    # Bug fix: the filter previously used "V2", a column name that no longer
    # exists after col.names renamed it; the second column is "subject".
    selectedHits <- blast[query %in% uCopies & subject %in% uCopies, .(copyPair, pID)]

    # we add a copyPair that does not actually exist (with 0 pID) at the end
    # of this table. This trick helps the nomatch lookups below
    selectedHits <- rbind(selectedHits, data.table(copyPair = 0, pID = 0L))

    # as before, we anticipate the number of hit pairs we will have to
    # compare, to create batches that analyse about 2^28 pairs at once
    nHits <- nrow(hits)
    nBatches <- ceiling(nHits^2 / 2^28)

    # we split the hits into several batches
    hitBatches <- splitEqual(1:(nHits - 1L), n = nBatches)

    # this function "connects" hits according to the criterion, similar to
    # what we did in the first round
    criterion_1 <- function(hitBatch) {
      # we again make all possible pairs of hits for this batch
      # (hit1 < hit2, so each unordered pair is evaluated exactly once)
      pairs <- data.table(
        hit1 = rep(hitBatch, nHits - hitBatch),
        hit2 = unlist(lapply(
          X = hitBatch[1]:max(hitBatch) + 1L,
          FUN = function(hit) hit:nHits
        ))
      )

      # we add columns for the ids of copies involved in the 2 hits
      pairs[, c(
        "q1", # query ids (copies from clade A) for hit1
        "s1", # subject ids (copies from clade B)
        "com1", # community id
        "inter1", # the inter-clade identity (representing the HTT)
        "q2", "s2", "com2", "inter2" # and the equivalent for hit2
      )
      := data.table(
        hits[hit1, .(query, subject, com, pID)],
        hits[hit2, .(query, subject, com, pID)]
      )]

      # we create query-subject pair identifiers as we did for the self-blastn
      # output. We must ensure that the left copy id is always lower than the
      # right one (as is the case for the copyPair identifiers in the
      # self-blast files), so we swap the ids where it is not the case
      pairs[q1 > q2, c("q1", "q2") := .(q2, q1)]
      pairs[s1 > s2, c("s1", "s2") := .(s2, s1)]

      # we create the same pair identifiers we created for the blast results
      pairs[, c("qPair", "sPair") := .(
        10^6 * q1 + q2,
        10^6 * s1 + s2
      )]

      # so we can retrieve within-clade sequence identities.
      # The nomatch argument below makes it so that the pID of the last row
      # in selectedHits (which has pID 0) is returned where there is no match
      # (no hit). This speeds things up a little
      pairs[, c("intra1", "intra2") := .(
        selectedHits[match(qPair, copyPair, nomatch = .N), pID],
        selectedHits[match(sPair, copyPair, nomatch = .N), pID]
      )]

      # for hits of a copy with itself (not in the blast outputs), we define
      # 100% identity (equivalent to what we did for the matrix diagonal in
      # the other script)
      pairs[q1 == q2, intra1 := 100000L]
      pairs[s1 == s2, intra2 := 100000L]

      # as before, the 2 hits are connected if the best intra-clade identity
      # is higher than one inter-clade identity (that of the hits)
      pairs[, maxIntra := pmax(intra1, intra2)]
      pairs[, connected := inter1 < maxIntra |
        inter2 < maxIntra]

      cat("*") # progress indicator

      # we return statistics for each community pair
      # (note that a community pair may involve the same community twice,
      # which can help to assess the degree to which hits within a community
      # are connected)
      pairs[, .(
        allDiff = sum(!connected), # number of pairs of hits not passing the criterion
        diff = sum(maxIntra > 0L & !connected), # the same, but for hits whose copies have some degree of intra-clade identity
        valid = sum(maxIntra > 0L), # the number of pairs of hits whose copies have some degree of intra-clade identity
        tot = .N # the total number of pairs of hits evaluated
      ),
      by = .(com1, com2) # all this is done for each community pair
      ]
    }

    # we apply the function in parallel for the different batches
    statsPerCom <- rbindlist(mclapply(
      X = hitBatches,
      FUN = criterion_1,
      mc.cores = min(nCPUs, length(hitBatches)),
      mc.preschedule = FALSE
    ))

    # as the statistics returned by criterion_1() were obtained for separate
    # batches, we need to sum them for the whole group
    statsPerCom[, .(
      allDiff = sum(allDiff),
      diff = sum(diff),
      valid = sum(valid),
      tot = sum(tot)
    ),
    by = .(com1, com2)
    ]
  }

  # applies the above function to the different groups successively
  stats <- rbindlist(lapply(
    X = groups,
    FUN = processHitsOfGroup
  ))

  # writes to disk for safety; this table is also returned by the function
  writeT(
    data = stats,
    path = stri_c("TEs/clustering/round2/", superFam, ".all.txt")
  )
  print(paste("done", superFam))
  stats
}
# we finally apply the above for all super families
# (nHits names correspond to super family names)
superFamilies <- names(nHits)
allStats <- vector("list", length(superFamilies))
for (i in seq_along(superFamilies)) {
  allStats[[i]] <- communityPairStats(superFamilies[i])
}
writeT(rbindlist(allStats), "TEs/clustering/round2/all.txt")