-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathfindHighestSimilarity.R
191 lines (149 loc) · 7.1 KB
/
findHighestSimilarity.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
##%######################################################%##
#                                                          #
####  For each copy of each hit group, this script      ####
####  finds the copy that has the highest sequence      ####
####  similarity (pID) in every other hit group         ####
####  (assuming homology between copies was detected    ####
####  by blastn).                                       ####
#                                                          #
##%######################################################%##

# This script is launched by stage 12-HTTeventCount.R.
source("HTvFunctions.R")

# The only command-line argument is the number of CPUs to use.
cliArgs <- commandArgs(trailingOnly = TRUE)
nCPUs <- as.integer(cliArgs[1])
# The output will be a table that reports on the homology of copies
# between hit groups; results go in this directory.
# recursive/showWarnings make the script safe to rerun and robust to
# missing parent directories.
dir.create("TEs/clustering/networks", recursive = TRUE, showWarnings = FALSE)

# We import the list of hits (data tables with query id, subject id,
# pID, hit group id) in each super family.
# This list was generated by 12-HTTeventCount.R.
hitList <- readRDS("TEs/clustering/forHighestSimilarity.RDS")

# To find homologous copies, we use the self-blastn files generated at
# stage 8; we list them here.
# NOTE: `pattern` is a regex — the anchored "\\.out$" matches only files
# ending in ".out" (the unescaped ".out" would also match e.g. "Xout").
blastFiles <- list.files(
    path = "TEs/clustering/blastn/done",
    pattern = "\\.out$",
    full.names = TRUE
)

# We split these file names by superfamily (large superfamilies have
# several blast files). For this we extract superfamily names from the
# file names (the part before the first "_").
superF <- splitToColumns(basename(blastFiles), "_", 1)
blastFiles <- split(blastFiles, superF)
# This is the core function that finds similarities between copies of
# different hit groups of a super family.
# Argument: superF, the superfamily name (character scalar).
# Returns: a data.table with columns copy, pID, explanatoryHitGroup,
# focalHitGroup — one row per (copy, other hit group) best match.
# Side effect: writes the same table to
# TEs/clustering/networks/<superF>.similarities.txt for safety.
findSimilarities <- function(superF) {

    # We get the HTT hits of the super family being processed.
    hits <- hitList[[superF]]

    # All the copies involved in these hits (queries and subjects).
    copies <- hits[, unique(c(q, s))]

    # This function imports blast files of copies against themselves.
    import <- function(file) { # file is the path to a blast output
        selfBlast <- fread(
            input = file,
            sep = "\t",
            header = FALSE,
            select = c(1:3),
            col.names = c("query", "subject", "pID"),
            nThread = 2
        )

        # We remove hits not involving copies in the HTT hits.
        selfBlast <- selfBlast[query %in% copies & subject %in% copies]

        # And convert pID to integer for speed and RAM usage
        # (3 decimal places preserved as thousandths).
        selfBlast[, pID := as.integer(pID * 1000)]
        selfBlast
    }

    # We apply the function in parallel for the blast files of the
    # super family, using 5 processes.
    selfBlast <- mclapply(
        X = blastFiles[[superF]],
        FUN = import,
        mc.cores = 5,
        mc.preschedule = FALSE
    )
    selfBlast <- rbindlist(selfBlast) # we rbind the hits in a single table

    # We will need to quickly retrieve all the copies that are
    # homologous to a given one (including itself). For this, we first
    # duplicate the hits by creating reciprocal ones and add self hits
    # with 100% pIDs (100000 in thousandths).
    selfBlast <- rbind(
        selfBlast, # the original hits
        selfBlast[, .(query = subject, subject = query, pID)], # the hits in "reverse"
        data.table(query = copies, subject = copies, pID = 100000L) # and the self hits
    )

    # subjectsForQuery[[x]] returns all copies that have some homology
    # with copy x (x is an integer).
    subjectsForQuery <- reList(split(selfBlast$subject, selfBlast$query))

    # We will also need to quickly retrieve the pID score of all hits
    # involving a query; the principle is the same as above.
    idForQuery <- reList(split(selfBlast$pID, selfBlast$query))

    # As well as the hit group(s) in which each copy is found, following
    # the same principle. But for this, we first have to obtain all ids
    # of copies involved in each hit group.
    copyPerHitGroup <- hits[, .(copy = unique(c(q, s))), by = hitGroup]
    hitGroupsForCopy <- reList(split(copyPerHitGroup$hitGroup, copyPerHitGroup$copy))
    copyPerHitGroup <- split(copyPerHitGroup$copy, copyPerHitGroup$hitGroup)

    # We count the number of subjects per query, used later,
    nSubjects <- lengths(subjectsForQuery)

    # as well as the number of hit groups where each copy is found.
    nHitGroups <- lengths(hitGroupsForCopy)

    rm(selfBlast) # to reclaim RAM, as there are lots of hits

    # For the copies of a given hit group ("focal" hit group), the
    # function below retrieves the best similarity each copy has with
    # every other hit group.
    findSimilaritiesForHitGroup <- function(copies, focalHitGroup) {
        # focalHitGroup is just the integer id of the hit group.

        # We first retrieve all copies that are homologous to those in
        # the hit group (i.e., 'subject' of the blast),
        subject <- unlist(subjectsForQuery[copies])

        # with the pIDs associated with these homologies.
        pID <- unlist(idForQuery[copies])

        # We place them in a data table (which requires replicating
        # copies that are homologous to several "subjects").
        pairs <- data.table(copy = rep(copies, nSubjects[copies]), subject, pID)

        # And we get the hit groups to which these subjects belong.
        explanatoryHitGroup <- unlist(hitGroupsForCopy[pairs$subject])

        # We place all this information in a table. For this, we need
        # the number of hitGroups in which the subjects are found.
        len <- nHitGroups[pairs$subject]

        # We generate the table (replacing the previous one we no
        # longer need).
        pairs <- pairs[, data.table(
            copy = rep(copy, len),
            pID = rep(pID, len),
            explanatoryHitGroup
        )]

        # We don't keep rows involving subjects that belong to the
        # focal hit group.
        pairs <- pairs[explanatoryHitGroup != focalHitGroup]

        # For each copy from focalHitGroup, we retain only the most
        # similar subject per explanatoryHitGroup (highest pID first,
        # then first occurrence wins).
        setorder(pairs, -pID)
        pairs <- pairs[!duplicated(data.table(copy, explanatoryHitGroup))]

        # We now add the focalHitGroup id to the table.
        # Doing it earlier would have increased the memory footprint.
        pairs[, focalHitGroup := focalHitGroup]

        cat(".") # progress indicator
        pairs
    }

    # We apply the above function to all hit groups of the super family.
    similarities <- Map(
        f = findSimilaritiesForHitGroup,
        copies = copyPerHitGroup,
        focalHitGroup = as.integer(names(copyPerHitGroup))
    )

    # We bind the per-hit-group tables once; the result is both written
    # to disk (for safety) and returned.
    # BUG FIX: the original returned rbindlist(hitGroupPairs), but no
    # object named hitGroupPairs exists — the function would error after
    # doing all the work. The intended object is `similarities`.
    allSimilarities <- rbindlist(similarities)

    writeT(
        data = allSimilarities,
        path = stri_c("TEs/clustering/networks/", superF, ".similarities.txt")
    )

    allSimilarities
}
# We apply the function to all super families in parallel; each element
# of `res` is the per-superfamily similarity table returned by
# findSimilarities.
res <- mclapply(
    X = names(hitList),
    FUN = findSimilarities,
    mc.cores = nCPUs,
    mc.preschedule = FALSE
)

# Concatenate all superfamilies into the combined output table.
# (The path is a constant, so no string-building call is needed.)
writeT(
    data = rbindlist(res),
    path = "TEs/clustering/networks/all.hitGroupSimilarities.txt"
)