diff --git a/DESCRIPTION b/DESCRIPTION index de4d31a..c6ba8d7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: LDWeaver Type: Package Title:Genomewide Epistasis Analysis on Bacteria -Version: 1.2.0 +Version: 1.3.1 Authors@R: person("Sudaraka", "Mallawaarachchi", email = "smallawaarachchi@gmail.com", role = c("aut", "cre")) Maintainer: Sudaraka Mallawaarachchi Description:Perform genomewide epistasis analysis by evaluating the LD structure in bacteria. diff --git a/R/BacGWES.R b/R/BacGWES.R index afa4cf2..03a42d6 100644 --- a/R/BacGWES.R +++ b/R/BacGWES.R @@ -18,10 +18,9 @@ #' maf_freq filter. Eg: Under default filter values, a site with allele frequencies A:0.85, C:0.0095, N:0.1405 will be respectively dropped and allowed by 'default' and 'relaxed' methods. #' @param gap_freq sites with a gap frequency >gap_greq will be dropped (default = 0.15) #' @param maf_freq sites with a minor allele frequency . If unavailable or if annotations are not required, set SnpEff_Annotate = F #' @param hdw_threshold Hamming distance similarity threshold (default = 0.1, i.e. 10\%) - lower values will force stricter population structure control at the cost of masking real signal. #' @param perform_SR_analysis_only specify whether to only perform the short range link analysis (default = FALSE) -#' @param SnpEff_Annotate specify whether to perform annotations using SnpEff +#' @param SnpEff_Annotate specify whether to perform annotations using SnpEff (default = TRUE) #' @param sr_dist links less than apart are considered 'short range' (default = 20000), range 1000 - 25000 bp. #' @param lr_retain_links specify the maximum number of long-range MI links to retain (default = 1000000) - in each block, only a top subset of links will be saved #' @param max_tophits specify the maximum number of short range links to save as . Note: all short-range links will be annotated (and saved separately), @@ -46,24 +45,27 @@ #' @export LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path = NULL, gff3_path = NULL, ref_fasta_path = NULL, validate_ref_ann_lengths = T, snp_filt_method = "default", - gap_freq = 0.15, maf_freq = 0.01, snpeff_jar_path = NULL, hdw_threshold = 0.1, - perform_SR_analysis_only = F, SnpEff_Annotate = T, sr_dist = 20000, lr_retain_links = 1e6, + gap_freq = 0.15, maf_freq = 0.01, hdw_threshold = 0.1, perform_SR_analysis_only = F, + SnpEff_Annotate = T, sr_dist = 20000, lr_retain_links = 1e6, max_tophits = 250, num_clusts_CDS = 3, srp_cutoff = 3, tanglegram_break_segments = 5, multicore = T, max_blk_sz = 10000, ncores = NULL, save_additional_outputs = F){ # Build blocks # BLK1: Extract SNPs and create sparse Mx from MSA (fasta) - # BLK2: Parse GBK + # BLK2: Parse GBK or GFF+REF # BLK3: Estimate diversity within each CDS, cluster and paint < # possible inputs on methods> # BLK4: Compute Hamming Distance weights # BLK5: Compute MI between all links, sr_links model fitter, ARACNE - # BLK6: GWES_plots - # BLK7: Snpeff annotation pipeline, dtermine tophits - # BLK8: Tanglegram (depends: chromoMap) - # BLK9: GWESExplorer (depends: GWESExplorer) + # BLK6: Genomewide LD Map + # BLK7: GWES_plots + # BLK8: Snpeff annotation pipeline, determine tophits + # BLK9: Tanglegram (depends: chromoMap) + # BLK10: GWESExplorer (depends: GWESExplorer) + # BLK11: Cleanup #TODO: Add the option to provide genbank file without reference sequence #TODO: Count through blocks and automate the displayed BLOCK NUMBER + #NOTE: SnpEff does not parse the GBK and GFF3 file from the same refseq reference genome the same way. There might be differences between annotations/tophits/etc. # # Welcome message # # # Sanity checks @@ -72,10 +74,15 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path if(!is.null(gff3_path) & is.null(ref_fasta_path)) stop("Reference fasta file must be provided for gff3 annoations") # only one of gbk or gff can be NULL if(SnpEff_Annotate == T) { + # Added snpEff to inst/extdata + snpeff_jar_path = system.file("extdata", "snpEff.jar", package = "LDWeaver") + ######################################## These checks must be unnecessary now ######################################## if(is.null(snpeff_jar_path)) stop(" must be provided for annotations. To run without annotations, set SnpEff_Annotate = F") if(!file.exists(snpeff_jar_path)) stop(paste(" not found at:", snpeff_jar_path, "please check the path provided")) + ###################################################################################################################### order_links = F # sr_links should be ordered at the end after adding annotations } else { + snpeff_jar_path = NULL order_links = T # sr_links will be ordered and saved without annotations } @@ -189,6 +196,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path ######## Welcome message ######## { timestamp() + cat("\n ***** This is LDWeaver", as.character(packageVersion(pkg = "LDWeaver")), " *****") if(ncores > 1) cat(paste("\n\n Performing GWES analysis on:", dset, " - using", ncores, "cores\n\n")) if(ncores == 1) cat(paste("\n\n Performing GWES analysis on:", dset, "\n\n")) if(perform_SR_analysis_only) cat("Only short-range analysis requested. \n") @@ -360,6 +368,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path # BLK7 cat("\n\n #################### BLOCK 8 #################### \n\n") if(SnpEff_Annotate == F){ + LDWeaver::cleanup(dset = dset, delete_after_moving = F) cat(paste("\n\n ** All done in", round(difftime(Sys.time(), t_global, units = "mins"), 3), "m ** \n")) return() } diff --git a/R/SnpEffAnnotations.R b/R/SnpEffAnnotations.R index 644176b..eeb713e 100644 --- a/R/SnpEffAnnotations.R +++ b/R/SnpEffAnnotations.R @@ -29,7 +29,8 @@ perform_snpEff_annotations = function(dset_name, annotation_folder, snpeff_jar, snp.dat, cds_var, links_df, gbk = NULL, gbk_path = NULL, gff = NULL, tophits_path = NULL, max_tophits = 250, links_type = "SR"){ - #TODO: We don't need both gbk and gbk_path, save gbk_path to the gbk when parsing and read from here (same for gff) + #TODO: Can we bundle snpEff for local run? Conda version can install SnpEff as a dependency, devtools version can have it in inst/extdata + #TODO: We don't need both gbk and gbk_path, save gbk_path to the gbk when parsing and read from here (same for gff) # only one of gbk or gff can be NULL if( (is.null(gbk) & is.null(gff)) | (!is.null(gbk) & !is.null(gff)) ) stop("Provide either one of gbk or gff") diff --git a/R/computePairwiseMI.R b/R/computePairwiseMI.R index fa0f457..eed9bb8 100644 --- a/R/computePairwiseMI.R +++ b/R/computePairwiseMI.R @@ -398,6 +398,7 @@ mergeNsort_sr_links = function(cds_var, sr_links, sr_dist, plt_path, srp_cutoff) # print(summary(maxvls$max)) # plot(maxvls) + saveRDS(object = maxvls, file = file.path(plt_path, paste("c", i, "_fit_data.rds", sep= ""))) ggplot2::ggsave(filename = file.path(plt_path, paste("c", i, "_fit.png", sep= "")), plot = p_cf, width = 2200, height = 1200, units = "px") cat(paste(" Done in", round(difftime(Sys.time(), t0, units = "secs"), 2), "s \n")) diff --git a/R/io_functions.R b/R/io_functions.R index c640a2d..fc814b0 100644 --- a/R/io_functions.R +++ b/R/io_functions.R @@ -242,7 +242,15 @@ cleanup = function(dset, delete_after_moving = F){ files = dir(dset) ##### Additional Outputs ##### - idx = c(grep("*.rds", files)) + # New fit data 20231102 + idx = grep("^c*[:0-999:]_fit_data.rds$", files) + if(length(idx) > 0){ + fldr = file.path(dset, "Fit") + cleanup_support(files = file.path(dset, files[idx]), fldr) + mv_success = c(mv_success, idx) + } + + idx = c(grep("cds_var.rds", files), grep("hdw.rds", files), grep("parsed_gbk.rds", files), grep("snp_ACGTN.rds", files)) if(length(idx) > 0){ fldr = file.path(dset, "Additional_Outputs") cleanup_support(files = file.path(dset, files[idx]), fldr) diff --git a/README.md b/README.md index 783581b..1d1641e 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,8 @@ LDWeaver accepts a sequence alignment (fasta) and its reference annotation pairs of variants (links) that is unusually high given the genomic distance between the pair. This high LD could be the result of co-selection or epistasis. Approximate statistical significance is used to rank outlier links and the -output is reported in `tsv` format, along with several other helpful figures. -Additionally, LDWeaver has functions assist the detection of genomic regions +output is reported in `tsv` format, along with several other helpful annotations and figures. +Additionally, LDWeaver has functions to assist the detection of genomic regions that have potentially undergone co-selection or epistasis. LDWeaver `tsv` output can be directly used as input for GWESExplorer @@ -140,18 +140,10 @@ folder called `sample`, which should be created in the current working directory ## Performing Annotations -Additionally, LDWeaver has an interface to perform detailed annotations -using -SnpEff. -Once downloaded, set the two options: `SnpEff_Annotate=T` and -`snpeff_jar_path=` - -> **Note** Since the genbank annotation file is provided, LDWeaver only requires -> the path to the \ file. You can download this by visiting the -> SnpEff github page. - -Once set, this will create these outputs in the \ folder. Note that `X` in -the following outputs refer to **sr** (short range) or **lr** (long range). +By default, LDWeaver performs detailed annotations on all link SNPs using +SnpEff. +This will create the following outputs in \. Note that `X` here +refers to **sr** (short range) or **lr** (long range). - Outputs @@ -168,7 +160,7 @@ the following outputs refer to **sr** (short range) or **lr** (long range). GWESExplorer (X = sr,lr). -> **Note** The default srp_cutoff is 3 (i.e. p=0.001). Short-range links +> **Note** The default srp_cutoff is 3 (i.e., p=0.001). Short-range links > with p\>0.001 are automatically discarded, this can be modified using > the \ option. The default max_tophits value is 250, this > can be modified using the \ option. @@ -195,17 +187,16 @@ To cite LDWeaver please use: Mallawaarachchi, Sudaraka et al. Detecting co-selec ## Detailed Workthrough using Real Data -The following analysis demonstrates some of the current options available in +The following analysis demonstrates most of the options available in LDWeaver. The alignment with 616 *S. pnuemoniae* genomes is available here, and -the same sample.gbk annotation was used to generate this alignment. This annotation is also available -here. +the same sample.gbk annotation was used to generate this alignment (also available here. For this example, it is assumed that the current working directory is set to `~/LDWeaver_run` and the alignment -and \ file (see [above](#performing-annotations)) are available in the same folder. Please note that file paths here are written for Linux and macOS operating systems, -windows users will need to modify as required. +is available in the same folder. Please note that file paths here are written for Linux/macOS operating systems, +windows users will need to modify as needed. The following few lines of code can perform the complete LDWeaver analysis. @@ -218,19 +209,19 @@ setwd("~/LDWeaver_run") dset <- "msch" aln_path <- "spn23f_msch.aln.gz" gbk_path <- system.file("extdata", "sample.gbk", package = "LDWeaver") -snpeff_jar_path <- "snpEff.jar" LDWeaver::LDWeaver(dset = dset, aln_path = aln_path, gbk_path = gbk_path, - snpeff_jar_path = snpeff_jar_path, save_additional_outputs = T) ``` -While the `LDWeaver::LDWeaver()` one-liner function is generally -versatile for most analyses, it is possible to write a customised -pipeline using available functions. For a full list of available functions -and options, see: `help(package="LDWeaver")`. +`LDWeaver::LDWeaver()` one-liner is versatile for most +analyses. If previously created outputs are available in \, this +function will load those instead of repeating possibly time and resource heavy analysis. + +It is also possible to write customised pipelines using available functions. For a full list of available functions +and options, run: `help(package="LDWeaver")`. ``` r library(LDWeaver) @@ -239,7 +230,6 @@ dir.create(dset) # folder to save outputs aln_path <- "~/LDWeaver_run/spn23f_msch.aln.gz" gbk_path <- system.file("extdata", "sample.gbk", package = "LDWeaver") -snpeff_jar_path <- "~/LDWeaver_run/snpEff.jar" ncores = parallel::detectCores() snp.dat = LDWeaver::parse_fasta_alignment(aln_path = aln_path) # parse the alignment and extract SNPs @@ -276,8 +266,8 @@ LDWeaver::make_gwes_plots(sr_links = sr_links, plt_folder = dset) ``` r # Identify the top hits by performing snpEff annotations tophits = LDWeaver::perform_snpEff_annotations(dset_name = dset, annotation_folder = file.path(getwd(), dset), - snpeff_jar = snpeff_jar_path, gbk = gbk, gbk_path = gbk_path, - cds_var = cds_var, links_df = sr_links, snp.dat = snp.dat, + gbk = gbk, gbk_path = gbk_path, cds_var = cds_var, + links_df = sr_links, snp.dat = snp.dat, tophits_path = "msch/sr_tophits.tsv") ``` @@ -311,8 +301,7 @@ Next step is to analyse the long range links # Analyse long range links LDWeaver::analyse_long_range_links(dset = dset, lr_links_path = "msch/lr_links.tsv", sr_links_path = "msch/sr_links.tsv", SnpEff_Annotate = T, - snp.dat = snp.dat, snpeff_jar_path = snpeff_jar_path, - gbk_path = gbk_path, cds_var = cds_var) + snp.dat = snp.dat, gbk_path = gbk_path, cds_var = cds_var) ``` ![](inst/sup/lr_gwes.png) @@ -339,10 +328,13 @@ sites and their magnitude can be generated using: ``` r # Generate the Network Plot for pbp genes -LDWeaver::create_network(LDWeaver::create_network_for_gene("pbp", + +network = LDWeaver::create_network_for_gene("pbp", sr_annotated_path = "msch/Annotated_links/sr_links_annotated.tsv", lr_annotated_path = "msch/Annotated_links/lr_links_annotated.tsv", - level = 2), + level = 2) + +LDWeaver::create_network(network, plot_title = "pbp network", netplot_path = "msch/pbp_network.png", plot_w = 2000, plot_h = 2000) diff --git a/inst/extdata/snpEff.jar b/inst/extdata/snpEff.jar new file mode 100644 index 0000000..c7a2bce Binary files /dev/null and b/inst/extdata/snpEff.jar differ diff --git a/man/LDWeaver.Rd b/man/LDWeaver.Rd index b6b8e85..015ee6e 100644 --- a/man/LDWeaver.Rd +++ b/man/LDWeaver.Rd @@ -16,7 +16,6 @@ LDWeaver( snp_filt_method = "default", gap_freq = 0.15, maf_freq = 0.01, - snpeff_jar_path = NULL, hdw_threshold = 0.1, perform_SR_analysis_only = F, SnpEff_Annotate = T, @@ -57,13 +56,11 @@ maf_freq filter. Eg: Under default filter values, a site with allele frequencies \item{maf_freq}{sites with a minor allele frequency . If unavailable or if annotations are not required, set SnpEff_Annotate = F} - \item{hdw_threshold}{Hamming distance similarity threshold (default = 0.1, i.e. 10\%) - lower values will force stricter population structure control at the cost of masking real signal.} \item{perform_SR_analysis_only}{specify whether to only perform the short range link analysis (default = FALSE)} -\item{SnpEff_Annotate}{specify whether to perform annotations using SnpEff} +\item{SnpEff_Annotate}{specify whether to perform annotations using SnpEff (default = TRUE)} \item{sr_dist}{links less than apart are considered 'short range' (default = 20000), range 1000 - 25000 bp.}