From 16afb54bd60847003b45235287999200b40a272d Mon Sep 17 00:00:00 2001 From: Al-Murphy Date: Wed, 24 Apr 2024 15:17:29 +0100 Subject: [PATCH] enable local chain files --- DESCRIPTION | 2 +- NEWS.md | 6 ++++++ R/format_sumstats.R | 10 +++++++++- R/liftover.R | 25 +++++++++++++++++++++++-- R/validate_parameters.R | 9 +++++++++ README.Rmd | 1 + README.md | 3 ++- longtests/testthat/test-liftover.R | 27 +++++++++++++++++++++++++++ man/format_sumstats.Rd | 7 +++++++ man/import_sumstats.Rd | 5 +++++ man/liftover.Rd | 7 +++++++ man/validate_parameters.Rd | 7 +++++++ 12 files changed, 104 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 45be2e5..d7c6df9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: MungeSumstats Type: Package Title: Standardise summary statistics from GWAS -Version: 1.11.9 +Version: 1.11.10 Authors@R: c(person(given = "Alan", family = "Murphy", diff --git a/NEWS.md b/NEWS.md index 253e702..394a403 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +## CHANGES IN VERSION 1.11.10 + +### New features +* Can now pass local chain files for liftover (`local_chain` in +`format_sumstats()` and `liftover()`). + ## CHANGES IN VERSION 1.11.9 ### New features diff --git a/R/format_sumstats.R b/R/format_sumstats.R index e66c8a5..4a404f4 100644 --- a/R/format_sumstats.R +++ b/R/format_sumstats.R @@ -48,6 +48,11 @@ #' @param chain_source source of the chain file to use in liftover, if converting #' genome build ("ucsc" or "ensembl"). Note that the UCSC chain files require a #' license for commercial use. The Ensembl chain is used by default ("ensembl"). +#' @param local_chain Path to local chain file to use instead of downlaoding. +#' Default of NULL i.e. no local file to use. NOTE if passing a local chain file +#' make sure to specify the path to convert from and to the correct build like +#' GRCh37 to GRCh38. We can not sense check this for local files. The chain file +#' can be submitted as a gz file (as downloaed from source) or unzipped. #' @param convert_small_p Binary, should non-negative #' p-values <= 5e-324 be converted to 0? #' Small p-values pass the R limit and can cause errors with LDSC/MAGMA and @@ -239,6 +244,7 @@ format_sumstats <- function(path, ref_genome = NULL, convert_ref_genome = NULL, chain_source = "ensembl", + local_chain = NULL, convert_small_p = TRUE, convert_large_p = TRUE, convert_neg_p = TRUE, @@ -377,6 +383,7 @@ format_sumstats <- function(path, mapping_file = mapping_file, tabix_index = tabix_index, chain_source = chain_source, + local_chain = local_chain, drop_na_cols = drop_na_cols, #deprecated parameters rmv_chrPrefix = rmv_chrPrefix @@ -1047,7 +1054,8 @@ format_sumstats <- function(path, convert_ref_genome = convert_ref_genome, ref_genome = ref_genome, imputation_ind = imputation_ind, - chain_source = chain_source + chain_source = chain_source, + local_chain = local_chain ) #update ref genome of data if(!is.null(convert_ref_genome)) diff --git a/R/liftover.R b/R/liftover.R index 534fccf..58e5268 100644 --- a/R/liftover.R +++ b/R/liftover.R @@ -50,6 +50,7 @@ liftover <- function(sumstats_dt, end_col = start_col, as_granges = FALSE, style = "NCBI", + local_chain = NULL, verbose = TRUE) { IMPUTATION_gen_build <- width <- strand <- end <- seqnames <- NULL; @@ -63,6 +64,14 @@ liftover <- function(sumstats_dt, "ensembl")){ stop(chain_msg) } + #check local chain file + if (!is.null(local_chain) && !file.exists(local_chain)){ + lcl_chain_msg <- paste0( + "The local_chain parameter is invalid, please chose a valid path to a ", + "local chain file or leave as NULL to download a chain file." + ) + stop(lcl_chain_msg) + } #Only continue with checks if user specifies a genome to convert to if (!is.null(convert_ref_genome)){ #### Map genome build synonyms #### @@ -110,12 +119,24 @@ liftover <- function(sumstats_dt, end.field = end_col ) #### Specify chain file #### - chain <- get_chain_file( + if (is.null(local_chain)) { + if (verbose) {message("Downloading chain file...")} + chain <- get_chain_file( from = query_ucsc, to = target_ucsc, chain_source = chain_source, verbose = verbose - ) + ) + } + else { + if (verbose) {message("Using local chain file...")} + #unzip if necessary + filetype = summary(file(local_chain))$class + if(filetype=='gzfile'){ + local_chain <- R.utils::gunzip(local_chain, overwrite=TRUE) + } + chain <- rtracklayer::import.chain(local_chain) + } #### Liftover #### gr_lifted <- unlist(rtracklayer::liftOver( x = gr, diff --git a/R/validate_parameters.R b/R/validate_parameters.R index b1637d5..a81b52a 100644 --- a/R/validate_parameters.R +++ b/R/validate_parameters.R @@ -46,6 +46,7 @@ validate_parameters <- function(path, mapping_file, tabix_index, chain_source, + local_chain, drop_na_cols, #deprecated parameters rmv_chrPrefix) { @@ -90,6 +91,14 @@ validate_parameters <- function(path, "ensembl")){ stop(chain_msg) } + #check local chain file + if (!is.null(local_chain) && !file.exists(local_chain)){ + lcl_chain_msg <- paste0( + "The local_chain parameter is invalid, please chose a valid path to a ", + "local chain file or leave as NULL to download a chain file." + ) + stop(lcl_chain_msg) + } #check dbSNP version avail_dbSNP <- c(144,155) dbSNP_msg <- paste0( diff --git a/README.Rmd b/README.Rmd index e206337..a1a3dab 100644 --- a/README.Rmd +++ b/README.Rmd @@ -143,5 +143,6 @@ development: * [Kitty Murphy](https://github.com/KittyMurphy) * [Mykhaylo Malakhov](https://github.com/MykMal) * [Alasdair Warwick](https://github.com/rmgpanw) + * [Ao Lu](https://github.com/leoarrow1) # References diff --git a/README.md b/README.md index 7549280..7dc342f 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ [![](https://img.shields.io/badge/release%20version-1.10.1-black.svg)](https://www.bioconductor.org/packages/MungeSumstats) -[![](https://img.shields.io/badge/devel%20version-1.11.9-black.svg)](https://github.com/neurogenomics/MungeSumstats) +[![](https://img.shields.io/badge/devel%20version-1.11.10-black.svg)](https://github.com/neurogenomics/MungeSumstats) [![R build status](https://github.com/neurogenomics/MungeSumstats/workflows/rworkflows/badge.svg)](https://github.com/neurogenomics/MungeSumstats/actions) [![](https://img.shields.io/github/last-commit/neurogenomics/MungeSumstats.svg)](https://github.com/neurogenomics/MungeSumstats/commits/master) @@ -151,6 +151,7 @@ We would like to acknowledge all those who have contributed to - [Kitty Murphy](https://github.com/KittyMurphy) - [Mykhaylo Malakhov](https://github.com/MykMal) - [Alasdair Warwick](https://github.com/rmgpanw) +- [Ao Lu](https://github.com/leoarrow1) # References diff --git a/longtests/testthat/test-liftover.R b/longtests/testthat/test-liftover.R index 1169f15..63bc04b 100644 --- a/longtests/testthat/test-liftover.R +++ b/longtests/testthat/test-liftover.R @@ -123,6 +123,33 @@ test_that("liftover works", { # drop 1 row from org not in chain files ref_37_org <- ref_37_org[SNP %in% ref_37$SNP, ] expect_equal(all.equal(ref_37_org, ref_37), TRUE) + + #test local chain file + save_dir <- tempdir() + lcl_chain <- system.file("extdata",'GRCh37_to_GRCh38.chain.gz', + package="MungeSumstats") + copied <- R.utils::copyFile( + srcPathname = lcl_chain, + overwrite=TRUE, + save_dir) + reformatted4 <- MungeSumstats::format_sumstats( + path = file, + ref_genome = "GRCh37", + on_ref_genome = TRUE, + convert_ref_genome = "GRCh38", + strand_ambig_filter = FALSE, + bi_allelic_filter = FALSE, + allele_flip_check = FALSE, + log_folder_ind = TRUE, + imputation_ind = TRUE, + dbSNP=144, + local_chain = paste0(save_dir,'/GRCh37_to_GRCh38.chain.gz') + ) + lcl_ss <- data.table::fread(reformatted$sumstats) + remote_ss <- data.table::fread(reformatted4$sumstats) + expect_equal(all.equal(lcl_ss,remote_ss)) + + } else { expect_equal((is_32bit_windows||!Sys.info()["sysname"]=="Linux"), TRUE) } diff --git a/man/format_sumstats.Rd b/man/format_sumstats.Rd index b5eabe0..d8c9cce 100644 --- a/man/format_sumstats.Rd +++ b/man/format_sumstats.Rd @@ -9,6 +9,7 @@ format_sumstats( ref_genome = NULL, convert_ref_genome = NULL, chain_source = "ensembl", + local_chain = NULL, convert_small_p = TRUE, convert_large_p = TRUE, convert_neg_p = TRUE, @@ -82,6 +83,12 @@ not match. Default is not to convert the genome build (NULL).} genome build ("ucsc" or "ensembl"). Note that the UCSC chain files require a license for commercial use. The Ensembl chain is used by default ("ensembl").} +\item{local_chain}{Path to local chain file to use instead of downlaoding. +Default of NULL i.e. no local file to use. NOTE if passing a local chain file +make sure to specify the path to convert from and to the correct build like +GRCh37 to GRCh38. We can not sense check this for local files. The chain file +can be submitted as a gz file (as downloaed from source) or unzipped.} + \item{convert_small_p}{Binary, should non-negative p-values <= 5e-324 be converted to 0? Small p-values pass the R limit and can cause errors with LDSC/MAGMA and diff --git a/man/import_sumstats.Rd b/man/import_sumstats.Rd index 44a3b3f..566adf9 100644 --- a/man/import_sumstats.Rd +++ b/man/import_sumstats.Rd @@ -68,6 +68,11 @@ not match. Default is not to convert the genome build (NULL).} \item{\code{chain_source}}{source of the chain file to use in liftover, if converting genome build ("ucsc" or "ensembl"). Note that the UCSC chain files require a license for commercial use. The Ensembl chain is used by default ("ensembl").} + \item{\code{local_chain}}{Path to local chain file to use instead of downlaoding. +Default of NULL i.e. no local file to use. NOTE if passing a local chain file +make sure to specify the path to convert from and to the correct build like +GRCh37 to GRCh38. We can not sense check this for local files. The chain file +can be submitted as a gz file (as downloaed from source) or unzipped.} \item{\code{convert_small_p}}{Binary, should non-negative p-values <= 5e-324 be converted to 0? Small p-values pass the R limit and can cause errors with LDSC/MAGMA and diff --git a/man/liftover.Rd b/man/liftover.Rd index 4eef3f3..e4212de 100644 --- a/man/liftover.Rd +++ b/man/liftover.Rd @@ -24,6 +24,7 @@ liftover( end_col = start_col, as_granges = FALSE, style = "NCBI", + local_chain = NULL, verbose = TRUE ) } @@ -66,6 +67,12 @@ instead of a \link[data.table]{data.table} (default: \code{FALSE}).} \item{style}{Style to return \link[GenomicRanges]{GRanges} object in (e.g. "NCBI" = 4; "UCSC" = "chr4";) (default: \code{"NCBI"}).} +\item{local_chain}{Path to local chain file to use instead of downlaoding. +Default of NULL i.e. no local file to use. NOTE if passing a local chain file +make sure to specify the path to convert from and to the correct build like +GRCh37 to GRCh38. We can not sense check this for local files. The chain file +can be submitted as a gz file (as downloaed from source) or unzipped.} + \item{verbose}{Print messages.} } \value{ diff --git a/man/validate_parameters.Rd b/man/validate_parameters.Rd index f6bcaac..af6bd25 100644 --- a/man/validate_parameters.Rd +++ b/man/validate_parameters.Rd @@ -48,6 +48,7 @@ validate_parameters( mapping_file, tabix_index, chain_source, + local_chain, drop_na_cols, rmv_chrPrefix ) @@ -237,6 +238,12 @@ data(sumstatsColHeaders) for default mapping and necessary format.} genome build ("ucsc" or "ensembl"). Note that the UCSC chain files require a license for commercial use. The Ensembl chain is used by default ("ensembl").} +\item{local_chain}{Path to local chain file to use instead of downlaoding. +Default of NULL i.e. no local file to use. NOTE if passing a local chain file +make sure to specify the path to convert from and to the correct build like +GRCh37 to GRCh38. We can not sense check this for local files. The chain file +can be submitted as a gz file (as downloaed from source) or unzipped.} + \item{drop_na_cols}{A character vector of column names to be checked for missing values. Rows with missing values in any of these columns (if present in the dataset) will be dropped. If \code{NULL}, all columns will be checked for