From cccf77b2249f52be59fe1749f13a386ffaaae528 Mon Sep 17 00:00:00 2001 From: Al-Murphy Date: Mon, 15 Jan 2024 12:17:29 +0000 Subject: [PATCH] Update LDSC output format --- DESCRIPTION | 2 +- NEWS.md | 8 ++++++++ R/format_sumstats.R | 25 ++++++++++++++++++++++--- README.md | 21 ++++++++++++--------- man/check_ldsc_format.Rd | 7 ++++++- man/format_sumstats.Rd | 7 ++++++- man/import_sumstats.Rd | 7 ++++++- man/validate_parameters.Rd | 7 ++++++- man/write_sumstats.Rd | 7 ++++++- tests/testthat/test-vcf_formatting.R | 14 ++++++++++++++ vignettes/MungeSumstats.Rmd | 13 +++++++++++-- 11 files changed, 98 insertions(+), 20 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 40e8751..51ceed3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: MungeSumstats Type: Package Title: Standardise summary statistics from GWAS -Version: 1.11.2 +Version: 1.11.3 Authors@R: c(person(given = "Alan", family = "Murphy", diff --git a/NEWS.md b/NEWS.md index c13f8cf..8563586 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +## CHANGES IN VERSION 1.11.3 + +### Bug fix +* For LDSC format, rename A1 and A2 as LDSC expects A1 to be the effect column +rather than A2 (the opposite to MSS's default) - see more [here](https://groups.google.com/g/ldsc_users/c/S7FZK743w68). +Although, this didn't seem to make any difference to results in tests, see more +[here](https://github.com/neurogenomics/MungeSumstats/issues/160#issuecomment-1891899253). + ## CHANGES IN VERSION 1.11.2 ### Bug fix diff --git a/R/format_sumstats.R b/R/format_sumstats.R index c6a6f7e..cb05691 100644 --- a/R/format_sumstats.R +++ b/R/format_sumstats.R @@ -187,7 +187,12 @@ #' @param ldsc_format DEPRECATED, do not use. Use save_format="LDSC" instead. #' @param save_format Output format of sumstats. Options are NULL - standardised #' output format from MungeSumstats, LDSC - output format compatible with LDSC -#' and openGWAS - output compatible with openGWAS VCFs. Default is NULL. 
+#' and openGWAS - output compatible with openGWAS VCFs. Default is NULL. +#' **NOTE** - If LDSC format is used, the naming convention of A1 as the +#' reference (genome build) allele and A2 as the effect allele will be reversed +#' to match LDSC (A1 will now be the effect allele). See more info on this +#' [here](https://groups.google.com/g/ldsc_users/c/S7FZK743w68). Note that any +#' effect columns (e.g. Z) will be in relation to A1 now instead of A2. #' @param log_folder_ind Binary Should log files be stored containing all #' filtered out SNPs (separate file per filter). The data is outputted in the #' same format specified for the resulting sumstats file. The only exception to @@ -285,8 +290,7 @@ format_sumstats <- function(path, #### Setup multi-threading #### data.table::setDTthreads(threads = nThread) #### Setup empty variables #### - rsids <- NULL - orig_dims <- NULL + rsids <- orig_dims <- A1_n <- A2 <- A1 <- NULL log_files <- vector(mode = "list") t1 <- Sys.time() @@ -1036,6 +1040,21 @@ format_sumstats <- function(path, ### Check 39: Ensure CHR follows the requested style ### CHR <- NULL sumstats_return$sumstats_dt[, CHR := GenomeInfoDb::mapSeqlevels(CHR, style = chr_style)] + + ### IF LDSC, rename A1 and A2, effect columns are fine + if (!is.null(save_format) && + tolower(save_format)=="ldsc") { + message("Renaming A1,A2 to match LDSC format.") + #For LDSC format, rename A1 and A2 as LDSC expects A1 to be the effect + #column rather than A2 (the opposite to MSS's default) - see more + #[here](https://groups.google.com/g/ldsc_users/c/S7FZK743w68).Although, + #this didn't seem to make any difference to results in tests, see more + #https://github.com/neurogenomics/MungeSumstats/issues/160#issuecomment-1891899253 + sumstats_return$sumstats_dt[,A1_n:=A2] + sumstats_return$sumstats_dt[,A2:=A1] + sumstats_return$sumstats_dt[,A1:=A1_n] + sumstats_return$sumstats_dt[,A1_n:=NULL] + } #### WRITE data.table TO PATH #### check_save_out$save_path <- write_sumstats( 
diff --git a/README.md b/README.md index d95a0d6..50ba30a 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,22 @@ `MungeSumstats`: Standardise the format of GWAS summary statistics ================ -
Authors: Alan Murphy, Brian Schilder and Nathan Skene ¶ +
+Authors: Alan Murphy, Brian Schilder and Nathan Skene +
+
+Updated: Jan-15-2024
-
Updated: Jul-13-2023 ¶
-[![](https://img.shields.io/badge/release%20version-1.8.0-black.svg)](https://www.bioconductor.org/packages/MungeSumstats) -[![](https://img.shields.io/badge/devel%20version-1.9.11-black.svg)](https://github.com/neurogenomics/MungeSumstats) +[![](https://img.shields.io/badge/release%20version-1.10.1-black.svg)](https://www.bioconductor.org/packages/MungeSumstats) +[![](https://img.shields.io/badge/devel%20version-1.11.3-black.svg)](https://github.com/neurogenomics/MungeSumstats) [![R build status](https://github.com/neurogenomics/MungeSumstats/workflows/rworkflows/badge.svg)](https://github.com/neurogenomics/MungeSumstats/actions) [![](https://img.shields.io/github/last-commit/neurogenomics/MungeSumstats.svg)](https://github.com/neurogenomics/MungeSumstats/commits/master) [![](https://codecov.io/gh/neurogenomics/MungeSumstats/branch/master/graph/badge.svg)](https://codecov.io/gh/neurogenomics/MungeSumstats) -[![](https://img.shields.io/badge/download-5460/total-blue.svg)](https://bioconductor.org/packages/stats/bioc/MungeSumstats) +[![](https://img.shields.io/badge/download-11379/total-blue.svg)](https://bioconductor.org/packages/stats/bioc/MungeSumstats) [![License: Artistic-2.0](https://img.shields.io/badge/license-Artistic--2.0-blue.svg)](https://cran.r-project.org/web/licenses/Artistic-2.0) [![](https://img.shields.io/badge/doi-https://doi.org/10.1093/bioinformatics/btab665-blue.svg)](https://doi.org/https://doi.org/10.1093/bioinformatics/btab665) @@ -154,10 +157,10 @@ We would like to acknowledge all those who have contributed to
-1. Nathan G. Skene, T. E. B., Julien Bryois. -Genetic identification of brain cell types underlying schizophrenia. -*Nature Genetics* (2018). +1. +Nathan G. Skene, T. E. B., Julien +Bryois. Genetic identification of brain cell types underlying +schizophrenia. *Nature Genetics* (2018). doi:[10.1038/s41588-018-0129-5](https://doi.org/10.1038/s41588-018-0129-5)
diff --git a/man/check_ldsc_format.Rd b/man/check_ldsc_format.Rd index 069a8e5..7a86eed 100644 --- a/man/check_ldsc_format.Rd +++ b/man/check_ldsc_format.Rd @@ -22,7 +22,12 @@ GWAS.} \item{save_format}{Output format of sumstats. Options are NULL - standardised output format from MungeSumstats, LDSC - output format compatible with LDSC -and openGWAS - output compatible with openGWAS VCFs. Default is NULL.} +and openGWAS - output compatible with openGWAS VCFs. Default is NULL. +\strong{NOTE} - If LDSC format is used, the naming convention of A1 as the +reference (genome build) allele and A2 as the effect allele will be reversed +to match LDSC (A1 will now be the effect allele). See more info on this +\href{https://groups.google.com/g/ldsc_users/c/S7FZK743w68}{here}. Note that any +effect columns (e.g. Z) will be in relation to A1 now instead of A2.} \item{convert_n_int}{Binary, if N (the number of samples) is not an integer, should this be rounded? Default is TRUE.} diff --git a/man/format_sumstats.Rd b/man/format_sumstats.Rd index 324bbe3..86dac05 100644 --- a/man/format_sumstats.Rd +++ b/man/format_sumstats.Rd @@ -264,7 +264,12 @@ FALSE.} \item{save_format}{Output format of sumstats. Options are NULL - standardised output format from MungeSumstats, LDSC - output format compatible with LDSC -and openGWAS - output compatible with openGWAS VCFs. Default is NULL.} +and openGWAS - output compatible with openGWAS VCFs. Default is NULL. +\strong{NOTE} - If LDSC format is used, the naming convention of A1 as the +reference (genome build) allele and A2 as the effect allele will be reversed +to match LDSC (A1 will now be the effect allele). See more info on this +\href{https://groups.google.com/g/ldsc_users/c/S7FZK743w68}{here}. Note that any +effect columns (e.g. Z) will be in relation to A1 now instead of A2.} \item{log_folder_ind}{Binary Should log files be stored containing all filtered out SNPs (separate file per filter). 
The data is outputted in the diff --git a/man/import_sumstats.Rd b/man/import_sumstats.Rd index 094c5da..4083cd3 100644 --- a/man/import_sumstats.Rd +++ b/man/import_sumstats.Rd @@ -209,7 +209,12 @@ FALSE.} \item{\code{ldsc_format}}{DEPRECATED, do not use. Use save_format="LDSC" instead.} \item{\code{save_format}}{Output format of sumstats. Options are NULL - standardised output format from MungeSumstats, LDSC - output format compatible with LDSC -and openGWAS - output compatible with openGWAS VCFs. Default is NULL.} +and openGWAS - output compatible with openGWAS VCFs. Default is NULL. +\strong{NOTE} - If LDSC format is used, the naming convention of A1 as the +reference (genome build) allele and A2 as the effect allele will be reversed +to match LDSC (A1 will now be the effect allele). See more info on this +\href{https://groups.google.com/g/ldsc_users/c/S7FZK743w68}{here}. Note that any +effect columns (e.g. Z) will be in relation to A1 now instead of A2.} \item{\code{log_folder_ind}}{Binary Should log files be stored containing all filtered out SNPs (separate file per filter). The data is outputted in the same format specified for the resulting sumstats file. The only exception to diff --git a/man/validate_parameters.Rd b/man/validate_parameters.Rd index 1ebcdf0..91c2cc4 100644 --- a/man/validate_parameters.Rd +++ b/man/validate_parameters.Rd @@ -198,7 +198,12 @@ datasets this should be set to FALSE otherwise keep as TRUE. Default is TRUE.} \item{save_format}{Output format of sumstats. Options are NULL - standardised output format from MungeSumstats, LDSC - output format compatible with LDSC -and openGWAS - output compatible with openGWAS VCFs. Default is NULL.} +and openGWAS - output compatible with openGWAS VCFs. Default is NULL. +\strong{NOTE} - If LDSC format is used, the naming convention of A1 as the +reference (genome build) allele and A2 as the effect allele will be reversed +to match LDSC (A1 will now be the effect allele). 
See more info on this +\href{https://groups.google.com/g/ldsc_users/c/S7FZK743w68}{here}. Note that any +effect columns (e.g. Z) will be in relation to A1 now instead of A2.} \item{imputation_ind}{Binary Should a column be added for each imputation step to show what SNPs have imputed values for differing fields. This diff --git a/man/write_sumstats.Rd b/man/write_sumstats.Rd index 3186de7..59c6876 100644 --- a/man/write_sumstats.Rd +++ b/man/write_sumstats.Rd @@ -39,7 +39,12 @@ reference genome from the data.} \item{save_format}{Output format of sumstats. Options are NULL - standardised output format from MungeSumstats, LDSC - output format compatible with LDSC -and openGWAS - output compatible with openGWAS VCFs. Default is NULL.} +and openGWAS - output compatible with openGWAS VCFs. Default is NULL. +\strong{NOTE} - If LDSC format is used, the naming convention of A1 as the +reference (genome build) allele and A2 as the effect allele will be reversed +to match LDSC (A1 will now be the effect allele). See more info on this +\href{https://groups.google.com/g/ldsc_users/c/S7FZK743w68}{here}. Note that any +effect columns (e.g. 
Z) will be in relation to A1 now instead of A2.} \item{tabix_index}{Index the formatted summary statistics with \href{http://www.htslib.org/doc/tabix.html}{tabix} for fast querying.} diff --git a/tests/testthat/test-vcf_formatting.R b/tests/testthat/test-vcf_formatting.R index c9c2332..2aa9be4 100644 --- a/tests/testthat/test-vcf_formatting.R +++ b/tests/testthat/test-vcf_formatting.R @@ -146,6 +146,19 @@ test_that("VCF is correctly formatted", { ldsc_cols <- c("SNP", "N", "A1", "A2", "Z") testthat::expect_true(all(ldsc_cols %in% names(res))) + #also ensure A1 and A2 have been renamed + #For LDSC format, rename A1 and A2 as LDSC expects A1 to be the effect + #column rather than A2 (the opposite to MSS's default) - see more + #[here](https://groups.google.com/g/ldsc_users/c/S7FZK743w68).Although, + #this didn't seem to make any difference to results in tests, see more + #[here](https://github.com/neurogenomics/MungeSumstats/issues/160#issuecomment-1891899253). + data.table::setnames(res,c("A1","A2"),c("A2","A1")) + res[,CHR:=as.character(CHR)] + testthat::expect_true(all.equal(res[,c("SNP","CHR","BP","A1","A2","END", + "FILTER","FRQ","BETA","LP","SE", + "P")],rtrn_dt)) + + testthat::expect_equal(reformatted_lines[1:5], corr_res) } else { testthat::expect_true((is_32bit_windows||!Sys.info()["sysname"]=="Linux")) @@ -153,5 +166,6 @@ test_that("VCF is correctly formatted", { testthat::expect_true((is_32bit_windows||!Sys.info()["sysname"]=="Linux")) testthat::expect_true((is_32bit_windows||!Sys.info()["sysname"]=="Linux")) testthat::expect_true((is_32bit_windows||!Sys.info()["sysname"]=="Linux")) + testthat::expect_true((is_32bit_windows||!Sys.info()["sysname"]=="Linux")) } }) \ No newline at end of file diff --git a/vignettes/MungeSumstats.Rmd b/vignettes/MungeSumstats.Rmd index 089e745..c90e305 100644 --- a/vignettes/MungeSumstats.Rmd +++ b/vignettes/MungeSumstats.Rmd @@ -165,7 +165,11 @@ flexibility to export the reformatted file as tab-delimited, VCF or R native 
objects such as data.table, GRanges or VRanges objects. The output can also be outputted in an **LDSC ready** format which means the file can be fed directly into LDSC without the need for additional -munging. +munging. **NOTE** - If LDSC format is used, the naming convention of A1 as the +reference (genome build) allele and A2 as the effect allele will be reversed +to match LDSC (A1 will now be the effect allele). See more info on this +[here](https://groups.google.com/g/ldsc_users/c/S7FZK743w68). Note that any +effect columns (e.g. Z) will be in relation to A1 now instead of A2. # Data @@ -419,7 +423,12 @@ conducted by *MungeSumstats* are: ("data.table","vranges","granges"). - **save_format** Ensure that output format meets all requirements to be passed directly into LDSC ("ldsc") without the need for additional - munging or for IEU OpenGWAS format ("opengwas") before saving as a VCF + munging or for IEU OpenGWAS format ("opengwas") before saving as a VCF. + **NOTE** - If LDSC format is used, the naming convention of A1 as the + reference (genome build) allele and A2 as the effect allele will be reversed + to match LDSC (A1 will now be the effect allele). See more info on this + [here](https://groups.google.com/g/ldsc_users/c/S7FZK743w68). Note that any + effect columns (e.g. Z) will be in relation to A1 now instead of A2. - **log_folder_ind** Should log files be stored containing all filtered out SNPs (separate file per filter). The data is outputted in the same format specified for the resulting sumstats file.