From cccf77b2249f52be59fe1749f13a386ffaaae528 Mon Sep 17 00:00:00 2001
From: Al-Murphy <alanmurph94@hotmail.com>
Date: Mon, 15 Jan 2024 12:17:29 +0000
Subject: [PATCH] Update LDSC output format

---
 DESCRIPTION                          |  2 +-
 NEWS.md                              |  8 ++++++++
 R/format_sumstats.R                  | 25 ++++++++++++++++++++++---
 README.md                            | 21 ++++++++++++---------
 man/check_ldsc_format.Rd             |  7 ++++++-
 man/format_sumstats.Rd               |  7 ++++++-
 man/import_sumstats.Rd               |  7 ++++++-
 man/validate_parameters.Rd           |  7 ++++++-
 man/write_sumstats.Rd                |  7 ++++++-
 tests/testthat/test-vcf_formatting.R | 14 ++++++++++++++
 vignettes/MungeSumstats.Rmd          | 13 +++++++++++--
 11 files changed, 98 insertions(+), 20 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 40e8751..51ceed3 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: MungeSumstats
 Type: Package
 Title: Standardise summary statistics from GWAS
-Version: 1.11.2
+Version: 1.11.3
 Authors@R:
     c(person(given = "Alan",
            family = "Murphy",
diff --git a/NEWS.md b/NEWS.md
index c13f8cf..8563586 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,11 @@
+## CHANGES IN VERSION 1.11.3
+
+### Bug fix
+* For LDSC format, rename A1 and A2 as LDSC expects A1 to be the effect allele
+rather than A2 (the opposite to MSS's default) - see more [here](https://groups.google.com/g/ldsc_users/c/S7FZK743w68).
+However, this didn't seem to make any difference to results in tests - see more
+[here](https://github.com/neurogenomics/MungeSumstats/issues/160#issuecomment-1891899253).
+
 ## CHANGES IN VERSION 1.11.2
 
 ### Bug fix
diff --git a/R/format_sumstats.R b/R/format_sumstats.R
index c6a6f7e..cb05691 100644
--- a/R/format_sumstats.R
+++ b/R/format_sumstats.R
@@ -187,7 +187,12 @@
 #' @param ldsc_format DEPRECATED, do not use. Use save_format="LDSC" instead.
 #' @param save_format Output format of sumstats. Options are NULL - standardised
 #' output format from MungeSumstats, LDSC - output format compatible with LDSC
-#' and openGWAS - output compatible with openGWAS VCFs. Default is NULL.
+#' and openGWAS - output compatible with openGWAS VCFs. Default is NULL. 
+#' **NOTE** - If LDSC format is used, the naming convention of A1 as the 
+#' reference (genome build) allele and A2 as the effect allele will be reversed
+#' to match LDSC (A1 will now be the effect allele). See more info on this 
+#' [here](https://groups.google.com/g/ldsc_users/c/S7FZK743w68). Note that any 
+#' effect columns (e.g. Z) will be in relation to A1 now instead of A2.
 #' @param log_folder_ind Binary Should log files be stored containing all
 #' filtered out SNPs (separate file per filter). The data is outputted in the
 #' same format specified for the resulting sumstats file. The only exception to
@@ -285,8 +290,7 @@ format_sumstats <- function(path,
     #### Setup multi-threading ####
     data.table::setDTthreads(threads = nThread)
     #### Setup empty variables ####
-    rsids <- NULL
-    orig_dims <- NULL
+    rsids <- orig_dims <- A1_n <- A2 <- A1 <- NULL
     log_files <- vector(mode = "list")
     t1 <- Sys.time()
 
@@ -1036,6 +1040,21 @@ format_sumstats <- function(path,
         ### Check 39: Ensure CHR follows the requested style ###
         CHR <- NULL
         sumstats_return$sumstats_dt[, CHR := GenomeInfoDb::mapSeqlevels(CHR, style = chr_style)]
+        
+        ### IF LDSC, rename A1 and A2, effect columns are fine
+        if (!is.null(save_format) && 
+            tolower(save_format)=="ldsc") {
+          message("Renaming A1,A2 to match LDSC format.")
+          #For LDSC format, rename A1 and A2 as LDSC expects A1 to be the effect 
+          #column rather than A2 (the opposite to MSS's default) - see more 
+          #[here](https://groups.google.com/g/ldsc_users/c/S7FZK743w68). Although,
+          #this didn't seem to make any difference to results in tests, see more
+          #https://github.com/neurogenomics/MungeSumstats/issues/160#issuecomment-1891899253
+          sumstats_return$sumstats_dt[,A1_n:=A2]
+          sumstats_return$sumstats_dt[,A2:=A1]
+          sumstats_return$sumstats_dt[,A1:=A1_n]
+          sumstats_return$sumstats_dt[,A1_n:=NULL]
+        }
 
         #### WRITE data.table TO PATH ####
         check_save_out$save_path <- write_sumstats(
diff --git a/README.md b/README.md
index d95a0d6..50ba30a 100644
--- a/README.md
+++ b/README.md
@@ -1,19 +1,22 @@
 `MungeSumstats`: Standardise the format of GWAS summary statistics
 ================
-<h5> ¶ <i>Authors</i>: Alan Murphy, Brian Schilder and Nathan Skene ¶
+<h5>  
+<i>Authors</i>: Alan Murphy, Brian Schilder and Nathan Skene  
+</h5>
+<h5>  
+<i>Updated</i>: Jan-15-2024  
 </h5>
-<h5> ¶ <i>Updated</i>: Jul-13-2023 ¶ </h5>
 
 <!-- Readme.md is generated from Readme.Rmd. Please edit that file -->
 <!-- badges: start -->
 
-[![](https://img.shields.io/badge/release%20version-1.8.0-black.svg)](https://www.bioconductor.org/packages/MungeSumstats)
-[![](https://img.shields.io/badge/devel%20version-1.9.11-black.svg)](https://github.com/neurogenomics/MungeSumstats)
+[![](https://img.shields.io/badge/release%20version-1.10.1-black.svg)](https://www.bioconductor.org/packages/MungeSumstats)
+[![](https://img.shields.io/badge/devel%20version-1.11.3-black.svg)](https://github.com/neurogenomics/MungeSumstats)
 [![R build
 status](https://github.com/neurogenomics/MungeSumstats/workflows/rworkflows/badge.svg)](https://github.com/neurogenomics/MungeSumstats/actions)
 [![](https://img.shields.io/github/last-commit/neurogenomics/MungeSumstats.svg)](https://github.com/neurogenomics/MungeSumstats/commits/master)
 [![](https://codecov.io/gh/neurogenomics/MungeSumstats/branch/master/graph/badge.svg)](https://codecov.io/gh/neurogenomics/MungeSumstats)
-[![](https://img.shields.io/badge/download-5460/total-blue.svg)](https://bioconductor.org/packages/stats/bioc/MungeSumstats)
+[![](https://img.shields.io/badge/download-11379/total-blue.svg)](https://bioconductor.org/packages/stats/bioc/MungeSumstats)
 [![License:
 Artistic-2.0](https://img.shields.io/badge/license-Artistic--2.0-blue.svg)](https://cran.r-project.org/web/licenses/Artistic-2.0)
 [![](https://img.shields.io/badge/doi-https://doi.org/10.1093/bioinformatics/btab665-blue.svg)](https://doi.org/https://doi.org/10.1093/bioinformatics/btab665)
@@ -154,10 +157,10 @@ We would like to acknowledge all those who have contributed to
 
 <div id="ref-Skene2018" class="csl-entry">
 
-<span class="csl-left-margin">1. </span><span
-class="csl-right-inline">Nathan G. Skene, T. E. B., Julien Bryois.
-Genetic identification of brain cell types underlying schizophrenia.
-*Nature Genetics* (2018).
+<span class="csl-left-margin">1.
+</span><span class="csl-right-inline">Nathan G. Skene, T. E. B., Julien
+Bryois. Genetic identification of brain cell types underlying
+schizophrenia. *Nature Genetics* (2018).
 doi:[10.1038/s41588-018-0129-5](https://doi.org/10.1038/s41588-018-0129-5)</span>
 
 </div>
diff --git a/man/check_ldsc_format.Rd b/man/check_ldsc_format.Rd
index 069a8e5..7a86eed 100644
--- a/man/check_ldsc_format.Rd
+++ b/man/check_ldsc_format.Rd
@@ -22,7 +22,12 @@ GWAS.}
 
 \item{save_format}{Output format of sumstats. Options are NULL - standardised
 output format from MungeSumstats, LDSC - output format compatible with LDSC
-and openGWAS - output compatible with openGWAS VCFs. Default is NULL.}
+and openGWAS - output compatible with openGWAS VCFs. Default is NULL.
+\strong{NOTE} - If LDSC format is used, the naming convention of A1 as the
+reference (genome build) allele and A2 as the effect allele will be reversed
+to match LDSC (A1 will now be the effect allele). See more info on this
+\href{https://groups.google.com/g/ldsc_users/c/S7FZK743w68}{here}. Note that any
+effect columns (e.g. Z) will be in relation to A1 now instead of A2.}
 
 \item{convert_n_int}{Binary, if N (the number of samples) is not an integer,
 should this be rounded? Default is TRUE.}
diff --git a/man/format_sumstats.Rd b/man/format_sumstats.Rd
index 324bbe3..86dac05 100644
--- a/man/format_sumstats.Rd
+++ b/man/format_sumstats.Rd
@@ -264,7 +264,12 @@ FALSE.}
 
 \item{save_format}{Output format of sumstats. Options are NULL - standardised
 output format from MungeSumstats, LDSC - output format compatible with LDSC
-and openGWAS - output compatible with openGWAS VCFs. Default is NULL.}
+and openGWAS - output compatible with openGWAS VCFs. Default is NULL.
+\strong{NOTE} - If LDSC format is used, the naming convention of A1 as the
+reference (genome build) allele and A2 as the effect allele will be reversed
+to match LDSC (A1 will now be the effect allele). See more info on this
+\href{https://groups.google.com/g/ldsc_users/c/S7FZK743w68}{here}. Note that any
+effect columns (e.g. Z) will be in relation to A1 now instead of A2.}
 
 \item{log_folder_ind}{Binary Should log files be stored containing all
 filtered out SNPs (separate file per filter). The data is outputted in the
diff --git a/man/import_sumstats.Rd b/man/import_sumstats.Rd
index 094c5da..4083cd3 100644
--- a/man/import_sumstats.Rd
+++ b/man/import_sumstats.Rd
@@ -209,7 +209,12 @@ FALSE.}
     \item{\code{ldsc_format}}{DEPRECATED, do not use. Use save_format="LDSC" instead.}
     \item{\code{save_format}}{Output format of sumstats. Options are NULL - standardised
 output format from MungeSumstats, LDSC - output format compatible with LDSC
-and openGWAS - output compatible with openGWAS VCFs. Default is NULL.}
+and openGWAS - output compatible with openGWAS VCFs. Default is NULL.
+\strong{NOTE} - If LDSC format is used, the naming convention of A1 as the
+reference (genome build) allele and A2 as the effect allele will be reversed
+to match LDSC (A1 will now be the effect allele). See more info on this
+\href{https://groups.google.com/g/ldsc_users/c/S7FZK743w68}{here}. Note that any
+effect columns (e.g. Z) will be in relation to A1 now instead of A2.}
     \item{\code{log_folder_ind}}{Binary Should log files be stored containing all
 filtered out SNPs (separate file per filter). The data is outputted in the
 same format specified for the resulting sumstats file. The only exception to
diff --git a/man/validate_parameters.Rd b/man/validate_parameters.Rd
index 1ebcdf0..91c2cc4 100644
--- a/man/validate_parameters.Rd
+++ b/man/validate_parameters.Rd
@@ -198,7 +198,12 @@ datasets this should be set to FALSE otherwise keep as TRUE. Default is TRUE.}
 
 \item{save_format}{Output format of sumstats. Options are NULL - standardised
 output format from MungeSumstats, LDSC - output format compatible with LDSC
-and openGWAS - output compatible with openGWAS VCFs. Default is NULL.}
+and openGWAS - output compatible with openGWAS VCFs. Default is NULL.
+\strong{NOTE} - If LDSC format is used, the naming convention of A1 as the
+reference (genome build) allele and A2 as the effect allele will be reversed
+to match LDSC (A1 will now be the effect allele). See more info on this
+\href{https://groups.google.com/g/ldsc_users/c/S7FZK743w68}{here}. Note that any
+effect columns (e.g. Z) will be in relation to A1 now instead of A2.}
 
 \item{imputation_ind}{Binary Should a column be added for each imputation
 step to show what SNPs have imputed values for differing fields. This
diff --git a/man/write_sumstats.Rd b/man/write_sumstats.Rd
index 3186de7..59c6876 100644
--- a/man/write_sumstats.Rd
+++ b/man/write_sumstats.Rd
@@ -39,7 +39,12 @@ reference genome from the data.}
 
 \item{save_format}{Output format of sumstats. Options are NULL - standardised
 output format from MungeSumstats, LDSC - output format compatible with LDSC
-and openGWAS - output compatible with openGWAS VCFs. Default is NULL.}
+and openGWAS - output compatible with openGWAS VCFs. Default is NULL.
+\strong{NOTE} - If LDSC format is used, the naming convention of A1 as the
+reference (genome build) allele and A2 as the effect allele will be reversed
+to match LDSC (A1 will now be the effect allele). See more info on this
+\href{https://groups.google.com/g/ldsc_users/c/S7FZK743w68}{here}. Note that any
+effect columns (e.g. Z) will be in relation to A1 now instead of A2.}
 
 \item{tabix_index}{Index the formatted summary statistics with
 \href{http://www.htslib.org/doc/tabix.html}{tabix} for fast querying.}
diff --git a/tests/testthat/test-vcf_formatting.R b/tests/testthat/test-vcf_formatting.R
index c9c2332..2aa9be4 100644
--- a/tests/testthat/test-vcf_formatting.R
+++ b/tests/testthat/test-vcf_formatting.R
@@ -146,6 +146,19 @@ test_that("VCF is correctly formatted", {
         ldsc_cols <- c("SNP", "N", "A1", "A2", "Z")
         testthat::expect_true(all(ldsc_cols %in% names(res)))
         
+        #also ensure A1 and A2 have been renamed
+        #For LDSC format, rename A1 and A2 as LDSC expects A1 to be the effect 
+        #column rather than A2 (the opposite to MSS's default) - see more 
+        #[here](https://groups.google.com/g/ldsc_users/c/S7FZK743w68). Although,
+        #this didn't seem to make any difference to results in tests, see more
+        #[here](https://github.com/neurogenomics/MungeSumstats/issues/160#issuecomment-1891899253).
+        data.table::setnames(res,c("A1","A2"),c("A2","A1"))
+        res[,CHR:=as.character(CHR)]
+        testthat::expect_true(all.equal(res[,c("SNP","CHR","BP","A1","A2","END",
+                                               "FILTER","FRQ","BETA","LP","SE",
+                                               "P")],rtrn_dt))
+        
+                              
         testthat::expect_equal(reformatted_lines[1:5], corr_res)
     } else {
         testthat::expect_true((is_32bit_windows||!Sys.info()["sysname"]=="Linux"))
@@ -153,5 +166,6 @@ test_that("VCF is correctly formatted", {
         testthat::expect_true((is_32bit_windows||!Sys.info()["sysname"]=="Linux"))
         testthat::expect_true((is_32bit_windows||!Sys.info()["sysname"]=="Linux"))
         testthat::expect_true((is_32bit_windows||!Sys.info()["sysname"]=="Linux"))
+        testthat::expect_true((is_32bit_windows||!Sys.info()["sysname"]=="Linux"))
     }
 })
\ No newline at end of file
diff --git a/vignettes/MungeSumstats.Rmd b/vignettes/MungeSumstats.Rmd
index 089e745..c90e305 100644
--- a/vignettes/MungeSumstats.Rmd
+++ b/vignettes/MungeSumstats.Rmd
@@ -165,7 +165,11 @@ flexibility to export the reformatted file as tab-delimited, VCF or R
 native objects such as data.table, GRanges or VRanges objects. The
 output can also be outputted in an **LDSC ready** format which means the
 file can be fed directly into LDSC without the need for additional
-munging.
+munging. **NOTE** - If LDSC format is used, the naming convention of A1 as the 
+reference (genome build) allele and A2 as the effect allele will be reversed
+to match LDSC (A1 will now be the effect allele). See more info on this 
+[here](https://groups.google.com/g/ldsc_users/c/S7FZK743w68). Note that any 
+effect columns (e.g. Z) will be in relation to A1 now instead of A2.
 
 # Data
 
@@ -419,7 +423,12 @@ conducted by *MungeSumstats* are:
     ("data.table","vranges","granges").
 -   **save_format** Ensure that output format meets all requirements to
     be passed directly into LDSC ("ldsc") without the need for additional
-    munging or for IEU OpenGWAS format ("opengwas") before saving as a VCF
+    munging or for IEU OpenGWAS format ("opengwas") before saving as a VCF.
+    **NOTE** - If LDSC format is used, the naming convention of A1 as the 
+    reference (genome build) allele and A2 as the effect allele will be reversed
+    to match LDSC (A1 will now be the effect allele). See more info on this 
+    [here](https://groups.google.com/g/ldsc_users/c/S7FZK743w68). Note that any 
+    effect columns (e.g. Z) will be in relation to A1 now instead of A2.
 -   **log_folder_ind** Should log files be stored containing all
     filtered out SNPs (separate file per filter). The data is outputted
     in the same format specified for the resulting sumstats file.