From d16c5fb815ad6f80bfcaf01d2f1294d203808679 Mon Sep 17 00:00:00 2001
From: Al-Murphy <alanmurph94@hotmail.com>
Date: Fri, 13 Oct 2023 17:27:05 +0100
Subject: [PATCH] Bug Fix: Infer effect allele

---
 DESCRIPTION                                   |   2 +-
 NAMESPACE                                     |   1 +
 NEWS.md                                       |  41 ++++
 R/format_sumstats.R                           |  11 ++
 R/get_eff_frq_allele_combns.R                 |  77 ++++++++
 R/get_genome_build.R                          | 187 ++++++++++++++----
 R/infer_effect_column.R                       | 182 +++++++++++++++++
 ...se_sumstats_column_headers_crossplatform.R |   6 +-
 R/sysdata.rda                                 | Bin 2106 -> 2014 bytes
 data/sumstatsColHeaders.rda                   | Bin 2150 -> 2053 bytes
 man/get_eff_frq_allele_combns.Rd              |  28 +++
 man/get_genome_build.Rd                       |  11 +-
 man/infer_effect_column.Rd                    |  73 +++++++
 tests/testthat/test-infer_effect_column.R     |  95 +++++++++
 14 files changed, 670 insertions(+), 44 deletions(-)
 create mode 100644 R/get_eff_frq_allele_combns.R
 create mode 100644 R/infer_effect_column.R
 create mode 100644 man/get_eff_frq_allele_combns.Rd
 create mode 100644 man/infer_effect_column.Rd
 create mode 100644 tests/testthat/test-infer_effect_column.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 2cba6af5..b9a36fba 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: MungeSumstats
 Type: Package
 Title: Standardise summary statistics from GWAS
-Version: 1.9.17
+Version: 1.9.18
 Authors@R:
     c(person(given = "Alan",
            family = "Murphy",
diff --git a/NAMESPACE b/NAMESPACE
index 4761e75b..03c8c9af 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -8,6 +8,7 @@ export(formatted_example)
 export(get_genome_builds)
 export(import_sumstats)
 export(index_tabular)
+export(infer_effect_column)
 export(liftover)
 export(list_sumstats)
 export(load_snp_loc_data)
diff --git a/NEWS.md b/NEWS.md
index 5b55a28a..c3c10cfe 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,44 @@
+## CHANGES IN VERSION 1.9.18
+
+### Bug fix
+* Fixed column header mappings
+  * Made all uncorrected header names uppercase and removed duplicates
+  * "TOTALSAMPLESIZE" now maps to "N" instead of "NSTUDY"
+  * "MAJORALLELE", "MAJOR_ALLELE", "MAJOR-ALLELE", and "MAJOR ALLELE" now map to 
+    "A1" instead of "A2"
+  * Removed the mappings for "OR-A1", "OR.A1", "OR_A1", and "BETA1" because MSS 
+    assumes that A2 is the effect allele
+  * Removed mappings for "A1FREQ", "A1FRQ", "AF1", "FREQ.A1.1000G.EUR", 
+    "FREQ.A1.ESP.EUR", "FREQ.ALLELE1.HAPMAPCEU", "FREQ1", "FREQ1.HAPMAP", and 
+    "FRQ_A1" because MSS defines "FRQ" to be the allele frequency of A2
+  * Removed mappings for "CHR36", "BASE_GRCH36", "POSITION36", "POSGRCH36", 
+    "BASEGRCH36", "POS36", "POS GRCH36", "POS.GRCH36", "POS-GRCH36", and 
+    "POS_GRCH36" 
+    because MSS does not support the GRCh36 genome build
+  * Removed the ambiguous mapping "NMISS" -> "N" because "NMISS" can refer to 
+    the number of samples with missing data
+  * Removed the ambiguous mapping "WEIGHT" -> "N" because "WEIGHT" can refer to 
+    coefficient weights
+* Fixed inference of allele where ambiguous (A1, A2) naming used (see 
+  infer_effect_column.R for code) but in short:
+  * Three checks now made to infer which allele the effect/frequency information
+    relates to. See infer_effect_column.R for further details.
+  * See get_eff_frq_allele_combns.R for how effect/frequency columns that infer 
+    the allele are captured in the mapping file
+
+### New features
+* New column header mappings:
+  * "VARIANT_ID" and "RSIDS" --> "SNP"
+  * "P_BOLT_LMM" --> "P"
+  * "NCASES" --> "N_CAS"
+  * "N_EFFECTIVE", "N_INFORMATIVE", and "TOTAL_N" --> "N"
+  * "HET_P" --> "HETPVAL"
+  * "HET_ISQ" --> "HETISQT"
+  * "ALL_AF" --> "FRQ"
+  * "DIRECT" --> "DIRECTION"
+  * "ALT_EFFSIZE" --> "BETA"
+  * "INFORMATIVE_ALT_AC" --> "AC"
+
 ## CHANGES IN VERSION 1.9.17
 
 ### Bug fix
diff --git a/R/format_sumstats.R b/R/format_sumstats.R
index ce517e53..e60aee45 100644
--- a/R/format_sumstats.R
+++ b/R/format_sumstats.R
@@ -445,6 +445,17 @@ format_sumstats <- function(path,
           mapping_file <- rbind(mapping_file,es_cols)
         }
 
+        #### Check 2:Check for effect direction ####
+        sumstats_return <-
+          infer_effect_column(
+            sumstats_dt = sumstats_return$sumstats_dt,
+            mapping_file = mapping_file,
+            dbSNP = dbSNP,
+            nThread = nThread,
+            ref_genome = ref_genome,
+            on_ref_genome = on_ref_genome
+          )
+        
         #### Check 3:Standardise headers for all OS ####
         sumstats_return <-
             standardise_sumstats_column_headers_crossplatform(
diff --git a/R/get_eff_frq_allele_combns.R b/R/get_eff_frq_allele_combns.R
new file mode 100644
index 00000000..e6e93751
--- /dev/null
+++ b/R/get_eff_frq_allele_combns.R
@@ -0,0 +1,77 @@
+#' Get combinations of uncorrected allele and effect (and frq) columns
+#'
+#' @inheritParams format_sumstats 
+#' @inheritParams compute_nsize
+#' @param eff_frq_cols Corrected effect or frequency column names found in a 
+#' sumstats. Default of BETA, OR, LOG_ODDS, SIGNED_SUMSTAT, Z and FRQ.
+#' @return datatable containing uncorrected and corrected combinations
+#' @importFrom data.table setnames as.data.table := setkey rbindlist data.table
+get_eff_frq_allele_combns <-
+  function(mapping_file = sumstatsColHeaders, 
+           eff_frq_cols = c("BETA", "OR", "LOG_ODDS", "SIGNED_SUMSTAT","Z",
+                            "FRQ")) {
+    ### Add this to avoid confusing BiocCheck
+    CORRECTED <- UNCORRECTED <- Var1 <- Var2 <- NULL 
+    colnames(mapping_file) <- toupper(colnames(mapping_file))
+    #get allele associated effect/FRQ columns
+    #get all combinations with allele columns
+    eff_frq_cols_uncorrc <- 
+      mapping_file[mapping_file$CORRECTED %in% eff_frq_cols,]$UNCORRECTED
+    #join with all allele cols
+    allele_uncorrc <- 
+      mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),]$UNCORRECTED
+    #get combinations
+    eff_frq_allele_dt <- 
+      data.table::as.data.table(expand.grid(eff_frq_cols_uncorrc, 
+                                            allele_uncorrc))
+    mapping_file_dt <- data.table::as.data.table(mapping_file)
+    data.table::setkey(mapping_file_dt,"UNCORRECTED")
+    data.table::setkey(eff_frq_allele_dt,"Var1")
+    #add corrected
+    eff_frq_allele_dt[mapping_file,CORRECTED:=CORRECTED,]
+    #now loop through every joining character and join with eff both before
+    #and after
+    joining_char <- c("","_",".","-"," ")
+    all_combns <- vector(mode="list",length = length(joining_char)*2)
+    counter <- 1
+    for(join_i in joining_char){
+      eff_frq_allele_dt_i <- copy(eff_frq_allele_dt)  
+      eff_frq_allele_dt_i[,UNCORRECTED:=paste0(Var1,join_i,Var2)]
+      all_combns[[counter]] <- 
+        eff_frq_allele_dt_i[,c("UNCORRECTED","CORRECTED")]
+      counter <- counter+1
+      #same for Var 2 in front
+      eff_frq_allele_dt_i <- copy(eff_frq_allele_dt)  
+      eff_frq_allele_dt_i[,UNCORRECTED:=paste0(Var2,join_i,Var1)]
+      all_combns[[counter]] <- 
+        eff_frq_allele_dt_i[,c("UNCORRECTED","CORRECTED")]
+      counter <- counter+1
+    }
+    #join all together
+    eff_frq_allele_matches <- data.table::rbindlist(all_combns)
+    #finally add some custom ones 
+    custom_adds <- data.table::data.table("UNCORRECTED" = 
+                                            c("BETA1", "BETA2","AF1","AF2",
+                                              "FREQ.A1.1000G.EUR",
+                                              "FREQ.A2.1000G.EUR",
+                                              "FREQ.A1.ESP.EUR",
+                                              "FREQ.A2.ESP.EUR",
+                                              "FREQ.ALLELE1.HAPMAPCEU",
+                                              "FREQ.ALLELE2.HAPMAPCEU",
+                                              "FREQ1","FREQ2", 
+                                              "FREQ1.HAPMAP","FREQ2.HAPMAP"),
+                                          "CORRECTED" = 
+                                            c("BETA", "BETA","FRQ","FRQ",
+                                              "FRQ",
+                                              "FRQ",
+                                              "FRQ",
+                                              "FRQ",
+                                              "FRQ",
+                                              "FRQ",
+                                              "FRQ","FRQ",
+                                              "FRQ","FRQ"))
+    eff_frq_allele_matches <- data.table::rbindlist(list(
+      eff_frq_allele_matches,custom_adds))
+    
+    return(eff_frq_allele_matches)
+  }
diff --git a/R/get_genome_build.R b/R/get_genome_build.R
index dbd5b12d..0209ced0 100644
--- a/R/get_genome_build.R
+++ b/R/get_genome_build.R
@@ -21,9 +21,12 @@
 #' only read in the first N rows where N=\code{sampled_snps}.
 #' This should help speed up cases where you have to read in \code{sumstats}
 #' from disk each time.
-#'
+#' @param allele_match_ref Instead of returning the genome_build this will 
+#' return the propotion of matches to each genome build for each allele (A1,A2).
+#' @inheritParams format_sumstats 
+#' 
 #' @return ref_genome the genome build of the data
-#' @importFrom data.table setDT
+#' @importFrom data.table setDT :=
 #' @keywords internal
 get_genome_build <- function(sumstats,
                              nThread = 1,
@@ -31,11 +34,14 @@ get_genome_build <- function(sumstats,
                              standardise_headers = TRUE,
                              mapping_file = sumstatsColHeaders,
                              dbSNP=155,
-                             header_only = FALSE) {
+                             header_only = FALSE,
+                             allele_match_ref = FALSE,
+                             ref_genome = NULL) {
     ### Add this to avoid confusing BiocCheck
-    seqnames <- CHR <- SNP <- BP <- NULL 
-
-    message("Inferring genome build.")
+    seqnames <- CHR <- SNP <- BP <- alt_alleles <- NULL 
+    if(isFALSE(allele_match_ref)){
+      message("Inferring genome build.")
+    }
     # if not a data.table, must be a path
     if (!is.data.frame(sumstats)) {
         # Checking if the file exists should happen first
@@ -82,7 +88,13 @@ get_genome_build <- function(sumstats,
         )
     # Infer genome build using SNP & CHR & BP
     if (!all(c("SNP", "CHR", "BP") %in% colnames(sumstats))) {
+      #want it returned rather than throwing an error
+      if(isTRUE(allele_match_ref)){
+        return(err_msg)
+      }
+      else{  
         stop(err_msg)
+      }
     }
 
     #### Do some filtering first to avoid errors ####
@@ -113,7 +125,13 @@ get_genome_build <- function(sumstats,
         size_okay <- TRUE
     }
     if (!size_okay) {
+      #want it returned rather than throwing an error
+      if(isTRUE(allele_match_ref)){
+        return(err_msg2)
+      }
+      else{  
         stop(err_msg2)
+      }
     }
     #### Downsample SNPs to save time ####
     if ((nrow(sumstats) > sampled_snps) && !(is.null(sampled_snps))) {
@@ -124,58 +142,145 @@ get_genome_build <- function(sumstats,
 
     sumstats <- sumstats[SNP %in% snps, ]
 
-    # otherwise SNP, CHR, BP were all found and can infer
-    snp_loc_data_37 <- load_ref_genome_data(
+    #now split into functions two roles
+    if(isTRUE(allele_match_ref)){
+      #1. checking for matches for A1/A2 to ref genomes
+      if (is.null(ref_genome)){
+        #have to check multiple
+        snp_loc_data_37 <- load_ref_genome_data(
+          snps = snps,
+          ref_genome = "GRCH37",
+          dbSNP = dbSNP
+        )
+        snp_loc_data_38 <- load_ref_genome_data(
+          snps = snps,
+          ref_genome = "GRCH38",
+          dbSNP = dbSNP
+        )
+        # convert CHR filed in ref genomes to character not factor
+        snp_loc_data_37[, seqnames := as.character(seqnames)]
+        snp_loc_data_38[, seqnames := as.character(seqnames)]
+        # convert CHR filed in data to character if not already
+        sumstats[, CHR := as.character(CHR)]
+        # Now check which genome build has more matches to data
+        num_37 <-
+          nrow(snp_loc_data_37[sumstats, ,
+                               on = c("SNP"="SNP","pos"="BP","seqnames"="CHR"),
+                               nomatch = FALSE
+          ])
+        num_38 <-
+          nrow(snp_loc_data_38[sumstats, ,
+                               on = c("SNP"="SNP","pos"="BP","seqnames"="CHR"),
+                               nomatch = FALSE
+          ])
+        if (num_37 > num_38) {
+          ref_gen_num <- num_37
+          ref_genome <- "GRCH37"
+          snp_loc_data <- snp_loc_data_37
+          
+        } else {
+          ref_gen_num <- num_38
+          ref_genome <- "GRCH38"
+          snp_loc_data <- snp_loc_data_38
+        }
+      }
+      else{
+        #only check one chosen
+        snp_loc_data <- load_ref_genome_data(
+          snps = snps,
+          ref_genome = ref_genome,
+          dbSNP = dbSNP
+        )
+        # convert CHR filed in ref genomes to character not factor
+        snp_loc_data[, seqnames := as.character(seqnames)]
+        # convert CHR filed in data to character if not already
+        sumstats[, CHR := as.character(CHR)]
+      }
+      # Now check which allele has more matches to data
+      # want to match on ref and alt alleles too 
+      # need to take first alt from list to do this
+      snp_loc_data[,alt_alleles := unlist(lapply(alt_alleles, 
+                                                 function(x) x[[1]]))]
+      num_a1 <-
+        nrow(snp_loc_data[sumstats, ,
+                          on = c("SNP"="SNP","pos"="BP","seqnames"="CHR",
+                                 "ref_allele"="A1","alt_alleles"="A2"),
+                          nomatch = FALSE
+        ])
+      num_a2 <-
+        nrow(snp_loc_data[sumstats, ,
+                          on = c("SNP"="SNP","pos"="BP","seqnames"="CHR",
+                                 "ref_allele"="A2","alt_alleles"="A1"),
+                          nomatch = FALSE
+        ])
+      if(num_a2>=num_a1){
+        message("Effect/frq column(s) relate to A2 in the inputted sumstats")
+        #this is what MSS expects so no action required
+        switch_req <- FALSE
+      }else{#num_a2<num_a1
+        message("Effect/frq column(s) relate to A1 in the inputted sumstats")
+        switch_req <- TRUE
+      }
+      
+      return(switch_req)
+    }  
+    else{
+      #2. checking for ref genome
+      # otherwise SNP, CHR, BP were all found and can infer
+      snp_loc_data_37 <- load_ref_genome_data(
         snps = snps,
         ref_genome = "GRCH37",
         dbSNP = dbSNP
-    )
-    snp_loc_data_38 <- load_ref_genome_data(
+      )
+      snp_loc_data_38 <- load_ref_genome_data(
         snps = snps,
         ref_genome = "GRCH38",
         dbSNP = dbSNP
-    )
-    # convert CHR filed in ref genomes to character not factor
-    snp_loc_data_37[, seqnames := as.character(seqnames)]
-    snp_loc_data_38[, seqnames := as.character(seqnames)]
-    # convert CHR filed in data to character if not already
-    sumstats[, CHR := as.character(CHR)]
-    # Now check which genome build has more matches to data
-    num_37 <-
+      )
+      # convert CHR filed in ref genomes to character not factor
+      snp_loc_data_37[, seqnames := as.character(seqnames)]
+      snp_loc_data_38[, seqnames := as.character(seqnames)]
+      # convert CHR filed in data to character if not already
+      sumstats[, CHR := as.character(CHR)]
+      # Now check which genome build has more matches to data
+      num_37 <-
         nrow(snp_loc_data_37[sumstats, ,
-            on = c("SNP" = "SNP", "pos" = "BP", "seqnames" = "CHR"),
-            nomatch = FALSE
+                             on = c("SNP"="SNP", "pos"="BP", "seqnames"="CHR"),
+                             nomatch = FALSE
         ])
-    num_38 <-
+      num_38 <-
         nrow(snp_loc_data_38[sumstats, ,
-            on = c("SNP" = "SNP", "pos" = "BP", "seqnames" = "CHR"),
-            nomatch = FALSE
+                             on = c("SNP"="SNP", "pos"="BP", "seqnames"="CHR"),
+                             nomatch = FALSE
         ])
-    #if no matches throw error
-    if(num_37==0 && num_38==0){
-      msg_err <- paste0("No matches found in either reference genome for your ",
-                        "SNPs.\nPlease check their formatting (SNP, CHR and BP",
-                        " columns) or supply the genome build.")
-      stop(msg_err)
-    }
-    if (num_37 > num_38) {
+      #if no matches throw error
+      if(num_37==0 && num_38==0){
+        msg_err <- 
+          paste0("No matches found in either reference genome for your ",
+                 "SNPs.\nPlease check their formatting (SNP, CHR and BP",
+                 " columns) or supply the genome build.")
+        stop(msg_err)
+      }
+      if (num_37 > num_38) {
         ref_gen_num <- num_37
         ref_genome <- "GRCH37"
-    } else {
+      } else {
         ref_gen_num <- num_38
         ref_genome <- "GRCH38"
-    }
-
-    message("Inferred genome build: ", ref_genome)
-    # add a warning if low proportion of matches found
-    msg <- paste0(
+      }
+      
+      message("Inferred genome build: ", ref_genome)
+      # add a warning if low proportion of matches found
+      msg <- paste0(
         "WARNING: Less than 10% of your sampled SNPs matched that of ",
         "either reference genome, this may question the quality of ",
         "your summary statistics file."
-    )
-    if (ref_gen_num / length(snps) < 0.1) {
+      )
+      if (ref_gen_num / length(snps) < 0.1) {
         message(msg)
+      }
+      
+      return(ref_genome)
+      
     }
-
-    return(ref_genome)
 }
diff --git a/R/infer_effect_column.R b/R/infer_effect_column.R
new file mode 100644
index 00000000..f70fbc66
--- /dev/null
+++ b/R/infer_effect_column.R
@@ -0,0 +1,182 @@
+#' Infer if effect relates to a1 or A2 if ambiguously named
+#'
+#' Three checks are made to infer which allele the effect/frequency information
+#' relates to if they are ambiguous (named A1 and A2 or equivalent):
+#' 1. Check if ambiguous naming conventions are used (i.e. allele 1 and 2 or 
+#' equivalent). If not exit, otherwise continue to next checks. This can be 
+#' checked by using the mapping file and splitting A1/A2 mappings by those that 
+#' contain 1 or 2 (ambiguous) or doesn't contain 1 or 2 e.g. effect, 
+#' tested (unambiguous so fine for MSS to handle as is).
+#' 2. Look for effect column/frequency column where the A1/A2 explicitly 
+#' mentioned, if found then  we know the direction and should update A1/A2 
+#' naming so A2 is the effect column. We can look for such columns by getting 
+#' every combination of A1/A2 naming and effect/frq naming.
+#' 3. If note found in 2, a final check should be against the reference genome, 
+#' whichever of A1 and A2 has more of a match with the reference genome should 
+#' be taken as **not** the effect allele. There is an assumption in this but is 
+#' still better than guessing the ambiguous allele naming.
+#'
+#' @inheritParams format_sumstats 
+#' @inheritParams compute_nsize
+#' @inheritParams get_genome_build
+#' @return list containing sumstats_dt, the modified summary statistics data
+#' table object
+#' @export
+#' @importFrom data.table setnames as.data.table := setkey rbindlist data.table
+#' @examples
+#' sumstats <- MungeSumstats::formatted_example()
+#' #for speed, don't run on_ref_genome part of check (on_ref_genome = FALSE)
+#' sumstats_dt2<-infer_effect_column(sumstats_dt=sumstats,on_ref_genome = FALSE)
+infer_effect_column <-
+  function(sumstats_dt,
+           dbSNP=155,
+           sampled_snps = 10000,
+           mapping_file = sumstatsColHeaders,
+           nThread = nThread,
+           ref_genome = NULL,
+           on_ref_genome = TRUE,
+           return_list=TRUE) {
+    
+    message("Infer Effect Column")
+    message("First line of summary statistics file: ")
+    msg <- paste0(names(sumstats_dt), split = "\t")
+    message(msg)
+    #### first make all column headers upper case ####
+    column_headers <- names(sumstats_dt) 
+    # load synonym mapping - internal data no loading
+    # Identify allele mappings which are ambiguous and problematic
+    # vs those that are interpretable
+    colnames(mapping_file) <- toupper(colnames(mapping_file))
+    allele_mapping <- mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),]
+    ambig_allele_map <- allele_mapping[grepl('1',allele_mapping$UNCORRECTED)|
+                                         grepl('2',allele_mapping$UNCORRECTED),]
+    unambig_allele_map <- 
+      allele_mapping[!(grepl('1',allele_mapping$UNCORRECTED)|
+                         grepl('2',allele_mapping$UNCORRECTED)),]
+    #as long as the sumstats contains 1 unambiguous allele column MSS will work
+    #as expected
+    unambig_cols <- intersect(unambig_allele_map$UNCORRECTED,
+                              toupper(column_headers))
+    ambig_cols <- intersect(ambig_allele_map$UNCORRECTED,
+                            toupper(column_headers))
+    #if both ambiguous and unambiguous columns found, rename ambiguous ones so
+    #they aren't used later by MSS
+    #example: 'A1','A2','EFFECT_ALLELE' all present
+    if (length(unambig_cols)>0 && length(ambig_cols)>0){
+      #find if unambig and ambig relate to the same allele
+      #get corrected name for unambig 
+      unambig_corrcted <- 
+        unique(allele_mapping[allele_mapping$UNCORRECTED %in% unambig_cols,
+                              ]$CORRECTED)
+      #check if any ambig are to the same allele
+      ambig_corrcted <- 
+        unique(allele_mapping[allele_mapping$UNCORRECTED %in% ambig_cols,
+                              ]$CORRECTED)
+      #overlap?
+      ambig_corrcted_rnme <- 
+        ambig_corrcted[ambig_corrcted %in% unambig_corrcted]
+      if (length(ambig_corrcted_rnme)>0){
+        message("There are multiple columns relating to the same allele info")
+        message("Renaming ambiguous allele columns so they won't be used")
+        #get the related ambig naming and change there name so won't be used
+        ambig_uncorrcted_rnme <- 
+          ambig_allele_map[ambig_allele_map$CORRECTED %in% ambig_corrcted_rnme,
+                           ]$UNCORRECTED
+        #now rename any matches in sumstats
+        chng_nmes <- column_headers[toupper(column_headers) %in% 
+                                     ambig_uncorrcted_rnme]
+        for(chng_i in chng_nmes){
+          data.table::setnames(sumstats_dt, chng_i, paste0(chng_i,"_INPUTTED"))
+        }
+      }
+    } else if (length(unambig_cols)==0 && length(ambig_cols)>=2){
+      #only continue if no unambiguous columns found but 2 ambig ones are found-
+      #less than 2 in total means allele info is missing which MSS can try fill
+      #in later
+      message("Allele columns are ambiguous, attempting to infer direction")
+      #get names for allele mared eff/frq columns
+      eff_frq_allele_matches <- get_eff_frq_allele_combns()
+      #now look for matches in sumstats
+      fnd_allele_indicator <- 
+        column_headers[toupper(column_headers) %in% 
+                         eff_frq_allele_matches$UNCORRECTED]
+      if(length(fnd_allele_indicator)>0){
+        message("Found direction from effect/frq column naming")
+        #fnd_allele_indicator could be >1 so majority vote
+        a1_mtch <- sum(grepl("A1",fnd_allele_indicator))
+        a2_mtch <- sum(grepl("A2",fnd_allele_indicator))
+        if(a2_mtch>=a1_mtch){
+          message("Effect/frq column(s) relate to A2 in the inputted sumstats")
+          #this is what MSS expects so no action required
+        }else{#a2_mtch<a1_mtch
+          message("Effect/frq column(s) relate to A1 in the inputted sumstats")
+          #this is the opposite to what MSS expects so switch A1/A2 naming
+          #first get corrected names for allele columns then switch
+          for (headerI in seq_len(nrow(mapping_file))) {
+            un <- mapping_file[headerI, "UNCORRECTED"]
+            cr <- mapping_file[headerI, "CORRECTED"]
+            #note ambig_cols here not all col headers
+            if (un %in% ambig_cols & (!cr %in% column_headers)) {
+              data.table::setnames(sumstats_dt, un, cr)
+            }
+          }
+          #now switch
+          data.table::setnames(sumstats_dt,"A2","A2_INPUTTED_OLD_")
+          data.table::setnames(sumstats_dt,"A1","A2")
+          data.table::setnames(sumstats_dt,"A2_INPUTTED_OLD_","A1")
+        }
+        
+      }
+      else{
+        #didn't find any allele specific
+        #last option is to check the reference genome and take the allele with
+        #more matches to it as the non-effect allele
+        #need to standardise the sumstats for this
+        #use get_genome_build function for this
+        if(isTRUE(on_ref_genome)){
+          switch_req <- get_genome_build(
+            sumstats = sumstats_dt,
+            standardise_headers = TRUE,
+            sampled_snps = sampled_snps,
+            mapping_file = mapping_file,
+            dbSNP=dbSNP,
+            nThread = nThread,
+            allele_match_ref = TRUE,
+            ref_genome = ref_genome
+          )
+          if(is.logical(switch_req)){
+            if(isTRUE(switch_req)){
+              #swap A1 and A2
+              #this is the opposite to what MSS expects so switch A1/A2 naming
+              #first get corrected names for allele columns then switch
+              for (headerI in seq_len(nrow(mapping_file))) {
+                un <- mapping_file[headerI, "UNCORRECTED"]
+                cr <- mapping_file[headerI, "CORRECTED"]
+                #note ambig_cols here not all col headers
+                if (un %in% ambig_cols & (!cr %in% column_headers)) {
+                  data.table::setnames(sumstats_dt, un, cr)
+                }
+              }
+              #now switch
+              data.table::setnames(sumstats_dt,"A2","A2_INPUTTED_OLD_")
+              data.table::setnames(sumstats_dt,"A1","A2")
+              data.table::setnames(sumstats_dt,"A2_INPUTTED_OLD_","A1")
+            }
+          }
+          else{ #couldn't infer
+            message(switch_req)
+            message("Can't infer allele columns from sumstats")
+          }
+        }
+        else{
+          message("Can't infer allele columns from sumstats")
+        }
+      }
+    }
+    #### Return format ####
+    if(return_list){
+      return(list("sumstats_dt" = sumstats_dt))
+    }else {
+      return(sumstats_dt)
+    }
+  }    
diff --git a/R/standardise_sumstats_column_headers_crossplatform.R b/R/standardise_sumstats_column_headers_crossplatform.R
index ef9592a6..34a17492 100644
--- a/R/standardise_sumstats_column_headers_crossplatform.R
+++ b/R/standardise_sumstats_column_headers_crossplatform.R
@@ -14,7 +14,7 @@
 #' @return list containing sumstats_dt, the modified summary statistics data
 #' table object
 #' @export
-#' @importFrom data.table setnames
+#' @importFrom data.table setnames rbindlist
 #' @examples
 #' sumstats_dt <- data.table::fread(system.file("extdata", "eduAttainOkbay.txt",
 #'                                              package = "MungeSumstats"))
@@ -41,6 +41,10 @@ standardise_header <- standardise_sumstats_column_headers_crossplatform <-
         # Go through each and get correct spelling
         # allow for differing cases column names
         colnames(mapping_file) <- toupper(colnames(mapping_file))
+        #get names for allele mared eff/frq columns
+        eff_frq_allele_matches <- get_eff_frq_allele_combns()
+        mapping_file <- rbind(mapping_file,
+                              as.data.frame(eff_frq_allele_matches))
         for (headerI in seq_len(nrow(mapping_file))) {
             un <- mapping_file[headerI, "UNCORRECTED"]
             cr <- mapping_file[headerI, "CORRECTED"]
diff --git a/R/sysdata.rda b/R/sysdata.rda
index 06028d078dfb4dbd214553e7976dc0ee7e2a1a81..3955c63bff494541a9e959522f61f39e764ea16e 100644
GIT binary patch
literal 2014
zcmV<42O;=ET4*^jL0KkKSrg9oZU7Vb|G@wM`QS(sf8am=|8TDdo`66I00CeKzB_^Q
zt}m?i=yrYgyPmlF+1qaB=`weSNf9TcWKT^RDYlwZ)cmM=Ak#nrpwmDA4K#{)O)!Yj
z8k#*LO&S0I(X@=z01XBpNCc*a)HGz!00E#G41j1E0z#UmlWLfG6w}mtMxL6D27mwn
z000D1nxOF@4FC^Q)Bw-`0iX>4Xe3A=nE+~G8l&|#nus(200Te({gaClumIV-!IGdj
zkO;IwiYL?v5NS~)N7n6uMv@ue1R3BFZq;&13J&eQu}I_X7Tq3+6oI*q3WcfRMCDfk
zUG&&c4pGFVJV+yj2qaLEjySV4DZy9+s#YWDi2*kXbi#Mj8vqcM<;_rmp?00a3rn52
zykK)F1pL~=5Yu9b6h|%E1CpCyCIv3;4HTfN0!7^S*xNJhs0-LYaJW_M$+rO&etfUO
zIcG@5Th4YkkQs(G_%Kn1R3knG0o51URL>?xp;+fL6NdHbtX$1jt+Eh^6DU}-t%Eee
zeRCL;YZx$B)k7U@(%%kDTOIQ(H4JAI0Z13Vb4WsDl7zPSVLVx9z?dm83f0@8;*REN
zceRF}DPKr6Uj>Z@m-gs@nO2KfaLiGU?sjgdZ<t_2O^W8Qv0o|aisfwsE;m?Ut1el(
z1(+2^(kcT82o%9EKqxYrVFd{**o0_cfI75{7W^@Lw@_r%8W`)QLK>@P8W>KVJo;dX
z2`gi%X5&QoJ%fcSL=@!dXmpqnx~hgw5Lo4lWs-MSH9AUl<l)+~gt0Af&ILRN6+`d-
z`2IMC!*Ip>oxakr0ExQhYfRln?~v4r0!5`D^+or&mv}-bUi(JAo}^^N^Mp;4V)<=g
zZ{K^iMkJkr!_;8X9|WIRbdYNk5)qqjQu)s_D=M@t&v!pf*Vx=Jt1)XbATSMxTt<Kw
z-LgXbcD5@7+0@zHos%If@W~+1_K1WNS{O#{3Utuycgz%EqXT^3?M4lc0`)0F7eSHJ
z&j+fk7u|P9$dY<s#z;l5eKewLuqd2BBv^y7-1zwKMZ}R_Ai9ESgf2~t28lG{cqBB9
zD?N90Zyn7BmJHNF7li`Rb}u8=geTApOCYbmPclnZ&qH!Bn8uw1LvGs&OjZIln-IS@
z?c*-ncIxrb*B377Oha~aRiRjPHVveXaR^+lHZ?4dtIE_9W66z;l}1<hAT+C7)r_HG
z^ey`>6>G%?3=cxS;4c*=32ynmCqG_k5>`pexpyf#4m%S7B~-v`L{b!`2?>%Y6J{A%
zW`!GJbrD%^K&m35pj8l{s2VlA`qquC*k6cpV9=NiB{z3=+imY%5xq$@05<84rK+!W
z*Eu}8d3w4|thj8=3Nks#^5>uC(^sO!Wuv5nuE=X<YK=!e^b7bHU`w0Z^Fa|<%~F+z
zPMZ2$U2JAGW9Nqu<Ig3F8VXxt0ff-R7nfOQ>NOTtCOHkOPC;TcVlhn;4Y9_AnF$vq
z=k`JwV(@|Bw*jMs#ryZW!@3Ae)Xoc;7!BOU%~H-T{yfgMAktJ>SpJ?UTq1K7_uoyB
zgkTg*JcgZC1s%W$eX=WfYZ_=mMDr8(JelEr?9Unm5n(su4hEwX(rp?g9~9A>Ea}8=
z-~?5A9`hRxwFAmo2v_$efsx)$5rR!5z?wCqgMd)@p#imtv^J$h3iKdMniE2H)JPN9
z_p7)-ud8V!*cUTAk7u7n(c_eio+s9oc4HtZGJ*;ey%Fzy==&|8B8UnD!m<c8i>vAh
z84bn;R7bewP(~=#o~GVOBUk6c9)0?1k~!vCJ^C>e{tTq5XGJu6M0(Is1y0L1y_!@P
zFaoYH#9e8g4j^C>+jG8L=w}gs9ufnUY0k?u-OpAsVqAwEZev4H*E8M2bP=ChGiG;$
zk|`@=nT>ts;es9nNujhBO*H}&inw*8HbafCwcMg2A-k4D<u`H#*kNdPTphr664H<-
zY6Rmxhlb~-18s5;gWIr?a@RSEqy|Ub8A^M*QrZ%OX2D&Nlg3zu55r5i)e%IM*Wf8W
zHs8U*bQC}k#Uf3%n41#@hD0!pHqoc5(k%f(&055>Tuh=x7~GGJ#kcY1>37v-SDOJA
zhta51px?IBg3{Wn762@XY_gJITu}x^QzF0wP)AyCJRWxY@W3tB$0+8^C`YhS1ZbL$
zS(4!rG8r^nEohO8j4<vwdbGOj)73=0q_qOfER_&vWEe4bV*WU(m^W6mo`DXP)M?XI
zk%KJ>3j?o)sbhiOgNEHJ$=%_L+`z-T9vYBY(YDam&9S+oS+$#O<p$Hw#KI#*mMbM%
z5SbOEDFD*3WX2440Ad-arV((6He8@GnuszaO)IUkV5>4V(U@!>0LI*8a)n|vhEuNi
zq6z2+ibgQThBPu$hD&`<6pAISTWB#Dfo|1*mEpz&aF-V;N*0d01X|Is30|T~04Z&J
zfhQHZ;0H!*r1Ep?Rs~KCNyT3)?B-ziDr+Gk%`WWWkPQylbVq)+U=v7?gtbIXq-_VL
zcaUs?iwjd!xTl4E2&|MLfGtukc#LROP1P6Z&lRw-%$&=GC=13Ja`E!=4emMa#R0eV
w<nv%4BGK9Ze_)+WpJKt%>LbB$u-sc&)0JItRfnUIKO_Dw<ce^iASa#d+)US!V*mgE

literal 2106
zcmV-A2*vk8T4*^jL0KkKS(nA&hX54=|Iq*c`QS(sf8am=|M0I5o`66AAOHXXU<n>R
z;3w03#ousr&erbJi7#e)@^s^$2_!_?5tN^&qNk=5HYv2l27np?wLMJ$JrHT6N1+p9
zOp1D*Q`824000|MX`lcMjR^!()beV2jEAJq%}js+kN{`^0U=1Iri_uPnIQEuQ}qdu
z01X>R0001rRX<eGwLKsj0MkIzO$`H002u%VgFzxe0W>Bf38eHir<7@<APoQj03Wh)
zVpF1&@Nr}!jzlV~(1Iy?0SGG1AxF~u>JA|SsNuq@-_`b#fB`fpqFL_hP`MnPsNA2;
zB9I%|a41=$0uLyx$Ju@WlFH1T9H&GqpZ7pk6^D*E#6>-ETwTaXa-9Q|fQX!ep9oJ6
zj(R`{O8V@WK=8FZg0#DX@~i<cgK)7yIW}w|V&Xzxh|4TQ31pz-h?-LFxXyxthzSzm
zc%${#m<946Flh<Lr?WH{TOE5Pru~_uV;h~#*$5o-DZ$xicGn6x;F`3M3t2em$}WZY
zIqcZJwN+JD?N_Zei=+e!VD-Y44%rxVOkzXa;fl1>I@(!wdxKQvZQHETrV-Pe??4oR
zh4Y$9B1TH8vsH4dM+1?4q{%^M3svK+69XecB2zl_odJT3u~5YVEzc#zx)owN@Fvwh
z$ET{m+~%?+nuNqgD+yOjS8d_d!s_JnEYrmA4Q&V#^dLZlz(@>ifau7<Bq+fo02tVk
zHkt$nVS>S9w;a!93KLLd$6VxwXRf)#;HH0fHvtqIb{J>2Gfc_KEFg1vG1n4J<g%p6
z_PW&bW|;Ps(X1KZvzc>^%FZOUrw0cuvYv^CCE>{F3oHzvA!;8__p|Hn8Xwyi^9KOn
zVPZ%Vb;ty)GuDd2=PIoO4|v)l#1LdpDj6<khFEJvK*6>?om#m`TuzCxN;8ClZ6A%=
zO}Jp>kwzq(l$(3jhLHF~ev!=wXqX7YXH=z09p1Vm3YDa$QI<=kp=;^8+`9QU3@Y<8
z1$$BqLsBjlNfP60!Z5_%GusxmO_Fr%?OWApt2;9XvbrJ~9pFHC5bY^QwW^a<CaN{2
z&qbk9v2(sxTPw0++x1zwB*i_nbnj+n-3Pvz<b_uykBO4f3J*su&5dYWXr1W2jU$>9
z8X<6OU@=LKF-V|}rKWqmS<g<>se?DDP+k)>F&eBptJ)-#y8|M~E%&M3CDE_Rxn`Kg
zoe)DUu%yw=+K{!I1ivSz(VlYIv!kAm49;7nF%8+~RiRjPHMWF#q7b_7HMK6I>GmxG
zY<(<jY^pkM+JMllb5=5ihs?Ivic^9EaNS}Bz~oL&pD<gff{i3nDFA~6ArBiDaY$O)
zD9-mvLSs;x5^Y%93ebfmngVknOq>MfK`@*p(aGuLxLbCM^oO)1p(Z66Wqp~MtGwh_
zB21nGvc$Pv#q!OW#L(TZe&c?Ocq^ud9N%7ebv2l6R^(rYf<n}!7KG)<+E!^mg92p2
zKE3YzV`l)TStEcj!=r_zxTZ|k@$ym9ITh@-Ag$Jj7*PyM#-;`ITByiYu{7cfr5<!?
zV;VCKHMZc}PB2M=M)Em)ob=07u7mJu>+*NG_Uj)FbR62XInVJSB-T2~!CymGI|U5b
zNTuVRw+}mRkU9fE+tMDhE#gJOx|{g#>Jp+RUD6FQ^D^<C!8O{DrANqB_)e27fw5@w
z`mVP7oJx2CCdAu6nzr`IuR_?&|2(o)1#|>w+61b|yOg1cK-T<178{dEk;GwKK!8mt
zi&8i^NQa#y2DT>9*ri1a0SHW*6GC`YLKdjpuHgd5u-K3@xxa$Ic@}yZ-n6gPwTv#g
zqR2;LR#|36W%IL$wIQM5G6?CI^7(-ktJoGIQeNdsrUf3B-Eu}4^KEr=+oGx@Z{%7Y
z?ZT&zMHOY}uP)&nprTh;c&PU9l?B)URmMFjPr+}_(!@2~@26!9kYB%afZ<wJx$Vlr
zXJ?VOOctTv$kEs1#z@hjx0Xs+k#U)ZeNDq<4`oD{HU$#~ph9oer+7)S7&Z3T=@LRl
zJ5>osi6pp2ix)Mtc7e?c(g`PO1p|j1NWw$w(;{=6Z`^7S=O(CzssJoS@x7WGxSMJL
zf^XiF-y4}VVig*??c#!lNp@o`{8e42b?z`pG7y4ov<yv&gF_%S8)>$U-ZmuAn{kwG
zl8tKOAwp0P({T=ynwhgk-lu%$Vzli@Tm$uZEK=pyyzyG!Pe8l`mEzs9V6(^?G))yq
zNTLb8drp>DZFuM++88p{hDi^?%^(NOGFJ@~vII{>oELIZq$V{GLhf7+JbNwCg`XK_
z!7PP-gxU`Fxc2Ackd=|#up`hRz*$Zj;`td=kf}7~;(hu^WJ?o9wclzo)X`utT~;gz
zCgfp~Wri-bkzp25hBp?M`0gR+s;o$MI4er217O3GQzn?fj;IcRzGT=z*dXqzfMnW)
z6iAJ$%epkF+$+Us+R{O{8je(oBtpIyP!!4z;7;M8jSg|Q0>XXZs+U5x;9@fYQvGT?
zFAea6bho)mP`GLW#u{}2&j=nY2DI?<7FWg?CHGEDY|XW}=sD@?Y~1m)KYIFkfYQPQ
z(qx?nU>+L{vBfBZOh{o5I-+bR5lrzU*f40YwLCnl<S7Y-;JOs*>?l=DsxHdvE1KNw
zq~$0yuosFi^!4wKPrn7g^g<gYoL%0i5I%b;C<zV)`HV)Hj=mKWm<;u}J0U%qE-99t
k#}kk6)Ht=JkD6>Uoe!FkhcxqSevk2YBvXY634C4%aH1LH3;+NC

diff --git a/data/sumstatsColHeaders.rda b/data/sumstatsColHeaders.rda
index 29605026cedb4bcd9cd37e5940fb861f9d03af2e..83f04f9e6abfebb1e066fbd0ae9180c2a5be7251 100644
GIT binary patch
literal 2053
zcmV+g2>SOQiwFP!000001MON{Z`?K(9?#8l$;EYn-R+xzye%NYV>`RfBS+$lwX2a7
zNfRV~A&kU*Y8${#(bs+Ne_kv}=^P%4a?!Lw(%p4|H0S%yjXZ}UsZq-K_1WU}VlWtt
z2IG<X|3ry{$-dr%7lXlgsKCKsJeaG$kMDl`{_ex|hr8(gcb7NU?{04Il>OIAqpgmV
z;m@t*!Bs0vuc|07MT-uvLJH^GNILqVK)+qhw-V5(%8FK;C^3m!FfJ>UCuJoOEOxy7
zrzkVN6By7ikf^M{Qn_XO-C7_sM+Qc%Qj$nR%raF9MhCzgm87b`QCYNX#>Gg2mXbad
z5NckOB2J<@hhZ*~SjKA={QQNTi95B?Gt7JVikAt`YH)6$Gv*`)$0t>?%|tA#{jREc
z?XqDn8}zcwWyeIG){>xMCD60xS<K6riL%7mA}f2@u$K)eduWyzrkPe^EoyF&jJHvX
zjufg=A&Pik@;b@0mVXR^Y20a5YYd%hP?~Si-~iV0w0OnSJw6<Tq;Ra2&<mp?Pff=$
zq;O&&>X@rzB#<e9o5s$0R>(BTO0jM6)1tQoi$bpRw3g{^2c5%-PVzG3rxr*G7=lF}
z$CWfuDZ@M_q|971o=HHN88}faJD6EK&8m7Izk$+alERr$CIYvKMs10&nrBVsn~q|V
z$;j5rfV#}n3In$USb$z;HkE5KQZ6`V8P#P<QgNGC$+$?gY(q|sp|DdmYUp4V*+ksh
zh*u)BwW1ye5cg@F*F3HGu1H0dTtR{(HDFEEqE*3(qG#49q^lWL)`o!%>v{*E3A&mP
zG@+|mip@qub@EDJEykJRY<qad(>GTlCVh4ys6v=YLckX4y%B1YuPfCR+yRWcB;{x+
z=2M|h1%z67l~w6+MzAYoGRZc1x#NU9Yz7H)o~aE<>g7)`>)^111A-@XLmTUKQ2At6
z>}#__@F-GyMFbDfa;MHK-i!?fbt!7H|4bVQg>RL(c`X#-?tE&iQyc)JCXtisWlbV0
zh$r|dF_TuIL&UHYW(q`?M7b$n+S5YtjXgX5)U<%7sYu196fd14U}_j)(aT)|eK^!{
zEc}`VQndC2ODz*eS8Jm3*G^P~ZdCr-i)tNR?MOuhQH6xYXp6nbu(P@{a*sY%+@a5y
zL_OBBR9iu<ZWPU+U<=iO`V@^zGmRdXKE<CMQg+9D?5iPSSA#X?r^IZc5rCt6vMl}Z
z3oq^vt29$h)yE`O8ium5NiCQvEkQgbhEEjI_g}NoS(_w_sD~uc=1}*IJ5Vz8nNC3$
zk6vb%CEk*{PXS&3Y~7-7npW=1ybx+PSe=}lY(<vu><(=VE_t!zh2!Ha47H7@)Zp{T
ztU$S1ovogq+c=s#Tb(_>aLvxn&tIIZT(k4D^XGqq4YbJf)MTn@wKkfth&d>vD6N^(
zmykA)Q4rGj%$`F!8%1eKIu$3S-th5GF;WDazfBLi7T>fB33<7G<J@khjRpcnm-tqp
z#C(@mIlg#I3<J;gGiy4#+^$|kc0Ej=QOi+f*O3zSn+$f`<^XD}DUg*p9=@$Bq%fX%
zXIR668Lx{RUwW)KQ({8OvRtb>jNF#d<=NkT>H>q=o6@mS7d$CtCzjDLtzmtdKx=}&
z{LIfE`c&Xk%o!zdyzm`?I$Rf0w+WyckX&Qfxq?m}B*1^6L*B!H(*kDM!+_UBF{n1c
z9U;Q$x*+2<h?+7F+PSUO?ZDB7{>(4P2m@~xZtXO~JR9cOgq}_4*+|dEU2{vQn8Y<r
zmR_1iynpF+G!ylLTH?N>_F4NZ%$r@;5>wr4f%gj~GON@ubfL6xOx~OiV52{5_8bh~
zKzLN`RbBh`zFM|dbxDE)MJ~Y2TP(3TRF^O6qIYrSMqGv*wy3{2$7FM`MEM@ya;QFR
z^x=K_pwS0Ze`Jo3UyjQfJ?r}Q<vTJ*OTT?z{wFlQ!=kmrVoqd79E){K-YS~2F~Wkv
zh5B*dRcv9d9h4ag>{i2r|EwMCDW=C`^^v(8F(=OB=1vgyk#5Ejmz^8|;A4Lg^k~?<
zn($st;Et=Q4eKMrKGJnxnCo7bVIC8Dj2mZ321pvjyp{zX<A#3J-d26C&o1ceQExY0
z%eGr#r+dt<5F(=0!agW43VJ(*y+h5(C6t?6EHidDP%}#RErit64XCoZE1$2hEc_pS
z`83xppXNF}G@L`V57NpllLUNJij64Mt4U{xKg#P%(HX^8S)TR2Q!QfAd5}jLKFR6{
zgJdI>%o)AgiiwJkZqUz_?Pq5Wp;d()Rp3w0O=bD;9zN(P2kYU&9xm|87W6&2TEY4r
zeys@mHh?(}k5ry{y$C#}GfQ3Qo<1(k=pE`QW^`$c;EqqF@&?@9Z%=T^gVe6k6Ss-7
zgM$tZJD8o=rD8>8b<a1bj}pkIrU%gzfAKreXJYM_q0cMfXPYl8*v~7US8_j&{8r?A
z3;FbbeEax306%>EZwP%3~vUk2`zJ0Af)F(e-WJ_`Q&PXWIYxWD`VE`&eV|5ikQ
z*MATn#E%iZU!Ok(L4I)lBZ&5g<Ok{hS)#vdekplKer5ae(&>ZrUn#8~q(7AO*H13F
z|5SJK!}a$!cLVc_$Y9u#%l!{;-{0Qey!~+VuBGM?5^jL`?fah?1bM83{A-~9JNUVq
j>FsydcaRCl#k=bd*X-L{<*(4c^`HL$q8LtRXe<B#?+O9C

literal 2150
zcmV-s2$}aEiwFP!000001MON{ZyUK4mUXiv+lk`^cDru_<gEY+%XSjyks49bw9<?`
zGfES`5JsZD)Y$;Gi@xq(*+05ik~2I!B*$sG$u=qK1%f@_cW%5KUPg32eSJE5J!-XD
zomRKqZgqN6Z1pzfjGVV5Bf(Cq+ZxLMAKm`*{q5b=-EI8gda=EFx4pTQUf)TLvO1K8
zztomHm$lHpEaEKZHQK!-7!K#LaP(~seyftrg+ZMnUDskyipjJF-Mj#~mlp!SXvOls
z@;qfL6MFRi>gFHYo1d=UZy|-YF{sj;hUA6ulVQ`%YQljT0%Lk5n}jm8p%s0pptyn5
zQ3<Sy9F;|TRVyh<P*WnLB18?>IiDtRnVDvelBt+ZWbm_>wkK}aI!__*?kkojEG=vJ
zs7U51pNe9$DoW<Dq>+spS?aP~B=<{!P`iKuz1>2LlBH9YPbtrH=oqk~ktL06gxOtP
zZIG~E@Ch%OMPfRSYjh}4vE+O`-Q=uHvb5$8Okf<hUz7@)NvhB?n}flQDeM0F6<cm#
zova0hU8OXoqE%=Cd5#3bo`#^Ku8yKWdH~!sw$FpLSSD%C=QV!3ZXAL(bgP%;gpDnr
z7zhjqJHeV0>|?+vn@$U%Q(=Y%48hD$F@Z^hnSpV_Sa_IP+)s;gGks&mGZh#Pv@+qa
zO;l=2JT9}6Eep0<FL{w%nt6CqEkTjZJo{cMVi?<z24$Wt3&>X!#=_`DYFpr%bfgP(
zMZ{&kM5&m~D`hAsCEJjFZOH8iXtkODG`5MDwc)Qg>?1o6>2ExlFARMsTc*aP<n{QN
z6GwHSQV3VmF02hTHq`fyj3)9m#At}C+2gYrkIUp0hfa4>$-&^P638qkg{5ry=8{j*
zq&wkc_w}@!=rvUPs2PZCQpmZ0Jwm$+R3J4)LyCk{gs4$aYXiCW2vyVwQx6jlM+o=u
z-qlUgnJK5EWVPOudNW{gEVqT&EQ`IBJcU>)kn!$<muRc$SI~@Lt%QAQBnfwule?Pa
zP_U%9C{Zv8;V6a;Me<WKi317Zg*EHV@(a89b2hU_!S_`;u;r33`7Gxb&W_&K47ccF
zm6)p8RdFo*4hNF7@-&u8MvkslM`f>_r~uul>~#>;I=b49jA}#~(hR<`*zgo)PmPS&
zfR7bd;L}A>kCZIrA}-}sp=y-b<y4?1Mdea;qrs&~@q<GOzs*OX+C=m<Fve_*%z6r0
z)rq4zv8uy+8n72;9tA1eFn|L&-|FmxNieOP*`yZqr4}K!M23$fQkP!6ds&+#jj@M7
z(S~M94AEh;vD;?Ui-F39OH38xB8RJyzgFx-@(XzdpyT7?<2mIUvwR*K1uxc!{=TAh
zm@!?j^@^=yZc0zDIXbZg$}dcd_cf?JYVOCXzBYNb=5ia-P2g#^vKziOXz+2WRqkK1
zr-g09g}uh?>qR2Q<J0l8GaFTLr{mLS=dRi5+1bnEv1@jAdiLyjO&zSWY^n2>c-Cf$
zu!uT{V6-f$Qy&Bu62*Yv_|zV9D%*W%I8AbOKElnUBMCE~RRQ=GKh^~eyqLUku1=kJ
z0Us$+%vM>E!FNngGiI}SYE6~m)Qs9-F(1E-Z7uenVz*9VYgmfvD+#9GCNxx6QAU>L
zaF*AuCWZEd`^dypPj|A;;DgGF11TnGRQ4wF>LliQyf}RkQqM!`9OBy>*pXJRS&|Dc
zyw*^qp-q}dX`-ek(XdG*q@s|bPAie)dFW`U-N{<0nKQF*8U+2{m5i7`qM?LNW+2Xe
z!bBLUtd5CgUIfhZpd~K?#>DAK2Gkljn&(a*LD!$GQI9pk!$AWGwh|IoOR%;YsdLm^
zcakg<+Td=}z>5Z6H1HyU7YV#b;3Wbt5qNnE!c2pJ1&0>FGK*RBBJeal`MI9Mo+(cR
zyHJPK`Cy5@Dk%=vGbvK5RFJq(SlC6sBkY(nYq#1DA$$eRK4QQBl<ydNB0=L<cm2U4
zP$om6n}79zk8F13MHay}Gdt&>oJ&M?_{G@<zTmK)RC;osKC1K))*tGF?c;Il!=L(o
zP5BP>f!A!`l>ZUV@33I)u#gkj4uipJ!;cgPO*hq%zItJ%&0zv6-uMsdCY?YMIMVm^
z1(7;&4$AHA-S!>1y1w3CZ(UM&WW%kyCW%9K7-ZjHvA?K0DOOh;Y2YG<t9ufeSVv(&
zHE8I)8gj43&0Eg^Z6Zk%>AH{f#oA|Rz>t7(<LsdZqB5XCVMGDrk8Sl`5OPg+QPceT
zHKWPK*&<#YQd@^e1gnu<bfy*f@^3aB*(aBfZth^|w!49<R(fWt%pA&FK2Ol7yD4S6
z+!Y9WY8_(i_FNcznDZIW<#(M|wfkAQ;NHAEN#%jmxC2bNw*z!i*aGB&Gs!wKnX{(c
z(e26j;AW<1Y^TWU1F<UNsiLrZZYs-@d-$kfIA{Y$8n`GJwy0?_Dg|v?&q_g>*7OjX
zhen<Ta}foM*Go0f4fD9CTklYhA>+TbgrIJdW%f@j^D*kBzh@sqVIR9r$L`Dv4@Vv*
z9;U~T1Bue=o~OS5NI)-o4`LvG^tJ83#G?1NuOz|a*jEwkYu2|Uxld4E200H+Uml=`
zr9TY7v(o>f(DW+vIT%08+$Vc)FrN$23(Uvir@zB|B5)sFJ`2HH$?rw@QRP8=5Wg`o
z`04(45a_4+Z%nj5z&}V2(jOt=A9KH!+(th=el6+bLHa+I#t+grApQA^OYWa~_tYOs
z-?sEmJ*{?4?rna0`{Cwh`}S`8uBL`D5N?3s&4+(c6XZYz`MIV3+xf-M^!EDd)?_l|
c=-t)b75)2-`fKf0>sR&fH?%avmZ&ZO00P)P6951J

diff --git a/man/get_eff_frq_allele_combns.Rd b/man/get_eff_frq_allele_combns.Rd
new file mode 100644
index 00000000..2d6f228c
--- /dev/null
+++ b/man/get_eff_frq_allele_combns.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_eff_frq_allele_combns.R
+\name{get_eff_frq_allele_combns}
+\alias{get_eff_frq_allele_combns}
+\title{Get combinations of uncorrected allele and effect (and frq) columns}
+\usage{
+get_eff_frq_allele_combns(
+  mapping_file = sumstatsColHeaders,
+  eff_frq_cols = c("BETA", "OR", "LOG_ODDS", "SIGNED_SUMSTAT", "Z", "FRQ")
+)
+}
+\arguments{
+\item{mapping_file}{MungeSumstats has a pre-defined column-name mapping file
+which should cover the most common column headers and their interpretations.
+However, if a column header that is in youf file is missing of the mapping we
+give is incorrect you can supply your own mapping file. Must be a 2 column
+dataframe with column names "Uncorrected" and "Corrected". See
+data(sumstatsColHeaders) for default mapping and necessary format.}
+
+\item{eff_frq_cols}{Corrected effect or frequency column names found in a
+sumstats. Default of BETA, OR, LOG_ODDS, SIGNED_SUMSTAT, Z and FRQ.}
+}
+\value{
+datatable containing uncorrected and corrected combinations
+}
+\description{
+Get combinations of uncorrected allele and effect (and frq) columns
+}
diff --git a/man/get_genome_build.Rd b/man/get_genome_build.Rd
index 6ee12d1b..9ffce59c 100644
--- a/man/get_genome_build.Rd
+++ b/man/get_genome_build.Rd
@@ -12,7 +12,9 @@ get_genome_build(
   standardise_headers = TRUE,
   mapping_file = sumstatsColHeaders,
   dbSNP = 155,
-  header_only = FALSE
+  header_only = FALSE,
+  allele_match_ref = FALSE,
+  ref_genome = NULL
 )
 }
 \arguments{
@@ -41,6 +43,13 @@ dataframe with column names "Uncorrected" and "Corrected". See
 only read in the first N rows where N=\code{sampled_snps}.
 This should help speed up cases where you have to read in \code{sumstats}
 from disk each time.}
+
+\item{allele_match_ref}{Instead of returning the genome_build this will
+return the propotion of matches to each genome build for each allele (A1,A2).}
+
+\item{ref_genome}{name of the reference genome used for the GWAS ("GRCh37" or
+"GRCh38"). Argument is case-insensitive. Default is NULL which infers the
+reference genome from the data.}
 }
 \value{
 ref_genome the genome build of the data
diff --git a/man/infer_effect_column.Rd b/man/infer_effect_column.Rd
new file mode 100644
index 00000000..cb6a08d7
--- /dev/null
+++ b/man/infer_effect_column.Rd
@@ -0,0 +1,73 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/infer_effect_column.R
+\name{infer_effect_column}
+\alias{infer_effect_column}
+\title{Infer if effect relates to a1 or A2 if ambiguously named}
+\usage{
+infer_effect_column(
+  sumstats_dt,
+  dbSNP = 155,
+  sampled_snps = 10000,
+  mapping_file = sumstatsColHeaders,
+  nThread = nThread,
+  ref_genome = NULL,
+  on_ref_genome = TRUE,
+  return_list = TRUE
+)
+}
+\arguments{
+\item{sumstats_dt}{data table obj of the summary statistics file for the
+GWAS.}
+
+\item{dbSNP}{version of dbSNP to be used for imputation (144 or 155).}
+
+\item{sampled_snps}{Downsample the number of SNPs used when inferring genome
+build to save time.}
+
+\item{mapping_file}{MungeSumstats has a pre-defined column-name mapping file
+which should cover the most common column headers and their interpretations.
+However, if a column header that is in youf file is missing of the mapping we
+give is incorrect you can supply your own mapping file. Must be a 2 column
+dataframe with column names "Uncorrected" and "Corrected". See
+data(sumstatsColHeaders) for default mapping and necessary format.}
+
+\item{nThread}{Number of threads to use for parallel processes.}
+
+\item{ref_genome}{name of the reference genome used for the GWAS ("GRCh37" or
+"GRCh38"). Argument is case-insensitive. Default is NULL which infers the
+reference genome from the data.}
+
+\item{on_ref_genome}{Binary Should a check take place that all SNPs are on
+the reference genome by SNP ID. Default is TRUE.}
+
+\item{return_list}{Return the \code{sumstats_dt} within a named list
+(default: \code{TRUE}).}
+}
+\value{
+list containing sumstats_dt, the modified summary statistics data
+table object
+}
+\description{
+Three checks are made to infer which allele the effect/frequency information
+relates to if they are ambiguous (named A1 and A2 or equivalent):
+\enumerate{
+\item Check if ambiguous naming conventions are used (i.e. allele 1 and 2 or
+equivalent). If not exit, otherwise continue to next checks. This can be
+checked by using the mapping file and splitting A1/A2 mappings by those that
+contain 1 or 2 (ambiguous) or doesn't contain 1 or 2 e.g. effect,
+tested (unambiguous so fine for MSS to handle as is).
+\item Look for effect column/frequency column where the A1/A2 explicitly
+mentioned, if found then  we know the direction and should update A1/A2
+naming so A2 is the effect column. We can look for such columns by getting
+every combination of A1/A2 naming and effect/frq naming.
+\item If note found in 2, a final check should be against the reference genome,
+whichever of A1 and A2 has more of a match with the reference genome should
+be taken as \strong{not} the effect allele. There is an assumption in this but is
+still better than guessing the ambiguous allele naming.
+}
+}
+\examples{
+sumstats <- MungeSumstats::formatted_example()
+#for speed, don't run on_ref_genome part of check (on_ref_genome = FALSE)
+sumstats_dt2<-infer_effect_column(sumstats_dt=sumstats,on_ref_genome = FALSE)
+}
diff --git a/tests/testthat/test-infer_effect_column.R b/tests/testthat/test-infer_effect_column.R
new file mode 100644
index 00000000..2656f15c
--- /dev/null
+++ b/tests/testthat/test-infer_effect_column.R
@@ -0,0 +1,95 @@
+test_that("Test infer effect column function works", {
+  ## Call uses reference genome as default with more than 2GB of memory,
+  ## which is more than what 32-bit Windows can handle so remove tests
+  is_32bit_windows <-
+    .Platform$OS.type == "windows" && .Platform$r_arch == "i386"
+  if (!is_32bit_windows) {
+    a <- MungeSumstats::formatted_example()
+    b <- copy(a)
+    
+    #check if inferring effect direction is working
+    
+    #make A1 the effect col not A2
+    #change FRQ and BETA to reflect this change
+    data.table::setnames(b,"A2","A3")
+    data.table::setnames(b,"A1","A2")
+    data.table::setnames(b,"A3","A1")
+    c <- copy(b)
+    data.table::setnames(b,"BETA","BETA1")
+    data.table::setnames(c,"FRQ","A1FRQ")
+    
+    # if the allele columns are named unambiguously, formatting will be performed 
+    # correctly - use this to test against as eff will be flipped compared to org
+    b_renamed <- copy(b)
+    data.table::setnames(b_renamed, "A1", "effect_allele")
+    data.table::setnames(b_renamed, "A2", "non_effect_allele")
+    b_renamed_for <- MungeSumstats::format_sumstats(b_renamed, return_data = TRUE,
+                                                    #all just make MSS run faster
+                                                    ref_genome = 'GRCh37',
+                                                    on_ref_genome = FALSE,
+                                                    strand_ambig_filter = FALSE,
+                                                    bi_allelic_filter = FALSE,
+                                                    allele_flip_check = FALSE,
+                                                    dbSNP=144)
+    
+    # what if BETA gives the direction
+    b_for <- MungeSumstats::format_sumstats(b, return_data = TRUE, 
+                                            #all just make MSS run faster
+                                            ref_genome = 'GRCh37',
+                                            on_ref_genome = FALSE,
+                                            strand_ambig_filter = FALSE,
+                                            bi_allelic_filter = FALSE,
+                                            allele_flip_check = FALSE,
+                                            dbSNP=144)
+    
+    # what if FRQ gives the direction
+    c_for <- MungeSumstats::format_sumstats(b, return_data = TRUE, 
+                                            #all just make MSS run faster
+                                            ref_genome = 'GRCh37',
+                                            on_ref_genome = FALSE,
+                                            strand_ambig_filter = FALSE,
+                                            bi_allelic_filter = FALSE,
+                                            allele_flip_check = FALSE,
+                                            dbSNP=144)
+    
+    testthat::expect_equal(
+      all.equal(b_renamed_for, b_for,ignore.col.order = TRUE),
+      TRUE
+    )
+    
+    testthat::expect_equal(
+      all.equal(b_renamed_for, c_for,ignore.col.order = TRUE),
+      TRUE
+    )
+    
+    
+    #finally check if the ref genome can be used to infer rather than being told in 
+    #eff/frq cols - just subset for speed
+    snps <- c("rs11210860","rs34305371","rs1008078","rs11588857","rs1777827",
+              "rs76076331","rs2457660","rs10496091","rs4851251","rs12987662",
+              "rs10930008","rs301800","rs2568955","rs61787263","rs2992632",
+              "rs11689269","rs11690172")
+    d <- copy(b)
+    data.table::setnames(d,"BETA1","BETA")
+    d_for <- MungeSumstats::format_sumstats(d[SNP %in% snps,], return_data = TRUE, 
+                                            on_ref_genome = TRUE,
+                                            #all just make MSS run faster
+                                            ref_genome = 'GRCh37',
+                                            strand_ambig_filter = FALSE,
+                                            bi_allelic_filter = FALSE,
+                                            allele_flip_check = FALSE,
+                                            dbSNP=144)
+    data.table::setkey(b_renamed_for,"SNP")
+    data.table::setkey(d_for,"SNP")
+    testthat::expect_equal(
+      all.equal(b_renamed_for[SNP %in% snps], d_for,
+                ignore.col.order = TRUE),
+      TRUE
+    )
+  }    
+  else{
+    testthat::expect_equal(is_32bit_windows, TRUE)
+    testthat::expect_equal(is_32bit_windows, TRUE)
+    testthat::expect_equal(is_32bit_windows, TRUE)
+  }
+})