diff --git a/DESCRIPTION b/DESCRIPTION
index 2cba6af..b9a36fb 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: MungeSumstats
 Type: Package
 Title: Standardise summary statistics from GWAS
-Version: 1.9.17
+Version: 1.9.18
 Authors@R:
     c(person(given = "Alan",
            family = "Murphy",
diff --git a/NAMESPACE b/NAMESPACE
index 4761e75..03c8c9a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -8,6 +8,7 @@ export(formatted_example)
 export(get_genome_builds)
 export(import_sumstats)
 export(index_tabular)
+export(infer_effect_column)
 export(liftover)
 export(list_sumstats)
 export(load_snp_loc_data)
diff --git a/NEWS.md b/NEWS.md
index 5b55a28..c3c10cf 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,44 @@
+## CHANGES IN VERSION 1.9.18
+
+### Bug fix
+* Fixed column header mappings
+  * Made all uncorrected header names uppercase and removed duplicates
+  * "TOTALSAMPLESIZE" now maps to "N" instead of "NSTUDY"
+  * "MAJORALLELE", "MAJOR_ALLELE", "MAJOR-ALLELE", and "MAJOR ALLELE" now map to 
+    "A1" instead of "A2"
+  * Removed the mappings for "OR-A1", "OR.A1", "OR_A1", and "BETA1" because MSS 
+    assumes that A2 is the effect allele
+  * Removed mappings for "A1FREQ", "A1FRQ", "AF1", "FREQ.A1.1000G.EUR", 
+    "FREQ.A1.ESP.EUR", "FREQ.ALLELE1.HAPMAPCEU", "FREQ1", "FREQ1.HAPMAP", and 
+    "FRQ_A1" because MSS defines "FRQ" to be the allele frequency of A2
+  * Removed mappings for "CHR36", "BASE_GRCH36", "POSITION36", "POSGRCH36", 
+    "BASEGRCH36", "POS36", "POS GRCH36", "POS.GRCH36", "POS-GRCH36", and 
+    "POS_GRCH36" 
+    because MSS does not support the GRCh36 genome build
+  * Removed the ambiguous mapping "NMISS" -> "N" because "NMISS" can refer to 
+    the number of samples with missing data
+  * Removed the ambiguous mapping "WEIGHT" -> "N" because "WEIGHT" can refer to 
+    coefficient weights
+* Fixed inference of allele where ambiguous (A1, A2) naming used (see 
+  infer_effect_column.R for code) but in short:
+  * Three checks now made to infer which allele the effect/frequency information
+    relates to. See infer_effect_column.R for further details.
+  * See get_eff_frq_allele_combns.R for how effect/frequency columns that infer 
+    the allele are captured in the mapping file
+
+### New features
+* New column header mappings:
+  * "VARIANT_ID" and "RSIDS" --> "SNP"
+  * "P_BOLT_LMM" --> "P"
+  * "NCASES" --> "N_CAS"
+  * "N_EFFECTIVE", "N_INFORMATIVE", and "TOTAL_N" --> "N"
+  * "HET_P" --> "HETPVAL"
+  * "HET_ISQ" --> "HETISQT"
+  * "ALL_AF" --> "FRQ"
+  * "DIRECT" --> "DIRECTION"
+  * "ALT_EFFSIZE" --> "BETA"
+  * "INFORMATIVE_ALT_AC" --> "AC"
+
 ## CHANGES IN VERSION 1.9.17
 
 ### Bug fix
diff --git a/R/format_sumstats.R b/R/format_sumstats.R
index ce517e5..e60aee4 100644
--- a/R/format_sumstats.R
+++ b/R/format_sumstats.R
@@ -445,6 +445,17 @@ format_sumstats <- function(path,
           mapping_file <- rbind(mapping_file,es_cols)
         }
 
+        #### Check 2:Check for effect direction ####
+        sumstats_return <-
+          infer_effect_column(
+            sumstats_dt = sumstats_return$sumstats_dt,
+            mapping_file = mapping_file,
+            dbSNP = dbSNP,
+            nThread = nThread,
+            ref_genome = ref_genome,
+            on_ref_genome = on_ref_genome
+          )
+        
         #### Check 3:Standardise headers for all OS ####
         sumstats_return <-
             standardise_sumstats_column_headers_crossplatform(
diff --git a/R/get_eff_frq_allele_combns.R b/R/get_eff_frq_allele_combns.R
new file mode 100644
index 0000000..e6e9375
--- /dev/null
+++ b/R/get_eff_frq_allele_combns.R
@@ -0,0 +1,77 @@
+#' Get combinations of uncorrected allele and effect (and frq) columns
+#'
+#' @inheritParams format_sumstats 
+#' @inheritParams compute_nsize
+#' @param eff_frq_cols Corrected effect or frequency column names found in a 
+#' sumstats. Default of BETA, OR, LOG_ODDS, SIGNED_SUMSTAT, Z and FRQ.
+#' @return datatable containing uncorrected and corrected combinations
+#' @importFrom data.table setnames as.data.table := setkey rbindlist data.table
+get_eff_frq_allele_combns <-
+  function(mapping_file = sumstatsColHeaders, 
+           eff_frq_cols = c("BETA", "OR", "LOG_ODDS", "SIGNED_SUMSTAT","Z",
+                            "FRQ")) {
+    ### Add this to avoid confusing BiocCheck
+    CORRECTED <- UNCORRECTED <- Var1 <- Var2 <- NULL 
+    colnames(mapping_file) <- toupper(colnames(mapping_file))
+    #get allele associated effect/FRQ columns
+    #get all combinations with allele columns
+    eff_frq_cols_uncorrc <- 
+      mapping_file[mapping_file$CORRECTED %in% eff_frq_cols,]$UNCORRECTED
+    #join with all allele cols
+    allele_uncorrc <- 
+      mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),]$UNCORRECTED
+    #get combinations
+    eff_frq_allele_dt <- 
+      data.table::as.data.table(expand.grid(eff_frq_cols_uncorrc, 
+                                            allele_uncorrc))
+    mapping_file_dt <- data.table::as.data.table(mapping_file)
+    data.table::setkey(mapping_file_dt,"UNCORRECTED")
+    data.table::setkey(eff_frq_allele_dt,"Var1")
+    #add corrected
+    eff_frq_allele_dt[mapping_file,CORRECTED:=CORRECTED,]
+    #now loop through every joining character and join with eff both before
+    #and after
+    joining_char <- c("","_",".","-"," ")
+    all_combns <- vector(mode="list",length = length(joining_char)*2)
+    counter <- 1
+    for(join_i in joining_char){
+      eff_frq_allele_dt_i <- copy(eff_frq_allele_dt)  
+      eff_frq_allele_dt_i[,UNCORRECTED:=paste0(Var1,join_i,Var2)]
+      all_combns[[counter]] <- 
+        eff_frq_allele_dt_i[,c("UNCORRECTED","CORRECTED")]
+      counter <- counter+1
+      #same for Var 2 in front
+      eff_frq_allele_dt_i <- copy(eff_frq_allele_dt)  
+      eff_frq_allele_dt_i[,UNCORRECTED:=paste0(Var2,join_i,Var1)]
+      all_combns[[counter]] <- 
+        eff_frq_allele_dt_i[,c("UNCORRECTED","CORRECTED")]
+      counter <- counter+1
+    }
+    #join all together
+    eff_frq_allele_matches <- data.table::rbindlist(all_combns)
+    #finally add some custom ones 
+    custom_adds <- data.table::data.table("UNCORRECTED" = 
+                                            c("BETA1", "BETA2","AF1","AF2",
+                                              "FREQ.A1.1000G.EUR",
+                                              "FREQ.A2.1000G.EUR",
+                                              "FREQ.A1.ESP.EUR",
+                                              "FREQ.A2.ESP.EUR",
+                                              "FREQ.ALLELE1.HAPMAPCEU",
+                                              "FREQ.ALLELE2.HAPMAPCEU",
+                                              "FREQ1","FREQ2", 
+                                              "FREQ1.HAPMAP","FREQ2.HAPMAP"),
+                                          "CORRECTED" = 
+                                            c("BETA", "BETA","FRQ","FRQ",
+                                              "FRQ",
+                                              "FRQ",
+                                              "FRQ",
+                                              "FRQ",
+                                              "FRQ",
+                                              "FRQ",
+                                              "FRQ","FRQ",
+                                              "FRQ","FRQ"))
+    eff_frq_allele_matches <- data.table::rbindlist(list(
+      eff_frq_allele_matches,custom_adds))
+    
+    return(eff_frq_allele_matches)
+  }
diff --git a/R/get_genome_build.R b/R/get_genome_build.R
index dbd5b12..0209ced 100644
--- a/R/get_genome_build.R
+++ b/R/get_genome_build.R
@@ -21,9 +21,12 @@
 #' only read in the first N rows where N=\code{sampled_snps}.
 #' This should help speed up cases where you have to read in \code{sumstats}
 #' from disk each time.
-#'
+#' @param allele_match_ref Instead of returning the genome_build this will 
+#' return the propotion of matches to each genome build for each allele (A1,A2).
+#' @inheritParams format_sumstats 
+#' 
 #' @return ref_genome the genome build of the data
-#' @importFrom data.table setDT
+#' @importFrom data.table setDT :=
 #' @keywords internal
 get_genome_build <- function(sumstats,
                              nThread = 1,
@@ -31,11 +34,14 @@ get_genome_build <- function(sumstats,
                              standardise_headers = TRUE,
                              mapping_file = sumstatsColHeaders,
                              dbSNP=155,
-                             header_only = FALSE) {
+                             header_only = FALSE,
+                             allele_match_ref = FALSE,
+                             ref_genome = NULL) {
     ### Add this to avoid confusing BiocCheck
-    seqnames <- CHR <- SNP <- BP <- NULL 
-
-    message("Inferring genome build.")
+    seqnames <- CHR <- SNP <- BP <- alt_alleles <- NULL 
+    if(isFALSE(allele_match_ref)){
+      message("Inferring genome build.")
+    }
     # if not a data.table, must be a path
     if (!is.data.frame(sumstats)) {
         # Checking if the file exists should happen first
@@ -82,7 +88,13 @@ get_genome_build <- function(sumstats,
         )
     # Infer genome build using SNP & CHR & BP
     if (!all(c("SNP", "CHR", "BP") %in% colnames(sumstats))) {
+      #want it returned rather than throwing an error
+      if(isTRUE(allele_match_ref)){
+        return(err_msg)
+      }
+      else{  
         stop(err_msg)
+      }
     }
 
     #### Do some filtering first to avoid errors ####
@@ -113,7 +125,13 @@ get_genome_build <- function(sumstats,
         size_okay <- TRUE
     }
     if (!size_okay) {
+      #want it returned rather than throwing an error
+      if(isTRUE(allele_match_ref)){
+        return(err_msg2)
+      }
+      else{  
         stop(err_msg2)
+      }
     }
     #### Downsample SNPs to save time ####
     if ((nrow(sumstats) > sampled_snps) && !(is.null(sampled_snps))) {
@@ -124,58 +142,145 @@ get_genome_build <- function(sumstats,
 
     sumstats <- sumstats[SNP %in% snps, ]
 
-    # otherwise SNP, CHR, BP were all found and can infer
-    snp_loc_data_37 <- load_ref_genome_data(
+    #now split into functions two roles
+    if(isTRUE(allele_match_ref)){
+      #1. checking for matches for A1/A2 to ref genomes
+      if (is.null(ref_genome)){
+        #have to check multiple
+        snp_loc_data_37 <- load_ref_genome_data(
+          snps = snps,
+          ref_genome = "GRCH37",
+          dbSNP = dbSNP
+        )
+        snp_loc_data_38 <- load_ref_genome_data(
+          snps = snps,
+          ref_genome = "GRCH38",
+          dbSNP = dbSNP
+        )
+        # convert CHR filed in ref genomes to character not factor
+        snp_loc_data_37[, seqnames := as.character(seqnames)]
+        snp_loc_data_38[, seqnames := as.character(seqnames)]
+        # convert CHR filed in data to character if not already
+        sumstats[, CHR := as.character(CHR)]
+        # Now check which genome build has more matches to data
+        num_37 <-
+          nrow(snp_loc_data_37[sumstats, ,
+                               on = c("SNP"="SNP","pos"="BP","seqnames"="CHR"),
+                               nomatch = FALSE
+          ])
+        num_38 <-
+          nrow(snp_loc_data_38[sumstats, ,
+                               on = c("SNP"="SNP","pos"="BP","seqnames"="CHR"),
+                               nomatch = FALSE
+          ])
+        if (num_37 > num_38) {
+          ref_gen_num <- num_37
+          ref_genome <- "GRCH37"
+          snp_loc_data <- snp_loc_data_37
+          
+        } else {
+          ref_gen_num <- num_38
+          ref_genome <- "GRCH38"
+          snp_loc_data <- snp_loc_data_38
+        }
+      }
+      else{
+        #only check one chosen
+        snp_loc_data <- load_ref_genome_data(
+          snps = snps,
+          ref_genome = ref_genome,
+          dbSNP = dbSNP
+        )
+        # convert CHR filed in ref genomes to character not factor
+        snp_loc_data[, seqnames := as.character(seqnames)]
+        # convert CHR filed in data to character if not already
+        sumstats[, CHR := as.character(CHR)]
+      }
+      # Now check which allele has more matches to data
+      # want to match on ref and alt alleles too 
+      # need to take first alt from list to do this
+      snp_loc_data[,alt_alleles := unlist(lapply(alt_alleles, 
+                                                 function(x) x[[1]]))]
+      num_a1 <-
+        nrow(snp_loc_data[sumstats, ,
+                          on = c("SNP"="SNP","pos"="BP","seqnames"="CHR",
+                                 "ref_allele"="A1","alt_alleles"="A2"),
+                          nomatch = FALSE
+        ])
+      num_a2 <-
+        nrow(snp_loc_data[sumstats, ,
+                          on = c("SNP"="SNP","pos"="BP","seqnames"="CHR",
+                                 "ref_allele"="A2","alt_alleles"="A1"),
+                          nomatch = FALSE
+        ])
+      if(num_a2>=num_a1){
+        message("Effect/frq column(s) relate to A2 in the inputted sumstats")
+        #this is what MSS expects so no action required
+        switch_req <- FALSE
+      }else{#num_a2<num_a1
+        message("Effect/frq column(s) relate to A1 in the inputted sumstats")
+        switch_req <- TRUE
+      }
+      
+      return(switch_req)
+    }  
+    else{
+      #2. checking for ref genome
+      # otherwise SNP, CHR, BP were all found and can infer
+      snp_loc_data_37 <- load_ref_genome_data(
         snps = snps,
         ref_genome = "GRCH37",
         dbSNP = dbSNP
-    )
-    snp_loc_data_38 <- load_ref_genome_data(
+      )
+      snp_loc_data_38 <- load_ref_genome_data(
         snps = snps,
         ref_genome = "GRCH38",
         dbSNP = dbSNP
-    )
-    # convert CHR filed in ref genomes to character not factor
-    snp_loc_data_37[, seqnames := as.character(seqnames)]
-    snp_loc_data_38[, seqnames := as.character(seqnames)]
-    # convert CHR filed in data to character if not already
-    sumstats[, CHR := as.character(CHR)]
-    # Now check which genome build has more matches to data
-    num_37 <-
+      )
+      # convert CHR filed in ref genomes to character not factor
+      snp_loc_data_37[, seqnames := as.character(seqnames)]
+      snp_loc_data_38[, seqnames := as.character(seqnames)]
+      # convert CHR filed in data to character if not already
+      sumstats[, CHR := as.character(CHR)]
+      # Now check which genome build has more matches to data
+      num_37 <-
         nrow(snp_loc_data_37[sumstats, ,
-            on = c("SNP" = "SNP", "pos" = "BP", "seqnames" = "CHR"),
-            nomatch = FALSE
+                             on = c("SNP"="SNP", "pos"="BP", "seqnames"="CHR"),
+                             nomatch = FALSE
         ])
-    num_38 <-
+      num_38 <-
         nrow(snp_loc_data_38[sumstats, ,
-            on = c("SNP" = "SNP", "pos" = "BP", "seqnames" = "CHR"),
-            nomatch = FALSE
+                             on = c("SNP"="SNP", "pos"="BP", "seqnames"="CHR"),
+                             nomatch = FALSE
         ])
-    #if no matches throw error
-    if(num_37==0 && num_38==0){
-      msg_err <- paste0("No matches found in either reference genome for your ",
-                        "SNPs.\nPlease check their formatting (SNP, CHR and BP",
-                        " columns) or supply the genome build.")
-      stop(msg_err)
-    }
-    if (num_37 > num_38) {
+      #if no matches throw error
+      if(num_37==0 && num_38==0){
+        msg_err <- 
+          paste0("No matches found in either reference genome for your ",
+                 "SNPs.\nPlease check their formatting (SNP, CHR and BP",
+                 " columns) or supply the genome build.")
+        stop(msg_err)
+      }
+      if (num_37 > num_38) {
         ref_gen_num <- num_37
         ref_genome <- "GRCH37"
-    } else {
+      } else {
         ref_gen_num <- num_38
         ref_genome <- "GRCH38"
-    }
-
-    message("Inferred genome build: ", ref_genome)
-    # add a warning if low proportion of matches found
-    msg <- paste0(
+      }
+      
+      message("Inferred genome build: ", ref_genome)
+      # add a warning if low proportion of matches found
+      msg <- paste0(
         "WARNING: Less than 10% of your sampled SNPs matched that of ",
         "either reference genome, this may question the quality of ",
         "your summary statistics file."
-    )
-    if (ref_gen_num / length(snps) < 0.1) {
+      )
+      if (ref_gen_num / length(snps) < 0.1) {
         message(msg)
+      }
+      
+      return(ref_genome)
+      
     }
-
-    return(ref_genome)
 }
diff --git a/R/infer_effect_column.R b/R/infer_effect_column.R
new file mode 100644
index 0000000..f70fbc6
--- /dev/null
+++ b/R/infer_effect_column.R
@@ -0,0 +1,182 @@
+#' Infer if effect relates to a1 or A2 if ambiguously named
+#'
+#' Three checks are made to infer which allele the effect/frequency information
+#' relates to if they are ambiguous (named A1 and A2 or equivalent):
+#' 1. Check if ambiguous naming conventions are used (i.e. allele 1 and 2 or 
+#' equivalent). If not exit, otherwise continue to next checks. This can be 
+#' checked by using the mapping file and splitting A1/A2 mappings by those that 
+#' contain 1 or 2 (ambiguous) or doesn't contain 1 or 2 e.g. effect, 
+#' tested (unambiguous so fine for MSS to handle as is).
+#' 2. Look for effect column/frequency column where the A1/A2 explicitly 
+#' mentioned, if found then  we know the direction and should update A1/A2 
+#' naming so A2 is the effect column. We can look for such columns by getting 
+#' every combination of A1/A2 naming and effect/frq naming.
+#' 3. If note found in 2, a final check should be against the reference genome, 
+#' whichever of A1 and A2 has more of a match with the reference genome should 
+#' be taken as **not** the effect allele. There is an assumption in this but is 
+#' still better than guessing the ambiguous allele naming.
+#'
+#' @inheritParams format_sumstats 
+#' @inheritParams compute_nsize
+#' @inheritParams get_genome_build
+#' @return list containing sumstats_dt, the modified summary statistics data
+#' table object
+#' @export
+#' @importFrom data.table setnames as.data.table := setkey rbindlist data.table
+#' @examples
+#' sumstats <- MungeSumstats::formatted_example()
+#' #for speed, don't run on_ref_genome part of check (on_ref_genome = FALSE)
+#' sumstats_dt2<-infer_effect_column(sumstats_dt=sumstats,on_ref_genome = FALSE)
+infer_effect_column <-
+  function(sumstats_dt,
+           dbSNP=155,
+           sampled_snps = 10000,
+           mapping_file = sumstatsColHeaders,
+           nThread = nThread,
+           ref_genome = NULL,
+           on_ref_genome = TRUE,
+           return_list=TRUE) {
+    
+    message("Infer Effect Column")
+    message("First line of summary statistics file: ")
+    msg <- paste0(names(sumstats_dt), split = "\t")
+    message(msg)
+    #### first make all column headers upper case ####
+    column_headers <- names(sumstats_dt) 
+    # load synonym mapping - internal data no loading
+    # Identify allele mappings which are ambiguous and problematic
+    # vs those that are interpretable
+    colnames(mapping_file) <- toupper(colnames(mapping_file))
+    allele_mapping <- mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),]
+    ambig_allele_map <- allele_mapping[grepl('1',allele_mapping$UNCORRECTED)|
+                                         grepl('2',allele_mapping$UNCORRECTED),]
+    unambig_allele_map <- 
+      allele_mapping[!(grepl('1',allele_mapping$UNCORRECTED)|
+                         grepl('2',allele_mapping$UNCORRECTED)),]
+    #as long as the sumstats contains 1 unambiguous allele column MSS will work
+    #as expected
+    unambig_cols <- intersect(unambig_allele_map$UNCORRECTED,
+                              toupper(column_headers))
+    ambig_cols <- intersect(ambig_allele_map$UNCORRECTED,
+                            toupper(column_headers))
+    #if both ambiguous and unambiguous columns found, rename ambiguous ones so
+    #they aren't used later by MSS
+    #example: 'A1','A2','EFFECT_ALLELE' all present
+    if (length(unambig_cols)>0 && length(ambig_cols)>0){
+      #find if unambig and ambig relate to the same allele
+      #get corrected name for unambig 
+      unambig_corrcted <- 
+        unique(allele_mapping[allele_mapping$UNCORRECTED %in% unambig_cols,
+                              ]$CORRECTED)
+      #check if any ambig are to the same allele
+      ambig_corrcted <- 
+        unique(allele_mapping[allele_mapping$UNCORRECTED %in% ambig_cols,
+                              ]$CORRECTED)
+      #overlap?
+      ambig_corrcted_rnme <- 
+        ambig_corrcted[ambig_corrcted %in% unambig_corrcted]
+      if (length(ambig_corrcted_rnme)>0){
+        message("There are multiple columns relating to the same allele info")
+        message("Renaming ambiguous allele columns so they won't be used")
+        #get the related ambig naming and change there name so won't be used
+        ambig_uncorrcted_rnme <- 
+          ambig_allele_map[ambig_allele_map$CORRECTED %in% ambig_corrcted_rnme,
+                           ]$UNCORRECTED
+        #now rename any matches in sumstats
+        chng_nmes <- column_headers[toupper(column_headers) %in% 
+                                     ambig_uncorrcted_rnme]
+        for(chng_i in chng_nmes){
+          data.table::setnames(sumstats_dt, chng_i, paste0(chng_i,"_INPUTTED"))
+        }
+      }
+    } else if (length(unambig_cols)==0 && length(ambig_cols)>=2){
+      #only continue if no unambiguous columns found but 2 ambig ones are found-
+      #less than 2 in total means allele info is missing which MSS can try fill
+      #in later
+      message("Allele columns are ambiguous, attempting to infer direction")
+      #get names for allele mared eff/frq columns
+      eff_frq_allele_matches <- get_eff_frq_allele_combns()
+      #now look for matches in sumstats
+      fnd_allele_indicator <- 
+        column_headers[toupper(column_headers) %in% 
+                         eff_frq_allele_matches$UNCORRECTED]
+      if(length(fnd_allele_indicator)>0){
+        message("Found direction from effect/frq column naming")
+        #fnd_allele_indicator could be >1 so majority vote
+        a1_mtch <- sum(grepl("A1",fnd_allele_indicator))
+        a2_mtch <- sum(grepl("A2",fnd_allele_indicator))
+        if(a2_mtch>=a1_mtch){
+          message("Effect/frq column(s) relate to A2 in the inputted sumstats")
+          #this is what MSS expects so no action required
+        }else{#a2_mtch<a1_mtch
+          message("Effect/frq column(s) relate to A1 in the inputted sumstats")
+          #this is the opposite to what MSS expects so switch A1/A2 naming
+          #first get corrected names for allele columns then switch
+          for (headerI in seq_len(nrow(mapping_file))) {
+            un <- mapping_file[headerI, "UNCORRECTED"]
+            cr <- mapping_file[headerI, "CORRECTED"]
+            #note ambig_cols here not all col headers
+            if (un %in% ambig_cols & (!cr %in% column_headers)) {
+              data.table::setnames(sumstats_dt, un, cr)
+            }
+          }
+          #now switch
+          data.table::setnames(sumstats_dt,"A2","A2_INPUTTED_OLD_")
+          data.table::setnames(sumstats_dt,"A1","A2")
+          data.table::setnames(sumstats_dt,"A2_INPUTTED_OLD_","A1")
+        }
+        
+      }
+      else{
+        #didn't find any allele specific
+        #last option is to check the reference genome and take the allele with
+        #more matches to it as the non-effect allele
+        #need to standardise the sumstats for this
+        #use get_genome_build function for this
+        if(isTRUE(on_ref_genome)){
+          switch_req <- get_genome_build(
+            sumstats = sumstats_dt,
+            standardise_headers = TRUE,
+            sampled_snps = sampled_snps,
+            mapping_file = mapping_file,
+            dbSNP=dbSNP,
+            nThread = nThread,
+            allele_match_ref = TRUE,
+            ref_genome = ref_genome
+          )
+          if(is.logical(switch_req)){
+            if(isTRUE(switch_req)){
+              #swap A1 and A2
+              #this is the opposite to what MSS expects so switch A1/A2 naming
+              #first get corrected names for allele columns then switch
+              for (headerI in seq_len(nrow(mapping_file))) {
+                un <- mapping_file[headerI, "UNCORRECTED"]
+                cr <- mapping_file[headerI, "CORRECTED"]
+                #note ambig_cols here not all col headers
+                if (un %in% ambig_cols & (!cr %in% column_headers)) {
+                  data.table::setnames(sumstats_dt, un, cr)
+                }
+              }
+              #now switch
+              data.table::setnames(sumstats_dt,"A2","A2_INPUTTED_OLD_")
+              data.table::setnames(sumstats_dt,"A1","A2")
+              data.table::setnames(sumstats_dt,"A2_INPUTTED_OLD_","A1")
+            }
+          }
+          else{ #couldn't infer
+            message(switch_req)
+            message("Can't infer allele columns from sumstats")
+          }
+        }
+        else{
+          message("Can't infer allele columns from sumstats")
+        }
+      }
+    }
+    #### Return format ####
+    if(return_list){
+      return(list("sumstats_dt" = sumstats_dt))
+    }else {
+      return(sumstats_dt)
+    }
+  }    
diff --git a/R/standardise_sumstats_column_headers_crossplatform.R b/R/standardise_sumstats_column_headers_crossplatform.R
index ef9592a..34a1749 100644
--- a/R/standardise_sumstats_column_headers_crossplatform.R
+++ b/R/standardise_sumstats_column_headers_crossplatform.R
@@ -14,7 +14,7 @@
 #' @return list containing sumstats_dt, the modified summary statistics data
 #' table object
 #' @export
-#' @importFrom data.table setnames
+#' @importFrom data.table setnames rbindlist
 #' @examples
 #' sumstats_dt <- data.table::fread(system.file("extdata", "eduAttainOkbay.txt",
 #'                                              package = "MungeSumstats"))
@@ -41,6 +41,10 @@ standardise_header <- standardise_sumstats_column_headers_crossplatform <-
         # Go through each and get correct spelling
         # allow for differing cases column names
         colnames(mapping_file) <- toupper(colnames(mapping_file))
+        #get names for allele mared eff/frq columns
+        eff_frq_allele_matches <- get_eff_frq_allele_combns()
+        mapping_file <- rbind(mapping_file,
+                              as.data.frame(eff_frq_allele_matches))
         for (headerI in seq_len(nrow(mapping_file))) {
             un <- mapping_file[headerI, "UNCORRECTED"]
             cr <- mapping_file[headerI, "CORRECTED"]
diff --git a/R/sysdata.rda b/R/sysdata.rda
index 06028d0..3955c63 100644
Binary files a/R/sysdata.rda and b/R/sysdata.rda differ
diff --git a/data/sumstatsColHeaders.rda b/data/sumstatsColHeaders.rda
index 2960502..83f04f9 100644
Binary files a/data/sumstatsColHeaders.rda and b/data/sumstatsColHeaders.rda differ
diff --git a/man/get_eff_frq_allele_combns.Rd b/man/get_eff_frq_allele_combns.Rd
new file mode 100644
index 0000000..2d6f228
--- /dev/null
+++ b/man/get_eff_frq_allele_combns.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_eff_frq_allele_combns.R
+\name{get_eff_frq_allele_combns}
+\alias{get_eff_frq_allele_combns}
+\title{Get combinations of uncorrected allele and effect (and frq) columns}
+\usage{
+get_eff_frq_allele_combns(
+  mapping_file = sumstatsColHeaders,
+  eff_frq_cols = c("BETA", "OR", "LOG_ODDS", "SIGNED_SUMSTAT", "Z", "FRQ")
+)
+}
+\arguments{
+\item{mapping_file}{MungeSumstats has a pre-defined column-name mapping file
+which should cover the most common column headers and their interpretations.
+However, if a column header that is in youf file is missing of the mapping we
+give is incorrect you can supply your own mapping file. Must be a 2 column
+dataframe with column names "Uncorrected" and "Corrected". See
+data(sumstatsColHeaders) for default mapping and necessary format.}
+
+\item{eff_frq_cols}{Corrected effect or frequency column names found in a
+sumstats. Default of BETA, OR, LOG_ODDS, SIGNED_SUMSTAT, Z and FRQ.}
+}
+\value{
+datatable containing uncorrected and corrected combinations
+}
+\description{
+Get combinations of uncorrected allele and effect (and frq) columns
+}
diff --git a/man/get_genome_build.Rd b/man/get_genome_build.Rd
index 6ee12d1..9ffce59 100644
--- a/man/get_genome_build.Rd
+++ b/man/get_genome_build.Rd
@@ -12,7 +12,9 @@ get_genome_build(
   standardise_headers = TRUE,
   mapping_file = sumstatsColHeaders,
   dbSNP = 155,
-  header_only = FALSE
+  header_only = FALSE,
+  allele_match_ref = FALSE,
+  ref_genome = NULL
 )
 }
 \arguments{
@@ -41,6 +43,13 @@ dataframe with column names "Uncorrected" and "Corrected". See
 only read in the first N rows where N=\code{sampled_snps}.
 This should help speed up cases where you have to read in \code{sumstats}
 from disk each time.}
+
+\item{allele_match_ref}{Instead of returning the genome_build this will
+return the propotion of matches to each genome build for each allele (A1,A2).}
+
+\item{ref_genome}{name of the reference genome used for the GWAS ("GRCh37" or
+"GRCh38"). Argument is case-insensitive. Default is NULL which infers the
+reference genome from the data.}
 }
 \value{
 ref_genome the genome build of the data
diff --git a/man/infer_effect_column.Rd b/man/infer_effect_column.Rd
new file mode 100644
index 0000000..cb6a08d
--- /dev/null
+++ b/man/infer_effect_column.Rd
@@ -0,0 +1,73 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/infer_effect_column.R
+\name{infer_effect_column}
+\alias{infer_effect_column}
+\title{Infer if effect relates to a1 or A2 if ambiguously named}
+\usage{
+infer_effect_column(
+  sumstats_dt,
+  dbSNP = 155,
+  sampled_snps = 10000,
+  mapping_file = sumstatsColHeaders,
+  nThread = nThread,
+  ref_genome = NULL,
+  on_ref_genome = TRUE,
+  return_list = TRUE
+)
+}
+\arguments{
+\item{sumstats_dt}{data table obj of the summary statistics file for the
+GWAS.}
+
+\item{dbSNP}{version of dbSNP to be used for imputation (144 or 155).}
+
+\item{sampled_snps}{Downsample the number of SNPs used when inferring genome
+build to save time.}
+
+\item{mapping_file}{MungeSumstats has a pre-defined column-name mapping file
+which should cover the most common column headers and their interpretations.
+However, if a column header that is in youf file is missing of the mapping we
+give is incorrect you can supply your own mapping file. Must be a 2 column
+dataframe with column names "Uncorrected" and "Corrected". See
+data(sumstatsColHeaders) for default mapping and necessary format.}
+
+\item{nThread}{Number of threads to use for parallel processes.}
+
+\item{ref_genome}{name of the reference genome used for the GWAS ("GRCh37" or
+"GRCh38"). Argument is case-insensitive. Default is NULL which infers the
+reference genome from the data.}
+
+\item{on_ref_genome}{Binary Should a check take place that all SNPs are on
+the reference genome by SNP ID. Default is TRUE.}
+
+\item{return_list}{Return the \code{sumstats_dt} within a named list
+(default: \code{TRUE}).}
+}
+\value{
+list containing sumstats_dt, the modified summary statistics data
+table object
+}
+\description{
+Three checks are made to infer which allele the effect/frequency information
+relates to if they are ambiguous (named A1 and A2 or equivalent):
+\enumerate{
+\item Check if ambiguous naming conventions are used (i.e. allele 1 and 2 or
+equivalent). If not exit, otherwise continue to next checks. This can be
+checked by using the mapping file and splitting A1/A2 mappings by those that
+contain 1 or 2 (ambiguous) or doesn't contain 1 or 2 e.g. effect,
+tested (unambiguous so fine for MSS to handle as is).
+\item Look for effect column/frequency column where the A1/A2 explicitly
+mentioned, if found then  we know the direction and should update A1/A2
+naming so A2 is the effect column. We can look for such columns by getting
+every combination of A1/A2 naming and effect/frq naming.
+\item If note found in 2, a final check should be against the reference genome,
+whichever of A1 and A2 has more of a match with the reference genome should
+be taken as \strong{not} the effect allele. There is an assumption in this but is
+still better than guessing the ambiguous allele naming.
+}
+}
+\examples{
+sumstats <- MungeSumstats::formatted_example()
+#for speed, don't run on_ref_genome part of check (on_ref_genome = FALSE)
+sumstats_dt2<-infer_effect_column(sumstats_dt=sumstats,on_ref_genome = FALSE)
+}
diff --git a/tests/testthat/test-infer_effect_column.R b/tests/testthat/test-infer_effect_column.R
new file mode 100644
index 0000000..2656f15
--- /dev/null
+++ b/tests/testthat/test-infer_effect_column.R
@@ -0,0 +1,95 @@
+test_that("Test infer effect column function works", {
+  ## Call uses reference genome as default with more than 2GB of memory,
+  ## which is more than what 32-bit Windows can handle so remove tests
+  is_32bit_windows <-
+    .Platform$OS.type == "windows" && .Platform$r_arch == "i386"
+  if (!is_32bit_windows) {
+    a <- MungeSumstats::formatted_example()
+    b <- copy(a)
+    
+    #check if inferring effect direction is working
+    
+    #make A1 the effect col not A2
+    #change FRQ and BETA to reflect this change
+    data.table::setnames(b,"A2","A3")
+    data.table::setnames(b,"A1","A2")
+    data.table::setnames(b,"A3","A1")
+    c <- copy(b)
+    data.table::setnames(b,"BETA","BETA1")
+    data.table::setnames(c,"FRQ","A1FRQ")
+    
+    # if the allele columns are named unambiguously, formatting will be performed 
+    # correctly - use this to test against as eff will be flipped compared to org
+    b_renamed <- copy(b)
+    data.table::setnames(b_renamed, "A1", "effect_allele")
+    data.table::setnames(b_renamed, "A2", "non_effect_allele")
+    b_renamed_for <- MungeSumstats::format_sumstats(b_renamed, return_data = TRUE,
+                                                    #all just make MSS run faster
+                                                    ref_genome = 'GRCh37',
+                                                    on_ref_genome = FALSE,
+                                                    strand_ambig_filter = FALSE,
+                                                    bi_allelic_filter = FALSE,
+                                                    allele_flip_check = FALSE,
+                                                    dbSNP=144)
+    
+    # what if BETA gives the direction
+    b_for <- MungeSumstats::format_sumstats(b, return_data = TRUE, 
+                                            #all just make MSS run faster
+                                            ref_genome = 'GRCh37',
+                                            on_ref_genome = FALSE,
+                                            strand_ambig_filter = FALSE,
+                                            bi_allelic_filter = FALSE,
+                                            allele_flip_check = FALSE,
+                                            dbSNP=144)
+    
+    # what if FRQ gives the direction
+    c_for <- MungeSumstats::format_sumstats(b, return_data = TRUE, 
+                                            #all just make MSS run faster
+                                            ref_genome = 'GRCh37',
+                                            on_ref_genome = FALSE,
+                                            strand_ambig_filter = FALSE,
+                                            bi_allelic_filter = FALSE,
+                                            allele_flip_check = FALSE,
+                                            dbSNP=144)
+    
+    testthat::expect_equal(
+      all.equal(b_renamed_for, b_for,ignore.col.order = TRUE),
+      TRUE
+    )
+    
+    testthat::expect_equal(
+      all.equal(b_renamed_for, c_for,ignore.col.order = TRUE),
+      TRUE
+    )
+    
+    
+    #finally check if the ref genome can be used to infer rather than being told in 
+    #eff/frq cols - just subset for speed
+    snps <- c("rs11210860","rs34305371","rs1008078","rs11588857","rs1777827",
+              "rs76076331","rs2457660","rs10496091","rs4851251","rs12987662",
+              "rs10930008","rs301800","rs2568955","rs61787263","rs2992632",
+              "rs11689269","rs11690172")
+    d <- copy(b)
+    data.table::setnames(d,"BETA1","BETA")
+    d_for <- MungeSumstats::format_sumstats(d[SNP %in% snps,], return_data = TRUE, 
+                                            on_ref_genome = TRUE,
+                                            #all just make MSS run faster
+                                            ref_genome = 'GRCh37',
+                                            strand_ambig_filter = FALSE,
+                                            bi_allelic_filter = FALSE,
+                                            allele_flip_check = FALSE,
+                                            dbSNP=144)
+    data.table::setkey(b_renamed_for,"SNP")
+    data.table::setkey(d_for,"SNP")
+    testthat::expect_equal(
+      all.equal(b_renamed_for[SNP %in% snps], d_for,
+                ignore.col.order = TRUE),
+      TRUE
+    )
+  }    
+  else{
+    testthat::expect_equal(is_32bit_windows, TRUE)
+    testthat::expect_equal(is_32bit_windows, TRUE)
+    testthat::expect_equal(is_32bit_windows, TRUE)
+  }
+})