From d16c5fb815ad6f80bfcaf01d2f1294d203808679 Mon Sep 17 00:00:00 2001 From: Al-Murphy Date: Fri, 13 Oct 2023 17:27:05 +0100 Subject: [PATCH] Bug Fix: Infer effect allele --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 41 ++++ R/format_sumstats.R | 11 ++ R/get_eff_frq_allele_combns.R | 77 ++++++++ R/get_genome_build.R | 187 ++++++++++++++---- R/infer_effect_column.R | 182 +++++++++++++++++ ...se_sumstats_column_headers_crossplatform.R | 6 +- R/sysdata.rda | Bin 2106 -> 2014 bytes data/sumstatsColHeaders.rda | Bin 2150 -> 2053 bytes man/get_eff_frq_allele_combns.Rd | 28 +++ man/get_genome_build.Rd | 11 +- man/infer_effect_column.Rd | 73 +++++++ tests/testthat/test-infer_effect_column.R | 95 +++++++++ 14 files changed, 670 insertions(+), 44 deletions(-) create mode 100644 R/get_eff_frq_allele_combns.R create mode 100644 R/infer_effect_column.R create mode 100644 man/get_eff_frq_allele_combns.Rd create mode 100644 man/infer_effect_column.Rd create mode 100644 tests/testthat/test-infer_effect_column.R diff --git a/DESCRIPTION b/DESCRIPTION index 2cba6af5..b9a36fba 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: MungeSumstats Type: Package Title: Standardise summary statistics from GWAS -Version: 1.9.17 +Version: 1.9.18 Authors@R: c(person(given = "Alan", family = "Murphy", diff --git a/NAMESPACE b/NAMESPACE index 4761e75b..03c8c9af 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ export(formatted_example) export(get_genome_builds) export(import_sumstats) export(index_tabular) +export(infer_effect_column) export(liftover) export(list_sumstats) export(load_snp_loc_data) diff --git a/NEWS.md b/NEWS.md index 5b55a28a..c3c10cfe 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,44 @@ +## CHANGES IN VERSION 1.9.18 + +### Bug fix +* Fixed column header mappings + * Made all uncorrected header names uppercase and removed duplicates + * "TOTALSAMPLESIZE" now maps to "N" instead of "NSTUDY" + * "MAJORALLELE", "MAJOR_ALLELE", "MAJOR-ALLELE", and "MAJOR ALLELE" now map to + "A1" instead of "A2" + * Removed the mappings for "OR-A1", "OR.A1", "OR_A1", and "BETA1" because MSS + assumes that A2 is the effect allele + * Removed mappings for "A1FREQ", "A1FRQ", "AF1", "FREQ.A1.1000G.EUR", + "FREQ.A1.ESP.EUR", "FREQ.ALLELE1.HAPMAPCEU", "FREQ1", "FREQ1.HAPMAP", and + "FRQ_A1" because MSS defines "FRQ" to be the allele frequency of A2 + * Removed mappings for "CHR36", "BASE_GRCH36", "POSITION36", "POSGRCH36", + "BASEGRCH36", "POS36", "POS GRCH36", "POS.GRCH36", "POS-GRCH36", and + "POS_GRCH36" + because MSS does not support the GRCh36 genome build + * Removed the ambiguous mapping "NMISS" -> "N" because "NMISS" can refer to + the number of samples with missing data + * Removed the ambiguous mapping "WEIGHT" -> "N" because "WEIGHT" can refer to + coefficient weights +* Fixed inference of allele where ambiguous (A1, A2) naming used (see + infer_effect_column.R for code) but in short: + * Three checks now made to infer which allele the effect/frequency information + relates to. See infer_effect_column.R for further details. + * See get_eff_frq_allele_combns.R for how effect/frequency columns that infer + the allele are captured in the mapping file + +### New features +* New column header mappings: + * "VARIANT_ID" and "RSIDS" --> "SNP" + * "P_BOLT_LMM" --> "P" + * "NCASES" --> "N_CAS" + * "N_EFFECTIVE", "N_INFORMATIVE", and "TOTAL_N" --> "N" + * "HET_P" --> "HETPVAL" + * "HET_ISQ" --> "HETISQT" + * "ALL_AF" --> "FRQ" + * "DIRECT" --> "DIRECTION" + * "ALT_EFFSIZE" --> "BETA" + * "INFORMATIVE_ALT_AC" --> "AC" + ## CHANGES IN VERSION 1.9.17 ### Bug fix diff --git a/R/format_sumstats.R b/R/format_sumstats.R index ce517e53..e60aee45 100644 --- a/R/format_sumstats.R +++ b/R/format_sumstats.R @@ -445,6 +445,17 @@ format_sumstats <- function(path, mapping_file <- rbind(mapping_file,es_cols) } + #### Check 2:Check for effect direction #### + sumstats_return <- + infer_effect_column( + sumstats_dt = sumstats_return$sumstats_dt, + mapping_file = mapping_file, + dbSNP = dbSNP, + nThread = nThread, + ref_genome = ref_genome, + on_ref_genome = on_ref_genome + ) + #### Check 3:Standardise headers for all OS #### sumstats_return <- standardise_sumstats_column_headers_crossplatform( diff --git a/R/get_eff_frq_allele_combns.R b/R/get_eff_frq_allele_combns.R new file mode 100644 index 00000000..e6e93751 --- /dev/null +++ b/R/get_eff_frq_allele_combns.R @@ -0,0 +1,77 @@ +#' Get combinations of uncorrected allele and effect (and frq) columns +#' +#' @inheritParams format_sumstats +#' @inheritParams compute_nsize +#' @param eff_frq_cols Corrected effect or frequency column names found in a +#' sumstats. Default of BETA, OR, LOG_ODDS, SIGNED_SUMSTAT, Z and FRQ. +#' @return datatable containing uncorrected and corrected combinations +#' @importFrom data.table setnames as.data.table := setkey rbindlist data.table +get_eff_frq_allele_combns <- + function(mapping_file = sumstatsColHeaders, + eff_frq_cols = c("BETA", "OR", "LOG_ODDS", "SIGNED_SUMSTAT","Z", + "FRQ")) { + ### Add this to avoid confusing BiocCheck + CORRECTED <- UNCORRECTED <- Var1 <- Var2 <- NULL + colnames(mapping_file) <- toupper(colnames(mapping_file)) + #get allele associated effect/FRQ columns + #get all combinations with allele columns + eff_frq_cols_uncorrc <- + mapping_file[mapping_file$CORRECTED %in% eff_frq_cols,]$UNCORRECTED + #join with all allele cols + allele_uncorrc <- + mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),]$UNCORRECTED + #get combinations + eff_frq_allele_dt <- + data.table::as.data.table(expand.grid(eff_frq_cols_uncorrc, + allele_uncorrc)) + mapping_file_dt <- data.table::as.data.table(mapping_file) + data.table::setkey(mapping_file_dt,"UNCORRECTED") + data.table::setkey(eff_frq_allele_dt,"Var1") + #add corrected + eff_frq_allele_dt[mapping_file,CORRECTED:=CORRECTED,] + #now loop through every joining character and join with eff both before + #and after + joining_char <- c("","_",".","-"," ") + all_combns <- vector(mode="list",length = length(joining_char)*2) + counter <- 1 + for(join_i in joining_char){ + eff_frq_allele_dt_i <- copy(eff_frq_allele_dt) + eff_frq_allele_dt_i[,UNCORRECTED:=paste0(Var1,join_i,Var2)] + all_combns[[counter]] <- + eff_frq_allele_dt_i[,c("UNCORRECTED","CORRECTED")] + counter <- counter+1 + #same for Var 2 in front + eff_frq_allele_dt_i <- copy(eff_frq_allele_dt) + eff_frq_allele_dt_i[,UNCORRECTED:=paste0(Var2,join_i,Var1)] + all_combns[[counter]] <- + eff_frq_allele_dt_i[,c("UNCORRECTED","CORRECTED")] + counter <- counter+1 + } + #join all together + eff_frq_allele_matches <- data.table::rbindlist(all_combns) + #finally add some custom ones + custom_adds <- data.table::data.table("UNCORRECTED" = + c("BETA1", "BETA2","AF1","AF2", + "FREQ.A1.1000G.EUR", + "FREQ.A2.1000G.EUR", + "FREQ.A1.ESP.EUR", + "FREQ.A2.ESP.EUR", + "FREQ.ALLELE1.HAPMAPCEU", + "FREQ.ALLELE2.HAPMAPCEU", + "FREQ1","FREQ2", + "FREQ1.HAPMAP","FREQ2.HAPMAP"), + "CORRECTED" = + c("BETA", "BETA","FRQ","FRQ", + "FRQ", + "FRQ", + "FRQ", + "FRQ", + "FRQ", + "FRQ", + "FRQ","FRQ", + "FRQ","FRQ")) + eff_frq_allele_matches <- data.table::rbindlist(list( + eff_frq_allele_matches,custom_adds)) + + return(eff_frq_allele_matches) + } diff --git a/R/get_genome_build.R b/R/get_genome_build.R index dbd5b12d..0209ced0 100644 --- a/R/get_genome_build.R +++ b/R/get_genome_build.R @@ -21,9 +21,12 @@ #' only read in the first N rows where N=\code{sampled_snps}. #' This should help speed up cases where you have to read in \code{sumstats} #' from disk each time. -#' +#' @param allele_match_ref Instead of returning the genome_build this will +#' return the propotion of matches to each genome build for each allele (A1,A2). +#' @inheritParams format_sumstats +#' #' @return ref_genome the genome build of the data -#' @importFrom data.table setDT +#' @importFrom data.table setDT := #' @keywords internal get_genome_build <- function(sumstats, nThread = 1, @@ -31,11 +34,14 @@ get_genome_build <- function(sumstats, standardise_headers = TRUE, mapping_file = sumstatsColHeaders, dbSNP=155, - header_only = FALSE) { + header_only = FALSE, + allele_match_ref = FALSE, + ref_genome = NULL) { ### Add this to avoid confusing BiocCheck - seqnames <- CHR <- SNP <- BP <- NULL - - message("Inferring genome build.") + seqnames <- CHR <- SNP <- BP <- alt_alleles <- NULL + if(isFALSE(allele_match_ref)){ + message("Inferring genome build.") + } # if not a data.table, must be a path if (!is.data.frame(sumstats)) { # Checking if the file exists should happen first @@ -82,7 +88,13 @@ get_genome_build <- function(sumstats, ) # Infer genome build using SNP & CHR & BP if (!all(c("SNP", "CHR", "BP") %in% colnames(sumstats))) { + #want it returned rather than throwing an error + if(isTRUE(allele_match_ref)){ + return(err_msg) + } + else{ stop(err_msg) + } } #### Do some filtering first to avoid errors #### @@ -113,7 +125,13 @@ get_genome_build <- function(sumstats, size_okay <- TRUE } if (!size_okay) { + #want it returned rather than throwing an error + if(isTRUE(allele_match_ref)){ + return(err_msg2) + } + else{ stop(err_msg2) + } } #### Downsample SNPs to save time #### if ((nrow(sumstats) > sampled_snps) && !(is.null(sampled_snps))) { @@ -124,58 +142,145 @@ get_genome_build <- function(sumstats, sumstats <- sumstats[SNP %in% snps, ] - # otherwise SNP, CHR, BP were all found and can infer - snp_loc_data_37 <- load_ref_genome_data( + #now split into functions two roles + if(isTRUE(allele_match_ref)){ + #1. checking for matches for A1/A2 to ref genomes + if (is.null(ref_genome)){ + #have to check multiple + snp_loc_data_37 <- load_ref_genome_data( + snps = snps, + ref_genome = "GRCH37", + dbSNP = dbSNP + ) + snp_loc_data_38 <- load_ref_genome_data( + snps = snps, + ref_genome = "GRCH38", + dbSNP = dbSNP + ) + # convert CHR filed in ref genomes to character not factor + snp_loc_data_37[, seqnames := as.character(seqnames)] + snp_loc_data_38[, seqnames := as.character(seqnames)] + # convert CHR filed in data to character if not already + sumstats[, CHR := as.character(CHR)] + # Now check which genome build has more matches to data + num_37 <- + nrow(snp_loc_data_37[sumstats, , + on = c("SNP"="SNP","pos"="BP","seqnames"="CHR"), + nomatch = FALSE + ]) + num_38 <- + nrow(snp_loc_data_38[sumstats, , + on = c("SNP"="SNP","pos"="BP","seqnames"="CHR"), + nomatch = FALSE + ]) + if (num_37 > num_38) { + ref_gen_num <- num_37 + ref_genome <- "GRCH37" + snp_loc_data <- snp_loc_data_37 + + } else { + ref_gen_num <- num_38 + ref_genome <- "GRCH38" + snp_loc_data <- snp_loc_data_38 + } + } + else{ + #only check one chosen + snp_loc_data <- load_ref_genome_data( + snps = snps, + ref_genome = ref_genome, + dbSNP = dbSNP + ) + # convert CHR filed in ref genomes to character not factor + snp_loc_data[, seqnames := as.character(seqnames)] + # convert CHR filed in data to character if not already + sumstats[, CHR := as.character(CHR)] + } + # Now check which allele has more matches to data + # want to match on ref and alt alleles too + # need to take first alt from list to do this + snp_loc_data[,alt_alleles := unlist(lapply(alt_alleles, + function(x) x[[1]]))] + num_a1 <- + nrow(snp_loc_data[sumstats, , + on = c("SNP"="SNP","pos"="BP","seqnames"="CHR", + "ref_allele"="A1","alt_alleles"="A2"), + nomatch = FALSE + ]) + num_a2 <- + nrow(snp_loc_data[sumstats, , + on = c("SNP"="SNP","pos"="BP","seqnames"="CHR", + "ref_allele"="A2","alt_alleles"="A1"), + nomatch = FALSE + ]) + if(num_a2>=num_a1){ + message("Effect/frq column(s) relate to A2 in the inputted sumstats") + #this is what MSS expects so no action required + switch_req <- FALSE + }else{#num_a2 num_38) { + #if no matches throw error + if(num_37==0 && num_38==0){ + msg_err <- + paste0("No matches found in either reference genome for your ", + "SNPs.\nPlease check their formatting (SNP, CHR and BP", + " columns) or supply the genome build.") + stop(msg_err) + } + if (num_37 > num_38) { ref_gen_num <- num_37 ref_genome <- "GRCH37" - } else { + } else { ref_gen_num <- num_38 ref_genome <- "GRCH38" - } - - message("Inferred genome build: ", ref_genome) - # add a warning if low proportion of matches found - msg <- paste0( + } + + message("Inferred genome build: ", ref_genome) + # add a warning if low proportion of matches found + msg <- paste0( "WARNING: Less than 10% of your sampled SNPs matched that of ", "either reference genome, this may question the quality of ", "your summary statistics file." - ) - if (ref_gen_num / length(snps) < 0.1) { + ) + if (ref_gen_num / length(snps) < 0.1) { message(msg) + } + + return(ref_genome) + } - - return(ref_genome) } diff --git a/R/infer_effect_column.R b/R/infer_effect_column.R new file mode 100644 index 00000000..f70fbc66 --- /dev/null +++ b/R/infer_effect_column.R @@ -0,0 +1,182 @@ +#' Infer if effect relates to a1 or A2 if ambiguously named +#' +#' Three checks are made to infer which allele the effect/frequency information +#' relates to if they are ambiguous (named A1 and A2 or equivalent): +#' 1. Check if ambiguous naming conventions are used (i.e. allele 1 and 2 or +#' equivalent). If not exit, otherwise continue to next checks. This can be +#' checked by using the mapping file and splitting A1/A2 mappings by those that +#' contain 1 or 2 (ambiguous) or doesn't contain 1 or 2 e.g. effect, +#' tested (unambiguous so fine for MSS to handle as is). +#' 2. Look for effect column/frequency column where the A1/A2 explicitly +#' mentioned, if found then we know the direction and should update A1/A2 +#' naming so A2 is the effect column. We can look for such columns by getting +#' every combination of A1/A2 naming and effect/frq naming. +#' 3. If note found in 2, a final check should be against the reference genome, +#' whichever of A1 and A2 has more of a match with the reference genome should +#' be taken as **not** the effect allele. There is an assumption in this but is +#' still better than guessing the ambiguous allele naming. +#' +#' @inheritParams format_sumstats +#' @inheritParams compute_nsize +#' @inheritParams get_genome_build +#' @return list containing sumstats_dt, the modified summary statistics data +#' table object +#' @export +#' @importFrom data.table setnames as.data.table := setkey rbindlist data.table +#' @examples +#' sumstats <- MungeSumstats::formatted_example() +#' #for speed, don't run on_ref_genome part of check (on_ref_genome = FALSE) +#' sumstats_dt2<-infer_effect_column(sumstats_dt=sumstats,on_ref_genome = FALSE) +infer_effect_column <- + function(sumstats_dt, + dbSNP=155, + sampled_snps = 10000, + mapping_file = sumstatsColHeaders, + nThread = nThread, + ref_genome = NULL, + on_ref_genome = TRUE, + return_list=TRUE) { + + message("Infer Effect Column") + message("First line of summary statistics file: ") + msg <- paste0(names(sumstats_dt), split = "\t") + message(msg) + #### first make all column headers upper case #### + column_headers <- names(sumstats_dt) + # load synonym mapping - internal data no loading + # Identify allele mappings which are ambiguous and problematic + # vs those that are interpretable + colnames(mapping_file) <- toupper(colnames(mapping_file)) + allele_mapping <- mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),] + ambig_allele_map <- allele_mapping[grepl('1',allele_mapping$UNCORRECTED)| + grepl('2',allele_mapping$UNCORRECTED),] + unambig_allele_map <- + allele_mapping[!(grepl('1',allele_mapping$UNCORRECTED)| + grepl('2',allele_mapping$UNCORRECTED)),] + #as long as the sumstats contains 1 unambiguous allele column MSS will work + #as expected + unambig_cols <- intersect(unambig_allele_map$UNCORRECTED, + toupper(column_headers)) + ambig_cols <- intersect(ambig_allele_map$UNCORRECTED, + toupper(column_headers)) + #if both ambiguous and unambiguous columns found, rename ambiguous ones so + #they aren't used later by MSS + #example: 'A1','A2','EFFECT_ALLELE' all present + if (length(unambig_cols)>0 && length(ambig_cols)>0){ + #find if unambig and ambig relate to the same allele + #get corrected name for unambig + unambig_corrcted <- + unique(allele_mapping[allele_mapping$UNCORRECTED %in% unambig_cols, + ]$CORRECTED) + #check if any ambig are to the same allele + ambig_corrcted <- + unique(allele_mapping[allele_mapping$UNCORRECTED %in% ambig_cols, + ]$CORRECTED) + #overlap? + ambig_corrcted_rnme <- + ambig_corrcted[ambig_corrcted %in% unambig_corrcted] + if (length(ambig_corrcted_rnme)>0){ + message("There are multiple columns relating to the same allele info") + message("Renaming ambiguous allele columns so they won't be used") + #get the related ambig naming and change there name so won't be used + ambig_uncorrcted_rnme <- + ambig_allele_map[ambig_allele_map$CORRECTED %in% ambig_corrcted_rnme, + ]$UNCORRECTED + #now rename any matches in sumstats + chng_nmes <- column_headers[toupper(column_headers) %in% + ambig_uncorrcted_rnme] + for(chng_i in chng_nmes){ + data.table::setnames(sumstats_dt, chng_i, paste0(chng_i,"_INPUTTED")) + } + } + } else if (length(unambig_cols)==0 && length(ambig_cols)>=2){ + #only continue if no unambiguous columns found but 2 ambig ones are found- + #less than 2 in total means allele info is missing which MSS can try fill + #in later + message("Allele columns are ambiguous, attempting to infer direction") + #get names for allele mared eff/frq columns + eff_frq_allele_matches <- get_eff_frq_allele_combns() + #now look for matches in sumstats + fnd_allele_indicator <- + column_headers[toupper(column_headers) %in% + eff_frq_allele_matches$UNCORRECTED] + if(length(fnd_allele_indicator)>0){ + message("Found direction from effect/frq column naming") + #fnd_allele_indicator could be >1 so majority vote + a1_mtch <- sum(grepl("A1",fnd_allele_indicator)) + a2_mtch <- sum(grepl("A2",fnd_allele_indicator)) + if(a2_mtch>=a1_mtch){ + message("Effect/frq column(s) relate to A2 in the inputted sumstats") + #this is what MSS expects so no action required + }else{#a2_mtch4Xe3A=nE+~G8l&|#nus(200Te({gaClumIV-!IGdj zkO;IwiYL?v5NS~)N7n6uMv@ue1R3BFZq;&13J&eQu}I_X7Tq3+6oI*q3WcfRMCDfk zUG&&c4pGFVJV+yj2qaLEjySV4DZy9+s#YWDi2*kXbi#Mj8vqcM<;_rmp?00a3rn52 zykK)F1pL~=5Yu9b6h|%E1CpCyCIv3;4HTfN0!7^S*xNJhs0-LYaJW_M$+rO&etfUO zIcG@5Th4YkkQs(G_%Kn1R3knG0o51URL>?xp;+fL6NdHbtX$1jt+Eh^6DU}-t%Eee zeRCL;YZx$B)k7U@(%%kDTOIQ(H4JAI0Z13Vb4WsDl7zPSVLVx9z?dm83f0@8;*REN zceRF}DPKr6Uj>Z@m-gs@nO2KfaLiGU?sjgdZ@P8W>KVJo;dX z2`gi%X5&QoJ%fcSL=@!dXmpqnx~hgw5Lo4lWs-MSH9AUloxakr0ExQhYfRln?~v4r0!5`D^+or&mv}-bUi(JAo}^^N^Mp;4V)<=g zZ{K^iMkJkr!_;8X9|WIRbdYNk5)qqjQu)s_D=M@t&v!pf*Vx=Jt1)XbATSMxTt6>G%?3=cxS;4c*=32ynmCqG_k5>`pexpyf#4m%S7B~-v`L{b!`2?>%Y6J{A% zW`!GJbrD%^K&m35pj8l{s2VlA`qquC*k6cpV9=NiB{z3=+imY%5xq$@05<84rK+!W z*Eu}8d3w4|thj8=3Nks#^5>uC(^sO!Wuv5nuE=XNOTtCOHkOPC;TcVlhn;4Y9_AnF$vq z=k`JwV(@|Bw*jMs#ryZW!@3Ae)Xoc;7!BOU%~H-T{yfgMAktJ>SpJ?UTq1K7_uoyB zgkTg*JcgZC1s%W$eX=WfYZ_=mMDr8(JelEr?9Unm5n(su4hEwX(rp?g9~9A>Ea}8= z-~?5A9`hRxwFAmo2v_$efsx)$5rR!5z?wCqgMd)@p#imtv^J$h3iKdMniE2H)JPN9 z_p7)-ud8V!*cUTAk7u7n(c_eio+s9oc4HtZGJ*;ey%Fzy==&|8B8UnD!mvAh z84bn;R7bewP(~=#o~GVOBUk6c9)0?1k~!vCJ^C>e{tTq5XGJu6M0(Is1y0L1y_!@P zFaoYH#9e8g4j^C>+jG8L=w}gs9ufnUY0k?u-OpAsVqAwEZev4H*E8M2bP=ChGiG;$ zk|`@=nT>ts;es9nNujhBO*H}&inw*8HbafCwcMg2A-k4DTuh=x7~GGJ#kcY1>37v-SDOJA zhta51px?IBg3{Wn762@XY_gJITu}x^QzF0wP)AyCJRWxY@W3tB$0+8^C`YhS1ZbL$ zS(4!rG8r^nEohO8j43j?o)sbhiOgNEHJ$=%_L+`z-T9vYBY(YDam&9S+oS+$#O4V(U@!>0LI*8a)n|vhEuNi zq6z2+ibgQThBPu$hD&`<6pAISTWB#Dfo|1*mEpz&aF-V;N*0d01X|Is30|T~04Z&J zfhQHZ;0H!*r1Ep?Rs~KCNyT3)?B-ziDr+Gk%`WWWkPQylbVq)+U=v7?gtbIXq-_VL zcaUs?iwjd!xTl4E2&|MLfGtukc#LROP1P6Z&lRw-%$&=GC=13Ja`E!=4emMa#R0eV wLbB$u-sc&)0JItRfnUIKO_DwR z;3w03#ousr&erbJi7#e)@^s^$2_!_?5tN^&qNk=5HYv2l27np?wLMJ$JrHT6N1+p9 zOp1D*Q`824000|MX`lcMjR^!()beV2jEAJq%}js+kN{`^0U=1Iri_uPnIQEuQ}qdu z01X>R0001rRXBf38eHir<7@JA|SsNuq@-_`b#fB`fpqFL_hP`MnPsNA2; zB9I%|a41=$0uLyx$Ju@WlFH1T9H&GqpZ7pk6^D*E#6>-ETwTaXa-9Q|fQX!ep9oJ6 zj(R`{O8V@WK=8FZg0#DX@~iI@(!wdxKQvZQHETrV-Pe??4oR zh4Y$9B1TH8vsH4dM+1?4q{%^M3svK+69XecB2zl_odJT3u~5YVEzc#zx)owN@Fvwh z$ET{m+~%?+nuNqgD+yOjS8d_d!s_JnEYrmA4Q&V#^dLZlz(@>ifau7S9w;a!93KLLd$6VxwXRf)#;HH0fHvtqIb{J>2Gfc_KEFg1vG1n4J^%FZOUrw0cuvYv^CCE>{F3oHzvA!;8__p|Hn8Xwyi^9KOn zVPZ%Vb;ty)GuDd2=PIoO4|v)l#1LdpDj6?om#m`TuzCxN;8ClZ6A%= zO}Jp>kwzq(l$(3jhLHF~ev!=wXqX7YXH=z09p1Vm3YDa$QI<=kp=;^8+`9QU3@Y<8 z1$$BqLsBjlNfP60!Z5_%GusxmO_Fr%?OWApt2;9XvbrJ~9pFHC5bY^QwW^a9 z8X<6OU@=LKF-V|}rKWqmSse?DDP+k)>F&eBptJ)-#y8|M~E%&M3CDE_Rxn`Kg zoe)DUu%yw=+K{!I1ivSz(VlYIv!kAm49;7nF%8+~RiRjPHMWF#q7b_7HMK6I>GmxG zY<(MfK`@*p(aGuLxLbCM^oO)1p(Z66Wqp~MtGwh_ zB21nGvc$Pv#q!OW#L(TZe&c?Ocq^ud9N%7ebv2l6R^(rYf+*NG_Uj)FbR62XInVJSB-T2~!CymGI|U5b zNTuVRw+}mRkU9fE+tMDhE#gJOx|{g#>Jp+RUD6FQ^D^w+61b|yOg1cK-T<178{dEk;GwKK!8mt zi&8i^NQa#y2DT>9*ri1a0SHW*6GC`YLKdjpuHgd5u-K3@xxa$Ic@}yZ-n6gPwTv#g zqR2;LR#|36W%IL$wIQM5G6?CI^7(-ktJoGIQeNdsrUf3B-Eu}4^KEr=+oGx@Z{%7Y z?ZT&zMHOY}uP)&nprTh;c&PU9l?B)URmMFjPr+}_(!@2~@26!9kYB%afZosi6pp2ix)Mtc7e?c(g`PO1p|j1NWw$w(;{=6Z`^7S=O(CzssJoS@x7WGxSMJL zf^XiF-y4}VVig*??c#!lNp@o`{8e42b?z`pG7y4ov$U-ZmuAn{kwG zl8tKOAwp0P({T=ynwhgk-lu%$Vzli@Tm$uZEK=pyyzyG!Pe8l`mEzs9V6(^?G))yq zNTLb8drp>DZFuM++88p{hDi^?%^(NOGFJ@~vII{>oELIZq$V{GLhf7+JbNwCg`XK_ z!7PP-gxU`Fxc2Ackd=|#up`hRz*$Zj;`td=kf}7~;(hu^WJ?o9wclzo)X`utT~;gz zCgfp~Wri-bkzp25hBp?M`0gR+s;o$MI4er217O3GQzn?fj;IcRzGT=z*dXqzfMnW) z6iAJ$%epkF+$+Us+R{O{8je(oBtpIyP!!4z;7;M8jSg|Q0>XXZs+U5x;9@fYQvGT? zFAea6bho)mP`GLW#u{}2&j=nY2DI?<7FWg?CHGEDY|XW}=sD@?Y~1m)KYIFkfYQPQ z(qx?nU>+L{vBfBZOh{o5I-+bR5lrzU*f40YwLCnl?l=DsxHdvE1KNw zq~$0yuosFi^!4wKPrn7g^gUoe!FkhcxqSevk2YBvXY634C4%aH1LH3;+NC diff --git a/data/sumstatsColHeaders.rda b/data/sumstatsColHeaders.rda index 29605026cedb4bcd9cd37e5940fb861f9d03af2e..83f04f9e6abfebb1e066fbd0ae9180c2a5be7251 100644 GIT binary patch literal 2053 zcmV+g2>SOQiwFP!000001MON{Z`?K(9?#8l$;EYn-R+xzye%NYV>`RfBS+$lwX2a7 zNfRV~A&kU*Y8${#(bs+Ne_kv}=^P%4a?!Lw(%p4|H0S%yjXZ}UsZq-K_1WU}VlWtt z2IGOIAqpgmV z;m@t*!Bs0vuc|07MT-uvLJH^GNILqVK)+qhw-V5(%8FK;C^3m!FfJ>UCuJoOEOxy7 zrzkVN6By7ikf^M{Qn_XO-C7_sM+Qc%Qj$nR%raF9MhCzgm87b`QCYNX#>Gg2mXbad z5NckOB2J<@hhZ*~SjKA={QQNTi95B?Gt7JVikAt`YH)6$Gv*`)$0t>?%|tA#{jREc z?XqDn8}zcwWyeIG){>xMCD60xSX@rzB#(BTO0jM6)1tQoi$bpRw3g{^2c5%-PVzG3rxr*G7=lF} z$CWfuDZ@M_q|971o=HHN88}faJD6EK&8m7Izk$+alERr$CIYvKMs10&nrBVsn~q|V z$;j5rfV#}n3In$USb$z;HkE5KQZ6`V8P#Pv{*E3A&mP zG@+|mip@qub@EDJEykJRYGTlCVh4ys6v=YLckX4y%B1YuPfCR+yRWcB;{x+ z=2M|h1%z67l~w6+MzAYoGRZc1x#NU9Yz7H)o~aE<>g7)`>)^111A-@XLmTUKQ2At6 z>}#__@F-GyMFbDfa;MHK-i!?fbt!7H|4bVQg>RL(c`X#-?tE&iQyc)JCXtisWlbV0 zh$r|dF_TuIL&UHYW(q`?M7b$n+S5YtjXgX5)U<%7sYu196fd14U}_j)(aT)|eK^!{ zEc}`VQndC2ODz*eS8Jm3*G^P~ZdCr-i)tNR?MOuhQH6xYXp6nbu(P@{a*sY%+@a5y zL_OBBR9iu<(=VE_t!zh2!Ha47H7@)Zp{T ztU$S1ovogq+c=s#Tb(_>aLvxn&tIIZT(k4D^XGqq4YbJf)MTn@wKkfth&d>vD6N^( zmykA)Q4rGj%$`F!8%1eKIu$3S-th5GF;WDazfBLi7T>fB33<7GjNF#d<=NkT>H>q=o6@mS7d$CtCzjDLtzmtdKx=}& z{LIfE`c&Xk%o!zdyzm`?I$Rf0w+WyckX&Qfxq?m}B*1^6L*B!H(*kDM!+_UBF{n1c z9U;Q$x*+2(4P2m@~xZtXO~JR9cOgq}_4*+|dEU2{vQn8Y_F4NZ%$r@;5>wr4f%gj~GON@ubfL6xOx~OiV52{5_8bh~ zKzLN`RbBh`zFM|dbxDE)MJ~Y2TP(3TRF^O6qIYrSMqGv*wy3{2$7FM`MEM@ya;QFR z^x=K_pwS0Ze`Jo3UyjQfJ?r}Q{i2r|EwMCDW=C`^^v(8F(=OB=1vgyk#5Ejmz^8|;A4Lg^k~?< zn($st;Et=Q4eKMrKGJnxnCo7bVIC8Dj2mZ321pvjyp{zX%o)AgiiwJkZqUz_?Pq5Wp;d()Rp3w0O=bD;9zN(P2kYU&9xm|87W6&2TEY4r zeys@mHh?(}k5ry{y$C#}GfQ3Qo<1(k=pE`QW^`$c;EqqF@&?@9Z%=T^gVe6k6Ss-7 zgM$tZJD8o=rD8>8bEZwP%3~vUk2`zJ0Af)F(e-WJ_`Q&PXWIYxWD`VE`&eV|5ikQ z*MATn#E%iZU!Ok(L4I)lBZ&5gZrUn#8~q(7AO*H13F z|5SJK!}a$!cLVc_$Y9u#%l!{;-{0Qey!~+VuBGM?5^jL`?fah?1bM83{A-~9JNUVq j>FsydcaRCl#k=bd*X-L{<*(4c^`HL$q8LtRXegFHYo1d=UZy|-YF{sj;hUA6ulVQ`%YQljT0%Lk5n}jm8p%s0pptyn5 zQ3IiDtRnVDvelBt+ZWbm_>wkK}aI!__*?kkojEG=vJ zs7U51pNe9$DoW+spS?aP~B=<{!P`iKuz1>2LlBH9YPbtrH=oqk~ktL06gxOtP zZIG~E@Ch%OMPfRSYjh}4vE+O`-Q=uHvb5$8Okf|?+vn@$U%Q(=Y%48hD$F@Z^hnSpV_Sa_IP+)s;gGks&mGZh#Pv@+qa zO;l=2JT9}6Eep0X!#=_`DYFpr%bfgP( zMZ{&kM5&m~D`hAsCEJjFZOH8iXtkODG`5MDwc)Qg>?1o6>2ExlFARMsTc*aP$-&^P638qkg{5ry=8{j* zq&wkc_w}@!=rvUPs2PZCQpmZ0Jwm$+R3J4)LyCk{gs4$aYXiCW2vyVwQx6jlM+o=u z-qlUgnJK5EWVPOudNW{gEVqT&EQ`IBJcU>)kn!$_r~uul>~#>;I=b49jA}#~(hR<`*zgo)PmPS& zfR7bd;L}A>kCZIrA}-}sp=y-bqR2Qc-Cf$ zu!uT{V6-f$Qy&Bu62*Yv_|zV9D%*W%I8AbOKElnUBMCE~RRQ=GKh^~eyqLUku1=kJ z0Us$+%vM>E!FNngGiI}SYE6~m)Qs9-F(1E-Z7uenVz*9VYgmfvD+#9GCNxx6QAU>L zaF*AuCWZEd`^dypPj|A;;DgGF11TnGRQ4wF>LliQyf}RkQqM!`9OBy>*pXJRS&|Dc zyw*^qp-q}dX`-ek(XdG*q@s|bPAie)dFW`U-N{<0nKQF*8U+2{m5i7`qM?LNW+2Xe z!bBLUtd5CgUIfhZpd~K?#>DAK2Gkljn&(a*LD!$GQI9pk!$AWGwh|IoOR%;YsdLm^ zcakg<+Td=}z>5Z6H1HyU7YV#b;3Wbt5qNnE!c2pJ1&0>FGK*RBBJeal`MI9Mo+(cR zyHJPK`Cy5@Dk%=vGbvK5RFJq(SlC6sBkY(nYq#1DA$$eRK4QQBloJ&M?_{G@f!A!`l>ZUV@33I)u#gkj4uipJ!;cgPO*hq%zItJ%&0zv6-uMsdCY?YMIMVm^ z1(7;&4$AHA-S!>1y1w3CZ(UM&WW%kyCW%9K7-ZjHvA?K0DOOh;Y2YGAH{f#oA|Rz>t7(*7OjX zhen+TbgrIJdW%f@j^D*kBzh@sqVIR9r$L`Dv4@Vv* z9;U~T1Bue=o~OS5NI)-o4`LvG^tJ83#G?1NuOz|a*jEwkYu2|Uxld4E200H+Uml=` zr9TY7v(o>f(DW+vIT%08+$Vc)FrN$23(Uvir@zB|B5)sFJ`2HH$?rw@QRP8=5Wg`o z`04(45a_4+Z%nj5z&}V2(jOt=A9KH!+(th=el6+bLHa+I#t+grApQA^OYWa~_tYOs z-?sEmJ*{?4?rna0`{Cwh`}S`8uBL`D5N?3s&4+(c6XZYz`MIV3+xf-M^!EDd)?_l| c=-t)b75)2-`fKf0>sR&fH?%avmZ&ZO00P)P6951J diff --git a/man/get_eff_frq_allele_combns.Rd b/man/get_eff_frq_allele_combns.Rd new file mode 100644 index 00000000..2d6f228c --- /dev/null +++ b/man/get_eff_frq_allele_combns.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_eff_frq_allele_combns.R +\name{get_eff_frq_allele_combns} +\alias{get_eff_frq_allele_combns} +\title{Get combinations of uncorrected allele and effect (and frq) columns} +\usage{ +get_eff_frq_allele_combns( + mapping_file = sumstatsColHeaders, + eff_frq_cols = c("BETA", "OR", "LOG_ODDS", "SIGNED_SUMSTAT", "Z", "FRQ") +) +} +\arguments{ +\item{mapping_file}{MungeSumstats has a pre-defined column-name mapping file +which should cover the most common column headers and their interpretations. +However, if a column header that is in youf file is missing of the mapping we +give is incorrect you can supply your own mapping file. Must be a 2 column +dataframe with column names "Uncorrected" and "Corrected". See +data(sumstatsColHeaders) for default mapping and necessary format.} + +\item{eff_frq_cols}{Corrected effect or frequency column names found in a +sumstats. Default of BETA, OR, LOG_ODDS, SIGNED_SUMSTAT, Z and FRQ.} +} +\value{ +datatable containing uncorrected and corrected combinations +} +\description{ +Get combinations of uncorrected allele and effect (and frq) columns +} diff --git a/man/get_genome_build.Rd b/man/get_genome_build.Rd index 6ee12d1b..9ffce59c 100644 --- a/man/get_genome_build.Rd +++ b/man/get_genome_build.Rd @@ -12,7 +12,9 @@ get_genome_build( standardise_headers = TRUE, mapping_file = sumstatsColHeaders, dbSNP = 155, - header_only = FALSE + header_only = FALSE, + allele_match_ref = FALSE, + ref_genome = NULL ) } \arguments{ @@ -41,6 +43,13 @@ dataframe with column names "Uncorrected" and "Corrected". See only read in the first N rows where N=\code{sampled_snps}. This should help speed up cases where you have to read in \code{sumstats} from disk each time.} + +\item{allele_match_ref}{Instead of returning the genome_build this will +return the propotion of matches to each genome build for each allele (A1,A2).} + +\item{ref_genome}{name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.} } \value{ ref_genome the genome build of the data diff --git a/man/infer_effect_column.Rd b/man/infer_effect_column.Rd new file mode 100644 index 00000000..cb6a08d7 --- /dev/null +++ b/man/infer_effect_column.Rd @@ -0,0 +1,73 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/infer_effect_column.R +\name{infer_effect_column} +\alias{infer_effect_column} +\title{Infer if effect relates to a1 or A2 if ambiguously named} +\usage{ +infer_effect_column( + sumstats_dt, + dbSNP = 155, + sampled_snps = 10000, + mapping_file = sumstatsColHeaders, + nThread = nThread, + ref_genome = NULL, + on_ref_genome = TRUE, + return_list = TRUE +) +} +\arguments{ +\item{sumstats_dt}{data table obj of the summary statistics file for the +GWAS.} + +\item{dbSNP}{version of dbSNP to be used for imputation (144 or 155).} + +\item{sampled_snps}{Downsample the number of SNPs used when inferring genome +build to save time.} + +\item{mapping_file}{MungeSumstats has a pre-defined column-name mapping file +which should cover the most common column headers and their interpretations. +However, if a column header that is in youf file is missing of the mapping we +give is incorrect you can supply your own mapping file. Must be a 2 column +dataframe with column names "Uncorrected" and "Corrected". See +data(sumstatsColHeaders) for default mapping and necessary format.} + +\item{nThread}{Number of threads to use for parallel processes.} + +\item{ref_genome}{name of the reference genome used for the GWAS ("GRCh37" or +"GRCh38"). Argument is case-insensitive. Default is NULL which infers the +reference genome from the data.} + +\item{on_ref_genome}{Binary Should a check take place that all SNPs are on +the reference genome by SNP ID. Default is TRUE.} + +\item{return_list}{Return the \code{sumstats_dt} within a named list +(default: \code{TRUE}).} +} +\value{ +list containing sumstats_dt, the modified summary statistics data +table object +} +\description{ +Three checks are made to infer which allele the effect/frequency information +relates to if they are ambiguous (named A1 and A2 or equivalent): +\enumerate{ +\item Check if ambiguous naming conventions are used (i.e. allele 1 and 2 or +equivalent). If not exit, otherwise continue to next checks. This can be +checked by using the mapping file and splitting A1/A2 mappings by those that +contain 1 or 2 (ambiguous) or doesn't contain 1 or 2 e.g. effect, +tested (unambiguous so fine for MSS to handle as is). +\item Look for effect column/frequency column where the A1/A2 explicitly +mentioned, if found then we know the direction and should update A1/A2 +naming so A2 is the effect column. We can look for such columns by getting +every combination of A1/A2 naming and effect/frq naming. +\item If note found in 2, a final check should be against the reference genome, +whichever of A1 and A2 has more of a match with the reference genome should +be taken as \strong{not} the effect allele. There is an assumption in this but is +still better than guessing the ambiguous allele naming. +} +} +\examples{ +sumstats <- MungeSumstats::formatted_example() +#for speed, don't run on_ref_genome part of check (on_ref_genome = FALSE) +sumstats_dt2<-infer_effect_column(sumstats_dt=sumstats,on_ref_genome = FALSE) +} diff --git a/tests/testthat/test-infer_effect_column.R b/tests/testthat/test-infer_effect_column.R new file mode 100644 index 00000000..2656f15c --- /dev/null +++ b/tests/testthat/test-infer_effect_column.R @@ -0,0 +1,95 @@ +test_that("Test infer effect column function works", { + ## Call uses reference genome as default with more than 2GB of memory, + ## which is more than what 32-bit Windows can handle so remove tests + is_32bit_windows <- + .Platform$OS.type == "windows" && .Platform$r_arch == "i386" + if (!is_32bit_windows) { + a <- MungeSumstats::formatted_example() + b <- copy(a) + + #check if inferring effect direction is working + + #make A1 the effect col not A2 + #change FRQ and BETA to reflect this change + data.table::setnames(b,"A2","A3") + data.table::setnames(b,"A1","A2") + data.table::setnames(b,"A3","A1") + c <- copy(b) + data.table::setnames(b,"BETA","BETA1") + data.table::setnames(c,"FRQ","A1FRQ") + + # if the allele columns are named unambiguously, formatting will be performed + # correctly - use this to test against as eff will be flipped compared to org + b_renamed <- copy(b) + data.table::setnames(b_renamed, "A1", "effect_allele") + data.table::setnames(b_renamed, "A2", "non_effect_allele") + b_renamed_for <- MungeSumstats::format_sumstats(b_renamed, return_data = TRUE, + #all just make MSS run faster + ref_genome = 'GRCh37', + on_ref_genome = FALSE, + strand_ambig_filter = FALSE, + bi_allelic_filter = FALSE, + allele_flip_check = FALSE, + dbSNP=144) + + # what if BETA gives the direction + b_for <- MungeSumstats::format_sumstats(b, return_data = TRUE, + #all just make MSS run faster + ref_genome = 'GRCh37', + on_ref_genome = FALSE, + strand_ambig_filter = FALSE, + bi_allelic_filter = FALSE, + allele_flip_check = FALSE, + dbSNP=144) + + # what if FRQ gives the direction + c_for <- MungeSumstats::format_sumstats(b, return_data = TRUE, + #all just make MSS run faster + ref_genome = 'GRCh37', + on_ref_genome = FALSE, + strand_ambig_filter = FALSE, + bi_allelic_filter = FALSE, + allele_flip_check = FALSE, + dbSNP=144) + + testthat::expect_equal( + all.equal(b_renamed_for, b_for,ignore.col.order = TRUE), + TRUE + ) + + testthat::expect_equal( + all.equal(b_renamed_for, c_for,ignore.col.order = TRUE), + TRUE + ) + + + #finally check if the ref genome can be used to infer rather than being told in + #eff/frq cols - just subset for speed + snps <- c("rs11210860","rs34305371","rs1008078","rs11588857","rs1777827", + "rs76076331","rs2457660","rs10496091","rs4851251","rs12987662", + "rs10930008","rs301800","rs2568955","rs61787263","rs2992632", + "rs11689269","rs11690172") + d <- copy(b) + data.table::setnames(d,"BETA1","BETA") + d_for <- MungeSumstats::format_sumstats(d[SNP %in% snps,], return_data = TRUE, + on_ref_genome = TRUE, + #all just make MSS run faster + ref_genome = 'GRCh37', + strand_ambig_filter = FALSE, + bi_allelic_filter = FALSE, + allele_flip_check = FALSE, + dbSNP=144) + data.table::setkey(b_renamed_for,"SNP") + data.table::setkey(d_for,"SNP") + testthat::expect_equal( + all.equal(b_renamed_for[SNP %in% snps], d_for, + ignore.col.order = TRUE), + TRUE + ) + } + else{ + testthat::expect_equal(is_32bit_windows, TRUE) + testthat::expect_equal(is_32bit_windows, TRUE) + testthat::expect_equal(is_32bit_windows, TRUE) + } +})