diff --git a/DESCRIPTION b/DESCRIPTION index 2cba6af..b9a36fb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: MungeSumstats Type: Package Title: Standardise summary statistics from GWAS -Version: 1.9.17 +Version: 1.9.18 Authors@R: c(person(given = "Alan", family = "Murphy", diff --git a/NAMESPACE b/NAMESPACE index 4761e75..03c8c9a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ export(formatted_example) export(get_genome_builds) export(import_sumstats) export(index_tabular) +export(infer_effect_column) export(liftover) export(list_sumstats) export(load_snp_loc_data) diff --git a/NEWS.md b/NEWS.md index 5b55a28..c3c10cf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,44 @@ +## CHANGES IN VERSION 1.9.18 + +### Bug fix +* Fixed column header mappings + * Made all uncorrected header names uppercase and removed duplicates + * "TOTALSAMPLESIZE" now maps to "N" instead of "NSTUDY" + * "MAJORALLELE", "MAJOR_ALLELE", "MAJOR-ALLELE", and "MAJOR ALLELE" now map to + "A1" instead of "A2" + * Removed the mappings for "OR-A1", "OR.A1", "OR_A1", and "BETA1" because MSS + assumes that A2 is the effect allele + * Removed mappings for "A1FREQ", "A1FRQ", "AF1", "FREQ.A1.1000G.EUR", + "FREQ.A1.ESP.EUR", "FREQ.ALLELE1.HAPMAPCEU", "FREQ1", "FREQ1.HAPMAP", and + "FRQ_A1" because MSS defines "FRQ" to be the allele frequency of A2 + * Removed mappings for "CHR36", "BASE_GRCH36", "POSITION36", "POSGRCH36", + "BASEGRCH36", "POS36", "POS GRCH36", "POS.GRCH36", "POS-GRCH36", and + "POS_GRCH36" + because MSS does not support the GRCh36 genome build + * Removed the ambiguous mapping "NMISS" -> "N" because "NMISS" can refer to + the number of samples with missing data + * Removed the ambiguous mapping "WEIGHT" -> "N" because "WEIGHT" can refer to + coefficient weights +* Fixed inference of allele where ambiguous (A1, A2) naming used (see + infer_effect_column.R for code) but in short: + * Three checks now made to infer which allele the effect/frequency information + relates to. See infer_effect_column.R for further details. + * See get_eff_frq_allele_combns.R for how effect/frequency columns that infer + the allele are captured in the mapping file + +### New features +* New column header mappings: + * "VARIANT_ID" and "RSIDS" --> "SNP" + * "P_BOLT_LMM" --> "P" + * "NCASES" --> "N_CAS" + * "N_EFFECTIVE", "N_INFORMATIVE", and "TOTAL_N" --> "N" + * "HET_P" --> "HETPVAL" + * "HET_ISQ" --> "HETISQT" + * "ALL_AF" --> "FRQ" + * "DIRECT" --> "DIRECTION" + * "ALT_EFFSIZE" --> "BETA" + * "INFORMATIVE_ALT_AC" --> "AC" + ## CHANGES IN VERSION 1.9.17 ### Bug fix diff --git a/R/format_sumstats.R b/R/format_sumstats.R index ce517e5..e60aee4 100644 --- a/R/format_sumstats.R +++ b/R/format_sumstats.R @@ -445,6 +445,17 @@ format_sumstats <- function(path, mapping_file <- rbind(mapping_file,es_cols) } + #### Check 2:Check for effect direction #### + sumstats_return <- + infer_effect_column( + sumstats_dt = sumstats_return$sumstats_dt, + mapping_file = mapping_file, + dbSNP = dbSNP, + nThread = nThread, + ref_genome = ref_genome, + on_ref_genome = on_ref_genome + ) + #### Check 3:Standardise headers for all OS #### sumstats_return <- standardise_sumstats_column_headers_crossplatform( diff --git a/R/get_eff_frq_allele_combns.R b/R/get_eff_frq_allele_combns.R new file mode 100644 index 0000000..e6e9375 --- /dev/null +++ b/R/get_eff_frq_allele_combns.R @@ -0,0 +1,77 @@ +#' Get combinations of uncorrected allele and effect (and frq) columns +#' +#' @inheritParams format_sumstats +#' @inheritParams compute_nsize +#' @param eff_frq_cols Corrected effect or frequency column names found in a +#' sumstats. Default of BETA, OR, LOG_ODDS, SIGNED_SUMSTAT, Z and FRQ. +#' @return datatable containing uncorrected and corrected combinations +#' @importFrom data.table setnames as.data.table := setkey rbindlist data.table +get_eff_frq_allele_combns <- + function(mapping_file = sumstatsColHeaders, + eff_frq_cols = c("BETA", "OR", "LOG_ODDS", "SIGNED_SUMSTAT","Z", + "FRQ")) { + ### Add this to avoid confusing BiocCheck + CORRECTED <- UNCORRECTED <- Var1 <- Var2 <- NULL + colnames(mapping_file) <- toupper(colnames(mapping_file)) + #get allele associated effect/FRQ columns + #get all combinations with allele columns + eff_frq_cols_uncorrc <- + mapping_file[mapping_file$CORRECTED %in% eff_frq_cols,]$UNCORRECTED + #join with all allele cols + allele_uncorrc <- + mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),]$UNCORRECTED + #get combinations + eff_frq_allele_dt <- + data.table::as.data.table(expand.grid(eff_frq_cols_uncorrc, + allele_uncorrc)) + mapping_file_dt <- data.table::as.data.table(mapping_file) + data.table::setkey(mapping_file_dt,"UNCORRECTED") + data.table::setkey(eff_frq_allele_dt,"Var1") + #add corrected + eff_frq_allele_dt[mapping_file,CORRECTED:=CORRECTED,] + #now loop through every joining character and join with eff both before + #and after + joining_char <- c("","_",".","-"," ") + all_combns <- vector(mode="list",length = length(joining_char)*2) + counter <- 1 + for(join_i in joining_char){ + eff_frq_allele_dt_i <- copy(eff_frq_allele_dt) + eff_frq_allele_dt_i[,UNCORRECTED:=paste0(Var1,join_i,Var2)] + all_combns[[counter]] <- + eff_frq_allele_dt_i[,c("UNCORRECTED","CORRECTED")] + counter <- counter+1 + #same for Var 2 in front + eff_frq_allele_dt_i <- copy(eff_frq_allele_dt) + eff_frq_allele_dt_i[,UNCORRECTED:=paste0(Var2,join_i,Var1)] + all_combns[[counter]] <- + eff_frq_allele_dt_i[,c("UNCORRECTED","CORRECTED")] + counter <- counter+1 + } + #join all together + eff_frq_allele_matches <- data.table::rbindlist(all_combns) + #finally add some custom ones + custom_adds <- data.table::data.table("UNCORRECTED" = + c("BETA1", "BETA2","AF1","AF2", + "FREQ.A1.1000G.EUR", + "FREQ.A2.1000G.EUR", + "FREQ.A1.ESP.EUR", + "FREQ.A2.ESP.EUR", + "FREQ.ALLELE1.HAPMAPCEU", + "FREQ.ALLELE2.HAPMAPCEU", + "FREQ1","FREQ2", + "FREQ1.HAPMAP","FREQ2.HAPMAP"), + "CORRECTED" = + c("BETA", "BETA","FRQ","FRQ", + "FRQ", + "FRQ", + "FRQ", + "FRQ", + "FRQ", + "FRQ", + "FRQ","FRQ", + "FRQ","FRQ")) + eff_frq_allele_matches <- data.table::rbindlist(list( + eff_frq_allele_matches,custom_adds)) + + return(eff_frq_allele_matches) + } diff --git a/R/get_genome_build.R b/R/get_genome_build.R index dbd5b12..0209ced 100644 --- a/R/get_genome_build.R +++ b/R/get_genome_build.R @@ -21,9 +21,12 @@ #' only read in the first N rows where N=\code{sampled_snps}. #' This should help speed up cases where you have to read in \code{sumstats} #' from disk each time. -#' +#' @param allele_match_ref Instead of returning the genome_build this will +#' return the propotion of matches to each genome build for each allele (A1,A2). +#' @inheritParams format_sumstats +#' #' @return ref_genome the genome build of the data -#' @importFrom data.table setDT +#' @importFrom data.table setDT := #' @keywords internal get_genome_build <- function(sumstats, nThread = 1, @@ -31,11 +34,14 @@ get_genome_build <- function(sumstats, standardise_headers = TRUE, mapping_file = sumstatsColHeaders, dbSNP=155, - header_only = FALSE) { + header_only = FALSE, + allele_match_ref = FALSE, + ref_genome = NULL) { ### Add this to avoid confusing BiocCheck - seqnames <- CHR <- SNP <- BP <- NULL - - message("Inferring genome build.") + seqnames <- CHR <- SNP <- BP <- alt_alleles <- NULL + if(isFALSE(allele_match_ref)){ + message("Inferring genome build.") + } # if not a data.table, must be a path if (!is.data.frame(sumstats)) { # Checking if the file exists should happen first @@ -82,7 +88,13 @@ get_genome_build <- function(sumstats, ) # Infer genome build using SNP & CHR & BP if (!all(c("SNP", "CHR", "BP") %in% colnames(sumstats))) { + #want it returned rather than throwing an error + if(isTRUE(allele_match_ref)){ + return(err_msg) + } + else{ stop(err_msg) + } } #### Do some filtering first to avoid errors #### @@ -113,7 +125,13 @@ get_genome_build <- function(sumstats, size_okay <- TRUE } if (!size_okay) { + #want it returned rather than throwing an error + if(isTRUE(allele_match_ref)){ + return(err_msg2) + } + else{ stop(err_msg2) + } } #### Downsample SNPs to save time #### if ((nrow(sumstats) > sampled_snps) && !(is.null(sampled_snps))) { @@ -124,58 +142,145 @@ get_genome_build <- function(sumstats, sumstats <- sumstats[SNP %in% snps, ] - # otherwise SNP, CHR, BP were all found and can infer - snp_loc_data_37 <- load_ref_genome_data( + #now split into functions two roles + if(isTRUE(allele_match_ref)){ + #1. checking for matches for A1/A2 to ref genomes + if (is.null(ref_genome)){ + #have to check multiple + snp_loc_data_37 <- load_ref_genome_data( + snps = snps, + ref_genome = "GRCH37", + dbSNP = dbSNP + ) + snp_loc_data_38 <- load_ref_genome_data( + snps = snps, + ref_genome = "GRCH38", + dbSNP = dbSNP + ) + # convert CHR filed in ref genomes to character not factor + snp_loc_data_37[, seqnames := as.character(seqnames)] + snp_loc_data_38[, seqnames := as.character(seqnames)] + # convert CHR filed in data to character if not already + sumstats[, CHR := as.character(CHR)] + # Now check which genome build has more matches to data + num_37 <- + nrow(snp_loc_data_37[sumstats, , + on = c("SNP"="SNP","pos"="BP","seqnames"="CHR"), + nomatch = FALSE + ]) + num_38 <- + nrow(snp_loc_data_38[sumstats, , + on = c("SNP"="SNP","pos"="BP","seqnames"="CHR"), + nomatch = FALSE + ]) + if (num_37 > num_38) { + ref_gen_num <- num_37 + ref_genome <- "GRCH37" + snp_loc_data <- snp_loc_data_37 + + } else { + ref_gen_num <- num_38 + ref_genome <- "GRCH38" + snp_loc_data <- snp_loc_data_38 + } + } + else{ + #only check one chosen + snp_loc_data <- load_ref_genome_data( + snps = snps, + ref_genome = ref_genome, + dbSNP = dbSNP + ) + # convert CHR filed in ref genomes to character not factor + snp_loc_data[, seqnames := as.character(seqnames)] + # convert CHR filed in data to character if not already + sumstats[, CHR := as.character(CHR)] + } + # Now check which allele has more matches to data + # want to match on ref and alt alleles too + # need to take first alt from list to do this + snp_loc_data[,alt_alleles := unlist(lapply(alt_alleles, + function(x) x[[1]]))] + num_a1 <- + nrow(snp_loc_data[sumstats, , + on = c("SNP"="SNP","pos"="BP","seqnames"="CHR", + "ref_allele"="A1","alt_alleles"="A2"), + nomatch = FALSE + ]) + num_a2 <- + nrow(snp_loc_data[sumstats, , + on = c("SNP"="SNP","pos"="BP","seqnames"="CHR", + "ref_allele"="A2","alt_alleles"="A1"), + nomatch = FALSE + ]) + if(num_a2>=num_a1){ + message("Effect/frq column(s) relate to A2 in the inputted sumstats") + #this is what MSS expects so no action required + switch_req <- FALSE + }else{#num_a2 num_38) { + #if no matches throw error + if(num_37==0 && num_38==0){ + msg_err <- + paste0("No matches found in either reference genome for your ", + "SNPs.\nPlease check their formatting (SNP, CHR and BP", + " columns) or supply the genome build.") + stop(msg_err) + } + if (num_37 > num_38) { ref_gen_num <- num_37 ref_genome <- "GRCH37" - } else { + } else { ref_gen_num <- num_38 ref_genome <- "GRCH38" - } - - message("Inferred genome build: ", ref_genome) - # add a warning if low proportion of matches found - msg <- paste0( + } + + message("Inferred genome build: ", ref_genome) + # add a warning if low proportion of matches found + msg <- paste0( "WARNING: Less than 10% of your sampled SNPs matched that of ", "either reference genome, this may question the quality of ", "your summary statistics file." - ) - if (ref_gen_num / length(snps) < 0.1) { + ) + if (ref_gen_num / length(snps) < 0.1) { message(msg) + } + + return(ref_genome) + } - - return(ref_genome) } diff --git a/R/infer_effect_column.R b/R/infer_effect_column.R new file mode 100644 index 0000000..f70fbc6 --- /dev/null +++ b/R/infer_effect_column.R @@ -0,0 +1,182 @@ +#' Infer if effect relates to a1 or A2 if ambiguously named +#' +#' Three checks are made to infer which allele the effect/frequency information +#' relates to if they are ambiguous (named A1 and A2 or equivalent): +#' 1. Check if ambiguous naming conventions are used (i.e. allele 1 and 2 or +#' equivalent). If not exit, otherwise continue to next checks. This can be +#' checked by using the mapping file and splitting A1/A2 mappings by those that +#' contain 1 or 2 (ambiguous) or doesn't contain 1 or 2 e.g. effect, +#' tested (unambiguous so fine for MSS to handle as is). +#' 2. Look for effect column/frequency column where the A1/A2 explicitly +#' mentioned, if found then we know the direction and should update A1/A2 +#' naming so A2 is the effect column. We can look for such columns by getting +#' every combination of A1/A2 naming and effect/frq naming. +#' 3. If note found in 2, a final check should be against the reference genome, +#' whichever of A1 and A2 has more of a match with the reference genome should +#' be taken as **not** the effect allele. There is an assumption in this but is +#' still better than guessing the ambiguous allele naming. +#' +#' @inheritParams format_sumstats +#' @inheritParams compute_nsize +#' @inheritParams get_genome_build +#' @return list containing sumstats_dt, the modified summary statistics data +#' table object +#' @export +#' @importFrom data.table setnames as.data.table := setkey rbindlist data.table +#' @examples +#' sumstats <- MungeSumstats::formatted_example() +#' #for speed, don't run on_ref_genome part of check (on_ref_genome = FALSE) +#' sumstats_dt2<-infer_effect_column(sumstats_dt=sumstats,on_ref_genome = FALSE) +infer_effect_column <- + function(sumstats_dt, + dbSNP=155, + sampled_snps = 10000, + mapping_file = sumstatsColHeaders, + nThread = nThread, + ref_genome = NULL, + on_ref_genome = TRUE, + return_list=TRUE) { + + message("Infer Effect Column") + message("First line of summary statistics file: ") + msg <- paste0(names(sumstats_dt), split = "\t") + message(msg) + #### first make all column headers upper case #### + column_headers <- names(sumstats_dt) + # load synonym mapping - internal data no loading + # Identify allele mappings which are ambiguous and problematic + # vs those that are interpretable + colnames(mapping_file) <- toupper(colnames(mapping_file)) + allele_mapping <- mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),] + ambig_allele_map <- allele_mapping[grepl('1',allele_mapping$UNCORRECTED)| + grepl('2',allele_mapping$UNCORRECTED),] + unambig_allele_map <- + allele_mapping[!(grepl('1',allele_mapping$UNCORRECTED)| + grepl('2',allele_mapping$UNCORRECTED)),] + #as long as the sumstats contains 1 unambiguous allele column MSS will work + #as expected + unambig_cols <- intersect(unambig_allele_map$UNCORRECTED, + toupper(column_headers)) + ambig_cols <- intersect(ambig_allele_map$UNCORRECTED, + toupper(column_headers)) + #if both ambiguous and unambiguous columns found, rename ambiguous ones so + #they aren't used later by MSS + #example: 'A1','A2','EFFECT_ALLELE' all present + if (length(unambig_cols)>0 && length(ambig_cols)>0){ + #find if unambig and ambig relate to the same allele + #get corrected name for unambig + unambig_corrcted <- + unique(allele_mapping[allele_mapping$UNCORRECTED %in% unambig_cols, + ]$CORRECTED) + #check if any ambig are to the same allele + ambig_corrcted <- + unique(allele_mapping[allele_mapping$UNCORRECTED %in% ambig_cols, + ]$CORRECTED) + #overlap? + ambig_corrcted_rnme <- + ambig_corrcted[ambig_corrcted %in% unambig_corrcted] + if (length(ambig_corrcted_rnme)>0){ + message("There are multiple columns relating to the same allele info") + message("Renaming ambiguous allele columns so they won't be used") + #get the related ambig naming and change there name so won't be used + ambig_uncorrcted_rnme <- + ambig_allele_map[ambig_allele_map$CORRECTED %in% ambig_corrcted_rnme, + ]$UNCORRECTED + #now rename any matches in sumstats + chng_nmes <- column_headers[toupper(column_headers) %in% + ambig_uncorrcted_rnme] + for(chng_i in chng_nmes){ + data.table::setnames(sumstats_dt, chng_i, paste0(chng_i,"_INPUTTED")) + } + } + } else if (length(unambig_cols)==0 && length(ambig_cols)>=2){ + #only continue if no unambiguous columns found but 2 ambig ones are found- + #less than 2 in total means allele info is missing which MSS can try fill + #in later + message("Allele columns are ambiguous, attempting to infer direction") + #get names for allele mared eff/frq columns + eff_frq_allele_matches <- get_eff_frq_allele_combns() + #now look for matches in sumstats + fnd_allele_indicator <- + column_headers[toupper(column_headers) %in% + eff_frq_allele_matches$UNCORRECTED] + if(length(fnd_allele_indicator)>0){ + message("Found direction from effect/frq column naming") + #fnd_allele_indicator could be >1 so majority vote + a1_mtch <- sum(grepl("A1",fnd_allele_indicator)) + a2_mtch <- sum(grepl("A2",fnd_allele_indicator)) + if(a2_mtch>=a1_mtch){ + message("Effect/frq column(s) relate to A2 in the inputted sumstats") + #this is what MSS expects so no action required + }else{#a2_mtch