Skip to content

Commit

Permalink
Bug Fix: Infer effect allele
Browse files Browse the repository at this point in the history
  • Loading branch information
Al-Murphy committed Oct 13, 2023
1 parent ee806c8 commit d16c5fb
Show file tree
Hide file tree
Showing 14 changed files with 670 additions and 44 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: MungeSumstats
Type: Package
Title: Standardise summary statistics from GWAS
Version: 1.9.17
Version: 1.9.18
Authors@R:
c(person(given = "Alan",
family = "Murphy",
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ export(formatted_example)
export(get_genome_builds)
export(import_sumstats)
export(index_tabular)
export(infer_effect_column)
export(liftover)
export(list_sumstats)
export(load_snp_loc_data)
Expand Down
41 changes: 41 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,44 @@
## CHANGES IN VERSION 1.9.18

### Bug fix
* Fixed column header mappings
* Made all uncorrected header names uppercase and removed duplicates
* "TOTALSAMPLESIZE" now maps to "N" instead of "NSTUDY"
* "MAJORALLELE", "MAJOR_ALLELE", "MAJOR-ALLELE", and "MAJOR ALLELE" now map to
"A1" instead of "A2"
* Removed the mappings for "OR-A1", "OR.A1", "OR_A1", and "BETA1" because MSS
assumes that A2 is the effect allele
* Removed mappings for "A1FREQ", "A1FRQ", "AF1", "FREQ.A1.1000G.EUR",
"FREQ.A1.ESP.EUR", "FREQ.ALLELE1.HAPMAPCEU", "FREQ1", "FREQ1.HAPMAP", and
"FRQ_A1" because MSS defines "FRQ" to be the allele frequency of A2
* Removed mappings for "CHR36", "BASE_GRCH36", "POSITION36", "POSGRCH36",
"BASEGRCH36", "POS36", "POS GRCH36", "POS.GRCH36", "POS-GRCH36", and
"POS_GRCH36"
because MSS does not support the GRCh36 genome build
* Removed the ambiguous mapping "NMISS" -> "N" because "NMISS" can refer to
the number of samples with missing data
* Removed the ambiguous mapping "WEIGHT" -> "N" because "WEIGHT" can refer to
coefficient weights
* Fixed inference of the effect allele where ambiguous (A1, A2) naming is used
(see infer_effect_column.R for the code). In short:
* Three checks are now made to infer which allele the effect/frequency
information relates to. See infer_effect_column.R for further details.
* See get_eff_frq_allele_combns.R for how effect/frequency columns that imply
the allele are captured in the mapping file

### New features
* New column header mappings:
* "VARIANT_ID" and "RSIDS" --> "SNP"
* "P_BOLT_LMM" --> "P"
* "NCASES" --> "N_CAS"
* "N_EFFECTIVE", "N_INFORMATIVE", and "TOTAL_N" --> "N"
* "HET_P" --> "HETPVAL"
* "HET_ISQ" --> "HETISQT"
* "ALL_AF" --> "FRQ"
* "DIRECT" --> "DIRECTION"
* "ALT_EFFSIZE" --> "BETA"
* "INFORMATIVE_ALT_AC" --> "AC"

## CHANGES IN VERSION 1.9.17

### Bug fix
Expand Down
11 changes: 11 additions & 0 deletions R/format_sumstats.R
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,17 @@ format_sumstats <- function(path,
mapping_file <- rbind(mapping_file,es_cols)
}

#### Check 2:Check for effect direction ####
sumstats_return <-
infer_effect_column(
sumstats_dt = sumstats_return$sumstats_dt,
mapping_file = mapping_file,
dbSNP = dbSNP,
nThread = nThread,
ref_genome = ref_genome,
on_ref_genome = on_ref_genome
)

#### Check 3:Standardise headers for all OS ####
sumstats_return <-
standardise_sumstats_column_headers_crossplatform(
Expand Down
77 changes: 77 additions & 0 deletions R/get_eff_frq_allele_combns.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#' Get combinations of uncorrected allele and effect (and frq) columns
#'
#' Builds a lookup table of column headers that name both an effect/frequency
#' measure and an allele (an effect/frequency header pasted together with an
#' allele header), alongside the corrected effect/frequency name each one
#' corresponds to.
#'
#' @details Every uncorrected header in `mapping_file` whose corrected name is
#' in `eff_frq_cols` is paired with every uncorrected header whose corrected
#' name is `A1` or `A2`. Each pair is pasted together in both orders
#' (effect first, then allele first) using each of the joining strings `""`,
#' `"_"`, `"."`, `"-"` and `" "`. A fixed set of custom headers (e.g. `BETA1`,
#' `FREQ2`) is appended at the end.
#'
#' @inheritParams format_sumstats
#' @inheritParams compute_nsize
#' @param eff_frq_cols Corrected effect or frequency column names found in a
#' sumstats. Default of BETA, OR, LOG_ODDS, SIGNED_SUMSTAT, Z and FRQ.
#' @return data.table with two columns: `UNCORRECTED` (the combined
#' effect/frequency + allele header) and `CORRECTED` (the corrected
#' effect/frequency column name it maps to).
#' @importFrom data.table setnames as.data.table := setkey rbindlist data.table
#' copy
get_eff_frq_allele_combns <-
  function(mapping_file = sumstatsColHeaders,
           eff_frq_cols = c("BETA", "OR", "LOG_ODDS", "SIGNED_SUMSTAT", "Z",
                            "FRQ")) {
    ### Add this to avoid confusing BiocCheck
    CORRECTED <- UNCORRECTED <- Var1 <- Var2 <- i.CORRECTED <- NULL
    # Work with upper-case column names (UNCORRECTED / CORRECTED) throughout
    colnames(mapping_file) <- toupper(colnames(mapping_file))
    # Uncorrected headers that map to one of the effect/FRQ columns
    eff_frq_cols_uncorrc <-
      mapping_file[mapping_file$CORRECTED %in% eff_frq_cols, ]$UNCORRECTED
    # Uncorrected headers that map to an allele column
    allele_uncorrc <-
      mapping_file[mapping_file$CORRECTED %in% c("A1", "A2"), ]$UNCORRECTED
    # All effect/FRQ x allele pairs; Var1 = effect/FRQ header, Var2 = allele
    # header. expand.grid's default factor columns are kept on purpose so the
    # key sort below orders rows by first appearance in the mapping file.
    eff_frq_allele_dt <-
      data.table::as.data.table(expand.grid(eff_frq_cols_uncorrc,
                                            allele_uncorrc))
    mapping_file_dt <- data.table::as.data.table(mapping_file)
    data.table::setkey(mapping_file_dt, "UNCORRECTED")
    data.table::setkey(eff_frq_allele_dt, "Var1")
    # Add the corrected name of each effect/FRQ header. Join explicitly on
    # the UNCORRECTED column of the keyed mapping table, rather than on
    # whichever column happens to come first in `mapping_file`.
    eff_frq_allele_dt[mapping_file_dt,
                      CORRECTED := i.CORRECTED,
                      on = c(Var1 = "UNCORRECTED")]
    # Now loop through every joining character and join the allele header
    # with the effect/FRQ header, both before and after it
    joining_char <- c("", "_", ".", "-", " ")
    all_combns <- vector(mode = "list", length = length(joining_char) * 2L)
    counter <- 1L
    for (join_i in joining_char) {
      # effect/FRQ header first, e.g. <effect><join><allele>
      eff_frq_allele_dt_i <- data.table::copy(eff_frq_allele_dt)
      eff_frq_allele_dt_i[, UNCORRECTED := paste0(Var1, join_i, Var2)]
      all_combns[[counter]] <-
        eff_frq_allele_dt_i[, c("UNCORRECTED", "CORRECTED")]
      counter <- counter + 1L
      # same with the allele header in front, e.g. <allele><join><effect>
      eff_frq_allele_dt_i <- data.table::copy(eff_frq_allele_dt)
      eff_frq_allele_dt_i[, UNCORRECTED := paste0(Var2, join_i, Var1)]
      all_combns[[counter]] <-
        eff_frq_allele_dt_i[, c("UNCORRECTED", "CORRECTED")]
      counter <- counter + 1L
    }
    # Join all together
    eff_frq_allele_matches <- data.table::rbindlist(all_combns)
    # Finally add some custom headers that are not a plain
    # effect/FRQ + allele paste
    custom_adds <- data.table::data.table(
      "UNCORRECTED" =
        c("BETA1", "BETA2", "AF1", "AF2",
          "FREQ.A1.1000G.EUR",
          "FREQ.A2.1000G.EUR",
          "FREQ.A1.ESP.EUR",
          "FREQ.A2.ESP.EUR",
          "FREQ.ALLELE1.HAPMAPCEU",
          "FREQ.ALLELE2.HAPMAPCEU",
          "FREQ1", "FREQ2",
          "FREQ1.HAPMAP", "FREQ2.HAPMAP"),
      "CORRECTED" =
        c("BETA", "BETA", "FRQ", "FRQ",
          "FRQ",
          "FRQ",
          "FRQ",
          "FRQ",
          "FRQ",
          "FRQ",
          "FRQ", "FRQ",
          "FRQ", "FRQ"))
    eff_frq_allele_matches <- data.table::rbindlist(list(
      eff_frq_allele_matches, custom_adds))

    return(eff_frq_allele_matches)
  }
Loading

0 comments on commit d16c5fb

Please sign in to comment.