Skip to content

Commit

Permalink
Infer effect dirt param added; bug fix unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Al-Murphy committed Oct 19, 2023
1 parent d16c5fb commit 0158026
Show file tree
Hide file tree
Showing 14 changed files with 275 additions and 218 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: MungeSumstats
Type: Package
Title: Standardise summary statistics from GWAS
Version: 1.9.18
Version: 1.9.19
Authors@R:
c(person(given = "Alan",
family = "Murphy",
Expand Down
9 changes: 9 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
## CHANGES IN VERSION 1.9.19

### New features
* `infer_eff_direction` parameter added so user can decide whether to run the check

### Bug fix
* Typo in unit test for infer effect direction.
* IEU GWAS unit tests updated to account for server outages.

## CHANGES IN VERSION 1.9.18

### Bug fix
Expand Down
7 changes: 6 additions & 1 deletion R/format_sumstats.R
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,8 @@
#' which removes all non-autosomal SNPs.
#' @param on_ref_genome Binary Should a check take place that all SNPs are on
#' the reference genome by SNP ID. Default is TRUE.
#' @param infer_eff_direction Binary Should a check take place to ensure the
#' alleles match the effect direction? Default is TRUE.
#' @param strand_ambig_filter Binary Should SNPs with strand-ambiguous alleles
#' be removed. Default is FALSE.
#' @param allele_flip_check Binary Should the allele columns be checked against
Expand Down Expand Up @@ -247,6 +249,7 @@ format_sumstats <- function(path,
chr_style = "Ensembl",
rmv_chr = c("X", "Y", "MT"),
on_ref_genome = TRUE,
infer_eff_direction = TRUE,
strand_ambig_filter = FALSE,
allele_flip_check = TRUE,
allele_flip_drop = TRUE,
Expand Down Expand Up @@ -335,6 +338,7 @@ format_sumstats <- function(path,
chr_style = chr_style,
rmv_chr = rmv_chr,
on_ref_genome = on_ref_genome,
infer_eff_direction = infer_eff_direction,
strand_ambig_filter = strand_ambig_filter,
allele_flip_check = allele_flip_check,
allele_flip_drop = allele_flip_drop,
Expand Down Expand Up @@ -453,7 +457,8 @@ format_sumstats <- function(path,
dbSNP = dbSNP,
nThread = nThread,
ref_genome = ref_genome,
on_ref_genome = on_ref_genome
on_ref_genome = on_ref_genome,
infer_eff_direction = infer_eff_direction
)

#### Check 3:Standardise headers for all OS ####
Expand Down
253 changes: 129 additions & 124 deletions R/infer_effect_column.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,144 +35,149 @@ infer_effect_column <-
nThread = nThread,
ref_genome = NULL,
on_ref_genome = TRUE,
infer_eff_direction = TRUE,
return_list=TRUE) {

message("Infer Effect Column")
message("First line of summary statistics file: ")
msg <- paste0(names(sumstats_dt), split = "\t")
message(msg)
#### first make all column headers upper case ####
column_headers <- names(sumstats_dt)
# load synonym mapping - internal data no loading
# Identify allele mappings which are ambiguous and problematic
# vs those that are interpretable
colnames(mapping_file) <- toupper(colnames(mapping_file))
allele_mapping <- mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),]
ambig_allele_map <- allele_mapping[grepl('1',allele_mapping$UNCORRECTED)|
grepl('2',allele_mapping$UNCORRECTED),]
unambig_allele_map <-
allele_mapping[!(grepl('1',allele_mapping$UNCORRECTED)|
grepl('2',allele_mapping$UNCORRECTED)),]
#as long as the sumstats contains 1 unambiguous allele column MSS will work
#as expected
unambig_cols <- intersect(unambig_allele_map$UNCORRECTED,
if(isTRUE(infer_eff_direction)){
message("Infer Effect Column")
message("First line of summary statistics file: ")
msg <- paste0(names(sumstats_dt), split = "\t")
message(msg)
#### first make all column headers upper case ####
column_headers <- names(sumstats_dt)
# load synonym mapping - internal data no loading
# Identify allele mappings which are ambiguous and problematic
# vs those that are interpretable
colnames(mapping_file) <- toupper(colnames(mapping_file))
allele_mapping <- mapping_file[mapping_file$CORRECTED %in% c('A1','A2'),]
ambig_allele_map <- allele_mapping[grepl('1',allele_mapping$UNCORRECTED)|
grepl('2',allele_mapping$UNCORRECTED),]
unambig_allele_map <-
allele_mapping[!(grepl('1',allele_mapping$UNCORRECTED)|
grepl('2',allele_mapping$UNCORRECTED)),]
#as long as the sumstats contains 1 unambiguous allele column MSS will work
#as expected
unambig_cols <- intersect(unambig_allele_map$UNCORRECTED,
toupper(column_headers))
ambig_cols <- intersect(ambig_allele_map$UNCORRECTED,
toupper(column_headers))
ambig_cols <- intersect(ambig_allele_map$UNCORRECTED,
toupper(column_headers))
#if both ambiguous and unambiguous columns found, rename ambiguous ones so
#they aren't used later by MSS
#example: 'A1','A2','EFFECT_ALLELE' all present
if (length(unambig_cols)>0 && length(ambig_cols)>0){
#find if unambig and ambig relate to the same allele
#get corrected name for unambig
unambig_corrcted <-
unique(allele_mapping[allele_mapping$UNCORRECTED %in% unambig_cols,
]$CORRECTED)
#check if any ambig are to the same allele
ambig_corrcted <-
unique(allele_mapping[allele_mapping$UNCORRECTED %in% ambig_cols,
]$CORRECTED)
#overlap?
ambig_corrcted_rnme <-
ambig_corrcted[ambig_corrcted %in% unambig_corrcted]
if (length(ambig_corrcted_rnme)>0){
message("There are multiple columns relating to the same allele info")
message("Renaming ambiguous allele columns so they won't be used")
#get the related ambig naming and change there name so won't be used
ambig_uncorrcted_rnme <-
ambig_allele_map[ambig_allele_map$CORRECTED %in% ambig_corrcted_rnme,
]$UNCORRECTED
#now rename any matches in sumstats
chng_nmes <- column_headers[toupper(column_headers) %in%
ambig_uncorrcted_rnme]
for(chng_i in chng_nmes){
data.table::setnames(sumstats_dt, chng_i, paste0(chng_i,"_INPUTTED"))
#if both ambiguous and unambiguous columns found, rename ambiguous ones so
#they aren't used later by MSS
#example: 'A1','A2','EFFECT_ALLELE' all present
if (length(unambig_cols)>0 && length(ambig_cols)>0){
#find if unambig and ambig relate to the same allele
#get corrected name for unambig
unambig_corrcted <-
unique(allele_mapping[allele_mapping$UNCORRECTED %in% unambig_cols,
]$CORRECTED)
#check if any ambig are to the same allele
ambig_corrcted <-
unique(allele_mapping[allele_mapping$UNCORRECTED %in% ambig_cols,
]$CORRECTED)
#overlap?
ambig_corrcted_rnme <-
ambig_corrcted[ambig_corrcted %in% unambig_corrcted]
if (length(ambig_corrcted_rnme)>0){
message("There are multiple columns relating to the same allele info")
message("Renaming ambiguous allele columns so they won't be used")
#get the related ambig naming and change there name so won't be used
ambig_uncorrcted_rnme <-
ambig_allele_map[ambig_allele_map$CORRECTED %in% ambig_corrcted_rnme,
]$UNCORRECTED
#now rename any matches in sumstats
chng_nmes <- column_headers[toupper(column_headers) %in%
ambig_uncorrcted_rnme]
for(chng_i in chng_nmes){
data.table::setnames(sumstats_dt, chng_i, paste0(chng_i,"_INPUTTED"))
}
}
}
} else if (length(unambig_cols)==0 && length(ambig_cols)>=2){
#only continue if no unambiguous columns found but 2 ambig ones are found-
#less than 2 in total means allele info is missing which MSS can try fill
#in later
message("Allele columns are ambiguous, attempting to infer direction")
#get names for allele mared eff/frq columns
eff_frq_allele_matches <- get_eff_frq_allele_combns()
#now look for matches in sumstats
fnd_allele_indicator <-
column_headers[toupper(column_headers) %in%
eff_frq_allele_matches$UNCORRECTED]
if(length(fnd_allele_indicator)>0){
message("Found direction from effect/frq column naming")
#fnd_allele_indicator could be >1 so majority vote
a1_mtch <- sum(grepl("A1",fnd_allele_indicator))
a2_mtch <- sum(grepl("A2",fnd_allele_indicator))
if(a2_mtch>=a1_mtch){
message("Effect/frq column(s) relate to A2 in the inputted sumstats")
#this is what MSS expects so no action required
}else{#a2_mtch<a1_mtch
message("Effect/frq column(s) relate to A1 in the inputted sumstats")
#this is the opposite to what MSS expects so switch A1/A2 naming
#first get corrected names for allele columns then switch
for (headerI in seq_len(nrow(mapping_file))) {
un <- mapping_file[headerI, "UNCORRECTED"]
cr <- mapping_file[headerI, "CORRECTED"]
#note ambig_cols here not all col headers
if (un %in% ambig_cols & (!cr %in% column_headers)) {
data.table::setnames(sumstats_dt, un, cr)
} else if (length(unambig_cols)==0 && length(ambig_cols)>=2){
#only continue if no unambiguous columns found but 2 ambig ones are found-
#less than 2 in total means allele info is missing which MSS can try fill
#in later
message("Allele columns are ambiguous, attempting to infer direction")
#get names for allele mared eff/frq columns
eff_frq_allele_matches <- get_eff_frq_allele_combns()
#now look for matches in sumstats
fnd_allele_indicator <-
column_headers[toupper(column_headers) %in%
eff_frq_allele_matches$UNCORRECTED]
if(length(fnd_allele_indicator)>0){
message("Found direction from effect/frq column naming")
#fnd_allele_indicator could be >1 so majority vote
a1_mtch <- sum(grepl("A1",fnd_allele_indicator))
a2_mtch <- sum(grepl("A2",fnd_allele_indicator))
if(a2_mtch>=a1_mtch){
message("Effect/frq column(s) relate to A2 in the inputted sumstats")
#this is what MSS expects so no action required
}else{#a2_mtch<a1_mtch
message("Effect/frq column(s) relate to A1 in the inputted sumstats")
#this is the opposite to what MSS expects so switch A1/A2 naming
#first get corrected names for allele columns then switch
for (headerI in seq_len(nrow(mapping_file))) {
un <- mapping_file[headerI, "UNCORRECTED"]
cr <- mapping_file[headerI, "CORRECTED"]
#note ambig_cols here not all col headers
if (un %in% ambig_cols & (!cr %in% column_headers)) {
data.table::setnames(sumstats_dt, un, cr)
}
}
#now switch
data.table::setnames(sumstats_dt,"A2","A2_INPUTTED_OLD_")
data.table::setnames(sumstats_dt,"A1","A2")
data.table::setnames(sumstats_dt,"A2_INPUTTED_OLD_","A1")
}
#now switch
data.table::setnames(sumstats_dt,"A2","A2_INPUTTED_OLD_")
data.table::setnames(sumstats_dt,"A1","A2")
data.table::setnames(sumstats_dt,"A2_INPUTTED_OLD_","A1")

}

}
else{
#didn't find any allele specific
#last option is to check the reference genome and take the allele with
#more matches to it as the non-effect allele
#need to standardise the sumstats for this
#use get_genome_build function for this
if(isTRUE(on_ref_genome)){
switch_req <- get_genome_build(
sumstats = sumstats_dt,
standardise_headers = TRUE,
sampled_snps = sampled_snps,
mapping_file = mapping_file,
dbSNP=dbSNP,
nThread = nThread,
allele_match_ref = TRUE,
ref_genome = ref_genome
)
if(is.logical(switch_req)){
if(isTRUE(switch_req)){
#swap A1 and A2
#this is the opposite to what MSS expects so switch A1/A2 naming
#first get corrected names for allele columns then switch
for (headerI in seq_len(nrow(mapping_file))) {
un <- mapping_file[headerI, "UNCORRECTED"]
cr <- mapping_file[headerI, "CORRECTED"]
#note ambig_cols here not all col headers
if (un %in% ambig_cols & (!cr %in% column_headers)) {
data.table::setnames(sumstats_dt, un, cr)
else{
#didn't find any allele specific
#last option is to check the reference genome and take the allele with
#more matches to it as the non-effect allele
#need to standardise the sumstats for this
#use get_genome_build function for this
if(isTRUE(on_ref_genome)){
switch_req <- get_genome_build(
sumstats = sumstats_dt,
standardise_headers = TRUE,
sampled_snps = sampled_snps,
mapping_file = mapping_file,
dbSNP=dbSNP,
nThread = nThread,
allele_match_ref = TRUE,
ref_genome = ref_genome
)
if(is.logical(switch_req)){
message(paste0("Found direction from matchine reference genome - N",
"OTE this assumes non-effect allele will macth the ",
"reference genome"))
if(isTRUE(switch_req)){
#swap A1 and A2
#this is the opposite to what MSS expects so switch A1/A2 naming
#first get corrected names for allele columns then switch
for (headerI in seq_len(nrow(mapping_file))) {
un <- mapping_file[headerI, "UNCORRECTED"]
cr <- mapping_file[headerI, "CORRECTED"]
#note ambig_cols here not all col headers
if (un %in% ambig_cols & (!cr %in% column_headers)) {
data.table::setnames(sumstats_dt, un, cr)
}
}
#now switch
data.table::setnames(sumstats_dt,"A2","A2_INPUTTED_OLD_")
data.table::setnames(sumstats_dt,"A1","A2")
data.table::setnames(sumstats_dt,"A2_INPUTTED_OLD_","A1")
}
#now switch
data.table::setnames(sumstats_dt,"A2","A2_INPUTTED_OLD_")
data.table::setnames(sumstats_dt,"A1","A2")
data.table::setnames(sumstats_dt,"A2_INPUTTED_OLD_","A1")
}
else{ #couldn't infer
message(switch_req)
message("Can't infer allele columns from sumstats")
}
}
else{ #couldn't infer
message(switch_req)
else{
message("Can't infer allele columns from sumstats")
}
}
else{
message("Can't infer allele columns from sumstats")
}
}
}
}
#### Return format ####
if(return_list){
return(list("sumstats_dt" = sumstats_dt))
Expand Down
4 changes: 4 additions & 0 deletions R/validate_parameters.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ validate_parameters <- function(path,
chr_style,
rmv_chr,
on_ref_genome,
infer_eff_direction,
strand_ambig_filter,
allele_flip_check,
allele_flip_drop,
Expand Down Expand Up @@ -225,6 +226,9 @@ validate_parameters <- function(path,
if (!is.logical(on_ref_genome)) {
stop("on_ref_genome must be either TRUE or FALSE")
}
if(!is.logical(infer_eff_direction)){
stop("infer_eff_direction must be either TRUE or FALSE")
}
if (!is.logical(strand_ambig_filter)) {
stop("strand_ambig_filter must be either TRUE or FALSE")
}
Expand Down
Binary file added inst/extdata/ALSvcf.vcf.bgz
Binary file not shown.
Binary file added inst/extdata/ALSvcf.vcf.bgz.tbi
Binary file not shown.
4 changes: 4 additions & 0 deletions man/format_sumstats.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions man/import_sumstats.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions man/infer_effect_column.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 0158026

Please sign in to comment.