diff --git a/DESCRIPTION b/DESCRIPTION index 05b8390..7b7e526 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: MungeSumstats Type: Package Title: Standardise summary statistics from GWAS -Version: 1.11.5 +Version: 1.11.6 Authors@R: c(person(given = "Alan", family = "Murphy", diff --git a/NAMESPACE b/NAMESPACE index 03c8c9a..e1145c7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -66,6 +66,7 @@ importFrom(data.table,fwrite) importFrom(data.table,key) importFrom(data.table,rbindlist) importFrom(data.table,set) +importFrom(data.table,setDF) importFrom(data.table,setDT) importFrom(data.table,setcolorder) importFrom(data.table,setkey) diff --git a/NEWS.md b/NEWS.md index 5833d67..399dc13 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +## CHANGES IN VERSION 1.11.6 + +### Bug fix +* Catch for overflow when NA's in SNP col for `check_no_rs_snp()` check with +`imputation_ind=TRUE`. + ## CHANGES IN VERSION 1.11.4 ### Bug fix diff --git a/R/check_no_rs_snp.R b/R/check_no_rs_snp.R index 71809f5..b632617 100644 --- a/R/check_no_rs_snp.R +++ b/R/check_no_rs_snp.R @@ -20,7 +20,7 @@ check_no_rs_snp <- function(sumstats_dt, path, ref_genome, snp_ids_are_rs_ids, check_save_out,tabix_index, nThread, log_files, dbSNP) { SNP <- CHR <- CHR1 <- BP1 <- i.RefSNP_id <- IMPUTATION_SNP <- - SNP_old_temp <- SNP_INFO <- A1 <- A2 <- NULL + SNP_old_temp <- SNP_INFO <- A1 <- A2 <- .I <- NULL # if snp ids aren't rs ids rename the column to ID's # so RSIDs can be inferred if ((!snp_ids_are_rs_ids) & sum("SNP" %in% names(sumstats_dt)) == 1) { @@ -56,6 +56,10 @@ check_no_rs_snp <- function(sumstats_dt, path, ref_genome, snp_ids_are_rs_ids, message("Checking SNP RSIDs.") # needed for later to join and match SNPs if (imputation_ind) { + # if any are NA, we need to give them a unique ID + # need to be able to identify to revert back so picked up + #by other, NA for RS ID check + sumstats_dt[is.na(SNP)|SNP=="", SNP := paste0("NA_",.I)] sumstats_dt[, SNP_old_temp := SNP] } miss_rs <- sumstats_dt[!grep("^rs", SNP), ] @@ -174,8 +178,8 @@ check_no_rs_snp <- function(sumstats_dt, path, ref_genome, snp_ids_are_rs_ids, format <- c("BP1", "CHR1") } miss_rs_chr_bp[, (format) := data.table::tstrsplit(SNP, - split = ":", fixed = TRUE - )]#[c(1,2)]] + split = ":", + fixed = TRUE)] # if BP col has other info after, drop it if (sum(grepl("[[:punct:]].*", miss_rs_chr_bp$BP1)) > 0) { miss_rs_chr_bp[, BP1 := gsub("([[:punct:]]).*", "", BP1)] @@ -278,7 +282,8 @@ check_no_rs_snp <- function(sumstats_dt, path, ref_genome, snp_ids_are_rs_ids, } } if (nrow(miss_rs) != nrow(sumstats_dt) && nrow(miss_rs) != 0) { - if (nrow(miss_rs_chr_bp) == 0) { + #Prev if (nrow(miss_rs_chr_bp) == 0) { + if (nrow(miss_rs_chr_bp) < nrow(miss_rs)) { # don't filter twice if hit prev condition # check if impute of correct SNP ID possible if (sum(c("CHR", "BP") %in% col_headers) == 2 && @@ -346,11 +351,18 @@ check_no_rs_snp <- function(sumstats_dt, path, ref_genome, snp_ids_are_rs_ids, # If user wants log, save it to there if (log_folder_ind && nrow(sumstats_dt[!grep("^rs", SNP), ]) > 0) { + #if imputation ind on, NA's will be replaced with NA_i + #change back here + tmp_rmv <- sumstats_dt[!grepl("^rs", SNP), ] + if(imputation_ind){ + tmp_rmv[, SNP_old_temp := NULL] + tmp_rmv[grepl("^NA_",SNP), SNP := NA] + } name <- "snp_missing_rs" name <- get_unique_name_log_file(name = name, log_files = log_files) write_sumstats( - sumstats_dt = sumstats_dt[!grepl("^rs", SNP), ], + sumstats_dt = tmp_rmv, save_path = paste0( check_save_out$log_folder, @@ -420,7 +432,7 @@ check_no_rs_snp <- function(sumstats_dt, path, ref_genome, snp_ids_are_rs_ids, setkey(sumstats_dt, SNP_old_temp) sumstats_dt[miss_rs, IMPUTATION_SNP := TRUE] } - # remove temp column either way + # remove temp columns either way sumstats_dt[, SNP_old_temp := NULL] } return(list("sumstats_dt" = sumstats_dt, "log_files" = log_files)) diff --git a/R/format_sumstats.R b/R/format_sumstats.R index cb05691..d4a4fbf 100644 --- a/R/format_sumstats.R +++ b/R/format_sumstats.R @@ -433,7 +433,19 @@ format_sumstats <- function(path, nThread = nThread ) } - + + #If user inputted mapping file, validate + if(!identical(mapping_file, sumstatsColHeaders)) { + message("Non-standard mapping file detected.", + "Making sure all entries in `Uncorrected`", + " are in upper case.") + data.table::setDF(mapping_file) + #check again + if(!identical(mapping_file, sumstatsColHeaders)) { + mapping_file$Uncorrected <- toupper(mapping_file$Uncorrected) + } + } + #If es_is_beta remove from mapping file if present if (!es_is_beta & nrow(mapping_file[mapping_file$Uncorrected=="ES" & mapping_file$Corrected=="BETA",])>=1) diff --git a/R/standardise_sumstats_column_headers_crossplatform.R b/R/standardise_sumstats_column_headers_crossplatform.R index 12d9432..4ecc989 100644 --- a/R/standardise_sumstats_column_headers_crossplatform.R +++ b/R/standardise_sumstats_column_headers_crossplatform.R @@ -24,17 +24,6 @@ standardise_header <- standardise_sumstats_column_headers_crossplatform <- mapping_file = sumstatsColHeaders, uppercase_unmapped=TRUE, return_list=TRUE) { - - data.table::setDF(mapping_file) - if(!all.equal( - mapping_file, - sumstatsColHeaders - )) { - message( - "Non-standard mapping file detected.", - "Making sure all entries in `Uncorrected` are in upper case.") - mapping_file$Uncorrected <- toupper(mapping_file$Uncorrected) - } message("Standardising column headers.") message("First line of summary statistics file: ") msg <- paste0(names(sumstats_dt), split = "\t")