Standardize the CHR column in check_chr() (MungeSumstats 1.9.11).

Review notes on this revision of the patch:
* Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of the patch; hunk line counts were re-derived to match.
* check_chr() now tests chromosome membership on a prefix-free view of CHR, so
  "chrX"/"chrY"/"chrMT" (and unprefixed autosomes) are no longer discarded as
  nonstandard when rmv_chrPrefix = FALSE.
* rmv_chr is matched without regard to the "chr" prefix, so rmv_chr = "X" also
  removes "chrX" when the prefix is kept.
* check_chr() returns its inputs untouched when there is no CHR column, as the
  previous implementation did.

diff --git a/DESCRIPTION b/DESCRIPTION
index 23e352e..2fe19a0 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: MungeSumstats
 Type: Package
 Title: Standardise summary statistics from GWAS
-Version: 1.9.10
+Version: 1.9.11
 Authors@R:
     c(person(given = "Alan",
            family = "Murphy",
diff --git a/NEWS.md b/NEWS.md
index c43a913..27031b4 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,13 @@
+## CHANGES IN VERSION 1.9.11
+
+### New features
+* `check_chr()` now automatically removes all SNPs with nonstandard CHR entries
+(anything other than 1-22, X, Y, and MT)
+* `check_chr()` now ensures that the "chr" prefix is lowercase if kept
+
+### Bug fix
+* The `rmv_chrPrefix` parameter is no longer ignored when `rmv_chr` is NULL
+
 ## CHANGES IN VERSION 1.9.10
 
 ### Bug fix
diff --git a/R/check_chr.R b/R/check_chr.R
index 0a1d1e0..d443c2d 100644
--- a/R/check_chr.R
+++ b/R/check_chr.R
@@ -1,84 +1,114 @@
-#' Ensure all SNPs on specified chromosomes are removed
+#' Standardize the CHR column
 #'
+#' Renames "23" to "X", makes X/Y/MT uppercase and the "chr" prefix lowercase,
+#' and removes SNPs with nonstandard CHR entries. Optionally, also removes the
+#' "chr" prefix and SNPs on user-specified chromosomes.
+#'
+#' @param sumstats_dt data.table with summary statistics
+#' @param log_files list of locations for all log files
+#' @param check_save_out list of parameters for saved files
 #' @inheritParams format_sumstats
-#' @param log_files list of log file locations
-#' @return list containing sumstats_dt, the modified summary statistics data
-#' table object and the log file list
+#' @return list containing the updated summary statistics data.table and the
+#' updated log file locations list
 #' @keywords internal
 check_chr <- function(sumstats_dt,
-                      path,
-                      rmv_chr,
-                      log_folder_ind,
+                      log_files,
                       check_save_out,
-                      tabix_index,
+                      rmv_chr,
+                      rmv_chrPrefix,
                       nThread,
-                      log_files,
-                      make_uppercase = TRUE,
-                      rmv_chrPrefix = TRUE) {
-    CHR <- NULL
-    # If CHR present and user specified chromosome to have SNPs removed
-    col_headers <- names(sumstats_dt)
-    if ("CHR" %in% col_headers && !is.null(rmv_chr)) {
+                      tabix_index,
+                      log_folder_ind) {
+  CHR <- NULL
+
+  # Nothing to standardize when there is no CHR column; return inputs as-is
+  if (!"CHR" %in% names(sumstats_dt)) {
+    return(list(sumstats_dt = sumstats_dt, log_files = log_files))
+  }
+
+  # The CHR column needs to be a character vector for gsub substitution to work
+  sumstats_dt[, CHR := as.character(CHR)]
+
+  ### Rename "23" to "X"
+  sumstats_dt[, CHR := gsub("23", "X", CHR)]
 
-        ### Sometimes X is labeled as 23
-        sumstats_dt[, CHR := gsub("23", "X", CHR)]
+  ### Make X/Y/MT uppercase
+  sumstats_dt[, CHR := gsub("x", "X", CHR)]
+  sumstats_dt[, CHR := gsub("y", "Y", CHR)]
+  sumstats_dt[, CHR := gsub("mt", "MT", CHR)]
 
-        #### Remove chr prefix uppercase ####
-        if (rmv_chrPrefix) {
-            message("Removing 'chr' prefix from CHR.")
-            sumstats_dt[, CHR := gsub("chr", "", CHR, ignore.case = TRUE)]
-            rmv_chr <- gsub("chr", "", rmv_chr, ignore.case = TRUE)
-        }
-        #### Make all CHR uppercase ####
-        if (make_uppercase) {
-            message("Making X/Y/MT CHR uppercase.")
-            sumstats_dt[, CHR := gsub("x|23", "X", CHR)]
-            sumstats_dt[, CHR := gsub("y", "Y", CHR)]
-            sumstats_dt[, CHR := gsub("mt", "MT", CHR)]
-        }
+  ### If specified, remove the "chr" prefix
+  if (rmv_chrPrefix) {
+    message("Removing 'chr' prefix from CHR.")
+    sumstats_dt[, CHR := gsub("chr", "", CHR, ignore.case = TRUE)]
+  } else {
+    ### Otherwise, make the "chr" prefix lowercase
+    sumstats_dt[, CHR := gsub("CHR", "chr", CHR)]
+  }
 
-        # check for chromosomes to be removed
-        ### Standardise chromosomes specified
-        rmv_chr <- toupper(rmv_chr)
-        if (any(rmv_chr %in% unique(sumstats_dt$CHR))) {
-            num_bad_ids <- nrow(sumstats_dt[CHR %in% rmv_chr, ])
-            msg <- paste0(
-                formatC(num_bad_ids, big.mark = ","),
-                " SNPs are on chromosomes ",
-                paste(rmv_chr, collapse = ", "),
-                " and will be removed"
-            )
-            message(msg)
-            # If user wants log, save it to there
-            if (log_folder_ind) {
-                name <- "chr_excl"
-                name <- get_unique_name_log_file(
-                    name = name,
-                    log_files = log_files
-                )
-                write_sumstats(
-                    sumstats_dt = sumstats_dt[CHR %in% (rmv_chr), ],
-                    save_path =
-                        paste0(
-                            check_save_out$log_folder,
-                            "/", name,
-                            check_save_out$extension
-                        ),
-                    sep = check_save_out$sep,
-                    tabix_index = tabix_index,
-                    nThread = nThread
-                )
-                log_files[[name]] <-
-                    paste0(
-                        check_save_out$log_folder, "/", name,
-                        check_save_out$extension
-                    )
-            }
-            # remove rows on these chromosomes
-            sumstats_dt <- sumstats_dt[!CHR %in% (rmv_chr), ]
-        }
-        return(list("sumstats_dt" = sumstats_dt, "log_files" = log_files))
-    } else {
-        return(list("sumstats_dt" = sumstats_dt, "log_files" = log_files))
+  # Prefix-free labels: membership tests below must give the same answer whether
+  # or not the "chr" prefix was kept (e.g. "chrX" has to count as standard)
+  chr_labels <- sub("^chr", "", sumstats_dt$CHR)
+  standard_chrs <- c(1:22, "X", "Y", "MT")
+
+  ### Remove rows with nonstandard CHR entries
+  nonstandard_rows <- which(!(chr_labels %in% standard_chrs))
+  if (length(nonstandard_rows) > 0L) {
+    message(
+      "Removing ",
+      formatC(length(nonstandard_rows), big.mark = ","),
+      " SNPs with nonstandard CHR entries."
+    )
+  }
+
+  ### If specified, remove SNPs on specific chromosomes
+  rmv_chr_rows <- c()
+  if (!is.null(rmv_chr)) {
+    # Standardize user-specified chromosomes
+    rmv_chr <- gsub("23", "X", rmv_chr)
+    rmv_chr <- gsub("x", "X", rmv_chr)
+    rmv_chr <- gsub("y", "Y", rmv_chr)
+    rmv_chr <- gsub("mt", "MT", rmv_chr)
+    # Always strip the prefix here so "X" and "chrX" name the same chromosome
+    rmv_chr <- gsub("chr", "", rmv_chr, ignore.case = TRUE)
+
+    # Check for chromosomes to be removed
+    rmv_chr_rows <- which(chr_labels %in% rmv_chr)
+    if (length(rmv_chr_rows) > 0L) {
+      message(
+        formatC(length(rmv_chr_rows), big.mark = ","),
+        " SNPs are on chromosomes ",
+        paste(rmv_chr, collapse = ", "),
+        " and will be removed."
+      )
     }
+  }
+
+  # Vector of row numbers for all removed SNPs
+  all_removed_rows <- sort(unique(c(nonstandard_rows, rmv_chr_rows)))
+
+  ### Save a log of removed SNPs if the user wants it
+  if (log_folder_ind && (length(all_removed_rows) > 0L)) {
+    name <- "chr_excl"
+    name <- get_unique_name_log_file(name = name,
+                                     log_files = log_files)
+    save_path <- paste0(
+      check_save_out$log_folder,
+      "/",
+      name,
+      check_save_out$extension
+    )
+
+    write_sumstats(sumstats_dt = sumstats_dt[all_removed_rows],
+                   save_path = save_path,
+                   sep = check_save_out$sep,
+                   tabix_index = tabix_index,
+                   nThread = nThread)
+    log_files[[name]] <- save_path
+  }
+
+  # Remove the SNPs identified above, if any
+  sumstats_dt <- sumstats_dt[!all_removed_rows]
+
+  return(list(sumstats_dt = sumstats_dt, log_files = log_files))
 }
diff --git a/R/format_sumstats.R b/R/format_sumstats.R
index 8a9625b..f0d4d5e 100644
--- a/R/format_sumstats.R
+++ b/R/format_sumstats.R
@@ -901,17 +901,16 @@ format_sumstats <- function(path,
     # update values
     log_files <- sumstats_return$log_files
 
-    #### Check 24: check that no snps are on specific chromosomes ####
+    #### Check 24: standardize the CHR column ####
     sumstats_return <- check_chr(
-        sumstats_dt = sumstats_return$sumstats_dt,
-        path = path,
-        rmv_chr = rmv_chr,
-        rmv_chrPrefix = rmv_chrPrefix,
-        log_folder_ind = log_folder_ind,
-        check_save_out = check_save_out,
-        tabix_index = tabix_index,
-        nThread = nThread,
-        log_files = log_files
+      sumstats_dt = sumstats_return$sumstats_dt,
+      log_files = log_files,
+      check_save_out = check_save_out,
+      rmv_chr = rmv_chr,
+      rmv_chrPrefix = rmv_chrPrefix,
+      nThread = nThread,
+      tabix_index = tabix_index,
+      log_folder_ind = log_folder_ind
     )
     # update values
     log_files <- sumstats_return$log_files
diff --git a/man/check_chr.Rd b/man/check_chr.Rd
index 949d9c6..343025e 100644
--- a/man/check_chr.Rd
+++ b/man/check_chr.Rd
@@ -2,51 +2,51 @@
 % Please edit documentation in R/check_chr.R
 \name{check_chr}
 \alias{check_chr}
-\title{Ensure all SNPs on specified chromosomes are removed}
+\title{Standardize the CHR column}
 \usage{
 check_chr(
   sumstats_dt,
-  path,
-  rmv_chr,
-  log_folder_ind,
+  log_files,
   check_save_out,
-  tabix_index,
+  rmv_chr,
+  rmv_chrPrefix,
   nThread,
-  log_files,
-  make_uppercase = TRUE,
-  rmv_chrPrefix = TRUE
+  tabix_index,
+  log_folder_ind
 )
 }
 \arguments{
-\item{path}{Filepath for the summary statistics file to be formatted. A
-dataframe or datatable of the summary statistics file can also be passed
-directly to MungeSumstats using the path parameter.}
+\item{sumstats_dt}{data.table with summary statistics}
+
+\item{log_files}{list of locations for all log files}
+
+\item{check_save_out}{list of parameters for saved files}
 
 \item{rmv_chr}{vector or character The chromosomes on which the SNPs should be
 removed. Use NULL if no filtering necessary. Default is X, Y and
 mitochondrial.}
 
+\item{rmv_chrPrefix}{Remove "chr" or "CHR" from chromosome names. Default is
+TRUE.}
+
+\item{nThread}{Number of threads to use for parallel processes.}
+
+\item{tabix_index}{Index the formatted summary statistics with
+\href{http://www.htslib.org/doc/tabix.html}{tabix} for fast querying.}
+
 \item{log_folder_ind}{Binary Should log files be stored containing all
 filtered out SNPs (separate file per filter). The data is outputted in the
 same format specified for the resulting sumstats file. The only exception to
 this rule is if output is vcf, then log file saved as .tsv.gz. Default is
 FALSE.}
-
-\item{tabix_index}{Index the formatted summary statistics with
-\href{http://www.htslib.org/doc/tabix.html}{tabix} for fast querying.}
-
-\item{nThread}{Number of threads to use for parallel processes.}
-
-\item{log_files}{list of log file locations}
-
-\item{rmv_chrPrefix}{Remove "chr" or "CHR" from chromosome names. Default is
-TRUE.}
 }
 \value{
-list containing sumstats_dt, the modified summary statistics data
-table object and the log file list
+list containing the updated summary statistics data.table and the
+updated log file locations list
 }
 \description{
-Ensure all SNPs on specified chromosomes are removed
+Renames "23" to "X", makes X/Y/MT uppercase and the "chr" prefix lowercase,
+and removes SNPs with nonstandard CHR entries. Optionally, also removes the
+"chr" prefix and SNPs on user-specified chromosomes.
 }
 \keyword{internal}
diff --git a/man/to_GRanges.Rd b/man/to_granges.Rd
similarity index 100%
rename from man/to_GRanges.Rd
rename to man/to_granges.Rd
diff --git a/man/to_VRanges.Rd b/man/to_vranges.Rd
similarity index 100%
rename from man/to_VRanges.Rd
rename to man/to_vranges.Rd