Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve chromosome standardization #157

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: MungeSumstats
Type: Package
Title: Standardise summary statistics from GWAS
Version: 1.9.10
Version: 1.9.11
Authors@R:
c(person(given = "Alan",
family = "Murphy",
Expand Down
10 changes: 10 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
## CHANGES IN VERSION 1.9.11

### New features
* `check_chr()` now automatically removes all SNPs with nonstandard CHR entries
(anything other than 1-22, X, Y, and MT)
* `check_chr()` now ensures that the "chr" prefix is lowercase if kept

### Bug fix
* The `rmv_chrPrefix` parameter is no longer ignored when `rmv_chr` is NULL

## CHANGES IN VERSION 1.9.10

### Bug fix
Expand Down
169 changes: 97 additions & 72 deletions R/check_chr.R
Original file line number Diff line number Diff line change
@@ -1,84 +1,109 @@
#' Ensure all SNPs on specified chromosomes are removed
#' Standardize the CHR column
#'
#' Renames "23" to "X", makes X/Y/MT uppercase and the "chr" prefix lowercase,
#' and removes SNPs with nonstandard CHR entries. Optionally, also removes the
#' "chr" prefix and SNPs on user-specified chromosomes.
#'
#' @param sumstats_dt data.table with summary statistics
#' @param log_files list of locations for all log files
#' @param check_save_out list of parameters for saved files
#' @inheritParams format_sumstats
#' @param log_files list of log file locations
#' @return list containing sumstats_dt, the modified summary statistics data
#' table object and the log file list
#' @return list containing the updated summary statistics data.table and the
#' updated log file locations list
#' @keywords internal
check_chr <- function(sumstats_dt,
# NOTE(review): this listing comes from a rendered diff view, so lines of two
# versions of this function appear interleaved without +/- markers (duplicate
# parameters, two `{` openers). Read the comments below with that in mind.
path,
rmv_chr,
log_folder_ind,
log_files,
check_save_out,
tabix_index,
rmv_chr,
rmv_chrPrefix,
nThread,
log_files,
make_uppercase = TRUE,
rmv_chrPrefix = TRUE) {
CHR <- NULL
# If CHR present and user specified chromosome to have SNPs removed
col_headers <- names(sumstats_dt)
if ("CHR" %in% col_headers && !is.null(rmv_chr)) {
tabix_index,
log_folder_ind) {
CHR <- NULL

# The CHR column needs to be a character vector for gsub substitution to work
# `:=` updates the column in place, so the table passed in by the caller is
# modified as well.
# NOTE(review): the `"CHR" %in% col_headers` guard shown above is not repeated
# before this statement -- confirm which version the guard belongs to and
# whether a table without a CHR column can reach this point.
sumstats_dt[, CHR := as.character(CHR)]

### Rename "23" to "X"
# NOTE(review): the pattern is unanchored, so every occurrence of "23" inside
# a value is rewritten (e.g. "chr23" -> "chrX", but also "123" -> "1X").
sumstats_dt[, CHR := gsub("23", "X", CHR)]

### Sometimes X is labeled as 23
sumstats_dt[, CHR := gsub("23", "X", CHR)]
### Make X/Y/MT uppercase
# These patterns are unanchored too: any lowercase "x", "y" or "mt" anywhere
# in the value is uppercased, not only when it is the whole chromosome name.
sumstats_dt[, CHR := gsub("x", "X", CHR)]
sumstats_dt[, CHR := gsub("y", "Y", CHR)]
sumstats_dt[, CHR := gsub("mt", "MT", CHR)]

#### Remove chr prefix uppercase ####
if (rmv_chrPrefix) {
message("Removing 'chr' prefix from CHR.")
sumstats_dt[, CHR := gsub("chr", "", CHR, ignore.case = TRUE)]
rmv_chr <- gsub("chr", "", rmv_chr, ignore.case = TRUE)
}
#### Make all CHR uppercase ####
if (make_uppercase) {
message("Making X/Y/MT CHR uppercase.")
sumstats_dt[, CHR := gsub("x|23", "X", CHR)]
sumstats_dt[, CHR := gsub("y", "Y", CHR)]
sumstats_dt[, CHR := gsub("mt", "MT", CHR)]
}
### If specified, remove the "chr" prefix
if (rmv_chrPrefix) {
message("Removing 'chr' prefix from CHR.")
sumstats_dt[, CHR := gsub("chr", "", CHR, ignore.case = TRUE)]
# Prefix removed: the accepted values are "1".."22", "X", "Y", "MT"
# (c() coerces 1:22 to character because of the string elements).
standard_chrs <- c(1:22, "X", "Y", "MT")
} else {
### Otherwise, make the "chr" prefix lowercase
sumstats_dt[, CHR := gsub("CHR", "chr", CHR)]
# NOTE(review): in this branch the autosomes are listed with the "chr"
# prefix but X/Y/MT are listed bare. A value such as "chrX" is therefore not
# in `standard_chrs` and is treated as nonstandard by the `%in%` test below,
# while a bare "X" is accepted and a bare "1" is not. Confirm whether
# "chrX"/"chrY"/"chrMT" were meant to be included.
standard_chrs <- c(paste0("chr", 1:22), "X", "Y", "MT")
}

# check for chromosomes to be removed
### Standardise chromosomes specified
rmv_chr <- toupper(rmv_chr)
if (any(rmv_chr %in% unique(sumstats_dt$CHR))) {
num_bad_ids <- nrow(sumstats_dt[CHR %in% rmv_chr, ])
msg <- paste0(
formatC(num_bad_ids, big.mark = ","),
" SNPs are on chromosomes ",
paste(rmv_chr, collapse = ", "),
" and will be removed"
)
message(msg)
# If user wants log, save it to there
if (log_folder_ind) {
name <- "chr_excl"
name <- get_unique_name_log_file(
name = name,
log_files = log_files
)
write_sumstats(
sumstats_dt = sumstats_dt[CHR %in% (rmv_chr), ],
save_path =
paste0(
check_save_out$log_folder,
"/", name,
check_save_out$extension
),
sep = check_save_out$sep,
tabix_index = tabix_index,
nThread = nThread
)
log_files[[name]] <-
paste0(
check_save_out$log_folder, "/", name,
check_save_out$extension
)
}
# remove rows on these chromosomes
sumstats_dt <- sumstats_dt[!CHR %in% (rmv_chr), ]
}
return(list("sumstats_dt" = sumstats_dt, "log_files" = log_files))
### Remove rows with nonstandard CHR entries
# Row indices whose CHR is not in `standard_chrs`. `%in%` never returns NA,
# so rows with a missing CHR are counted as nonstandard as well.
nonstandard_rows <- which(!(sumstats_dt$CHR %in% standard_chrs))
if (length(nonstandard_rows) > 0L) {
message(
"Removing ",
formatC(length(nonstandard_rows), big.mark = ","),
" SNPs with nonstandard CHR entries."
)
}

### If specified, remove SNPs on specific chromosomes
rmv_chr_rows <- c()
if (!is.null(rmv_chr)) {
# Standardize user-specified chromosomes
# The same substitutions that were applied to the CHR column are applied to
# the requested names so both sides of the `%in%` match use one spelling.
rmv_chr <- gsub("23", "X", rmv_chr)
rmv_chr <- gsub("x", "X", rmv_chr)
rmv_chr <- gsub("y", "Y", rmv_chr)
rmv_chr <- gsub("mt", "MT", rmv_chr)
if (rmv_chrPrefix) {
rmv_chr <- gsub("chr", "", rmv_chr, ignore.case = TRUE)
} else {
return(list("sumstats_dt" = sumstats_dt, "log_files" = log_files))
rmv_chr <- gsub("CHR", "chr", rmv_chr)
}

# Check for chromosomes to be removed
rmv_chr_rows <- which(sumstats_dt$CHR %in% rmv_chr)
if (length(rmv_chr_rows) > 0L) {
# The message lists every requested chromosome, including any that matched
# no rows.
message(
formatC(length(rmv_chr_rows), big.mark = ","),
" SNPs are on chromosomes ",
paste(rmv_chr, collapse = ", "),
" and will be removed."
)
}
}

# Vector of row numbers for all removed SNPs
# unique() means a row that is both nonstandard and user-excluded is logged and
# dropped only once.
all_removed_rows <- sort(unique(c(nonstandard_rows, rmv_chr_rows)))

### Save a log of removed SNPs if the user wants it
if (log_folder_ind && (length(all_removed_rows) > 0L)) {
name <- "chr_excl"
name <- get_unique_name_log_file(name = name,
log_files = log_files)
save_path <- paste0(
check_save_out$log_folder,
"/",
name,
check_save_out$extension
)

# The removed rows are written out before they are dropped from the table, and
# the path is recorded in `log_files` under the name obtained above.
write_sumstats(sumstats_dt = sumstats_dt[all_removed_rows],
save_path = save_path,
sep = check_save_out$sep,
tabix_index = tabix_index,
nThread = nThread)
log_files[[name]] <- save_path
}

# Remove the SNPs identified above, if any
# NOTE(review): when `all_removed_rows` is empty this relies on data.table
# treating `!` on an empty index as "drop nothing" -- TODO confirm.
sumstats_dt <- sumstats_dt[!all_removed_rows]

return(list(sumstats_dt = sumstats_dt, log_files = log_files))
}
19 changes: 9 additions & 10 deletions R/format_sumstats.R
Original file line number Diff line number Diff line change
Expand Up @@ -901,17 +901,16 @@ format_sumstats <- function(path,
# update values
log_files <- sumstats_return$log_files

#### Check 24: check that no snps are on specific chromosomes ####
#### Check 24: standardize the CHR column ####
sumstats_return <- check_chr(
sumstats_dt = sumstats_return$sumstats_dt,
path = path,
rmv_chr = rmv_chr,
rmv_chrPrefix = rmv_chrPrefix,
log_folder_ind = log_folder_ind,
check_save_out = check_save_out,
tabix_index = tabix_index,
nThread = nThread,
log_files = log_files
sumstats_dt = sumstats_return$sumstats_dt,
log_files = log_files,
check_save_out = check_save_out,
rmv_chr = rmv_chr,
rmv_chrPrefix = rmv_chrPrefix,
nThread = nThread,
tabix_index = tabix_index,
log_folder_ind = log_folder_ind
)
# update values
log_files <- sumstats_return$log_files
Expand Down
48 changes: 24 additions & 24 deletions man/check_chr.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

File renamed without changes.
File renamed without changes.
Loading