-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add support for alternative chromosome styles
- Loading branch information
MykMal
committed
Jul 13, 2023
1 parent
7bc7b59
commit b8a022f
Showing
15 changed files
with
309 additions
and
265 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,84 +1,90 @@ | ||
#' Ensure all SNPs on specified chromosomes are removed | ||
#' Standardize the CHR column | ||
#' | ||
#' Maps chromosome names to the default Ensembl/NCBI naming style and removes | ||
#' SNPs with nonstandard CHR entries. Optionally, also removes SNPs on | ||
#' user-specified chromosomes. | ||
#' | ||
#' @param sumstats_dt data.table with summary statistics | ||
#' @param log_files list of locations for all log files | ||
#' @param check_save_out list of parameters for saved files | ||
#' @inheritParams format_sumstats | ||
#' @param log_files list of log file locations | ||
#' @return list containing sumstats_dt, the modified summary statistics data | ||
#' table object and the log file list | ||
#' @return list containing the updated summary statistics data.table and the | ||
#' updated log file locations list | ||
#' @keywords internal | ||
check_chr <- function(sumstats_dt, | ||
path, | ||
rmv_chr, | ||
log_folder_ind, | ||
log_files, | ||
check_save_out, | ||
tabix_index, | ||
rmv_chr, | ||
nThread, | ||
log_files, | ||
make_uppercase = TRUE, | ||
rmv_chrPrefix = TRUE) { | ||
CHR <- NULL | ||
# If CHR present and user specified chromosome to have SNPs removed | ||
col_headers <- names(sumstats_dt) | ||
if ("CHR" %in% col_headers && !is.null(rmv_chr)) { | ||
tabix_index, | ||
log_folder_ind) { | ||
CHR <- NULL | ||
|
||
### Sometimes X is labeled as 23 | ||
sumstats_dt[, CHR := gsub("23", "X", CHR)] | ||
# The CHR column needs to be a character vector for gsub substitution to work | ||
sumstats_dt[, CHR := as.character(CHR)] | ||
|
||
#### Remove chr prefix uppercase #### | ||
if (rmv_chrPrefix) { | ||
message("Removing 'chr' prefix from CHR.") | ||
sumstats_dt[, CHR := gsub("chr", "", CHR, ignore.case = TRUE)] | ||
rmv_chr <- gsub("chr", "", rmv_chr, ignore.case = TRUE) | ||
} | ||
#### Make all CHR uppercase #### | ||
if (make_uppercase) { | ||
message("Making X/Y/MT CHR uppercase.") | ||
sumstats_dt[, CHR := gsub("x|23", "X", CHR)] | ||
sumstats_dt[, CHR := gsub("y", "Y", CHR)] | ||
sumstats_dt[, CHR := gsub("mt", "MT", CHR)] | ||
} | ||
### Reformat chromosome names according to the default style (Ensembl/NCBI) | ||
# Remove the "chr" prefix | ||
sumstats_dt[, CHR := gsub("chr", "", CHR, ignore.case = TRUE)] | ||
# Remove the "ch" prefix | ||
sumstats_dt[, CHR := gsub("ch", "", CHR, ignore.case = TRUE)] | ||
# Rename "23" to "X" | ||
sumstats_dt[, CHR := gsub("23", "X", CHR)] | ||
# Rename "M" to "MT" | ||
sumstats_dt[, CHR := gsub("M", "MT", CHR, ignore.case = TRUE)] | ||
# Make all chromosome names uppercase | ||
sumstats_dt[, CHR := toupper(CHR)] | ||
|
||
# check for chromosomes to be removed | ||
### Standardise chromosomes specified | ||
rmv_chr <- toupper(rmv_chr) | ||
if (any(rmv_chr %in% unique(sumstats_dt$CHR))) { | ||
num_bad_ids <- nrow(sumstats_dt[CHR %in% rmv_chr, ]) | ||
msg <- paste0( | ||
formatC(num_bad_ids, big.mark = ","), | ||
" SNPs are on chromosomes ", | ||
paste(rmv_chr, collapse = ", "), | ||
" and will be removed" | ||
) | ||
message(msg) | ||
# If user wants log, save it to there | ||
if (log_folder_ind) { | ||
name <- "chr_excl" | ||
name <- get_unique_name_log_file( | ||
name = name, | ||
log_files = log_files | ||
) | ||
write_sumstats( | ||
sumstats_dt = sumstats_dt[CHR %in% (rmv_chr), ], | ||
save_path = | ||
paste0( | ||
check_save_out$log_folder, | ||
"/", name, | ||
check_save_out$extension | ||
), | ||
sep = check_save_out$sep, | ||
tabix_index = tabix_index, | ||
nThread = nThread | ||
) | ||
log_files[[name]] <- | ||
paste0( | ||
check_save_out$log_folder, "/", name, | ||
check_save_out$extension | ||
) | ||
} | ||
# remove rows on these chromosomes | ||
sumstats_dt <- sumstats_dt[!CHR %in% (rmv_chr), ] | ||
} | ||
return(list("sumstats_dt" = sumstats_dt, "log_files" = log_files)) | ||
} else { | ||
return(list("sumstats_dt" = sumstats_dt, "log_files" = log_files)) | ||
### Remove rows with nonstandard CHR entries | ||
standard_chrs <- c(1:22, "X", "Y", "MT") | ||
nonstandard_rows <- which(!(sumstats_dt$CHR %in% standard_chrs)) | ||
if (length(nonstandard_rows) > 0L) { | ||
message("Removing ", | ||
formatC(length(nonstandard_rows), big.mark = ","), | ||
" SNPs with nonstandard CHR entries.") | ||
} | ||
|
||
### Remove SNPs on user-specified chromosomes, if requested | ||
rmv_chr_rows <- c() | ||
if (!is.null(rmv_chr)) { | ||
# Check for chromosomes to be removed | ||
rmv_chr_rows <- which(sumstats_dt$CHR %in% rmv_chr) | ||
|
||
if (length(rmv_chr_rows) > 0L) { | ||
message( | ||
formatC(length(rmv_chr_rows), big.mark = ","), | ||
" SNPs are on chromosomes ", | ||
paste(rmv_chr, collapse = ", "), | ||
" and will be removed." | ||
) | ||
} | ||
} | ||
|
||
# Vector of row numbers for all removed SNPs | ||
all_removed_rows <- sort(unique(c(nonstandard_rows, rmv_chr_rows))) | ||
|
||
### Save a log of removed SNPs if the user wants it | ||
if (log_folder_ind && (length(all_removed_rows) > 0L)) { | ||
name <- "chr_excl" | ||
name <- get_unique_name_log_file(name = name, | ||
log_files = log_files) | ||
save_path <- paste0(check_save_out$log_folder, | ||
"/", | ||
name, | ||
check_save_out$extension) | ||
|
||
write_sumstats( | ||
sumstats_dt = sumstats_dt[all_removed_rows], | ||
save_path = save_path, | ||
sep = check_save_out$sep, | ||
tabix_index = tabix_index, | ||
nThread = nThread | ||
) | ||
log_files[[name]] <- save_path | ||
} | ||
|
||
# Remove the SNPs identified above, if any | ||
sumstats_dt <- sumstats_dt[!all_removed_rows] | ||
|
||
return(list(sumstats_dt = sumstats_dt, log_files = log_files)) | ||
} |
Oops, something went wrong.
b8a022f
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
frq= eaf or maf?
b8a022f
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In a formatted summary statistics file, the FRQ column denotes the effect allele frequency (EAF). Conventionally, this is the same as the minor allele frequency (MAF).
However, some GWAS summary statistics files report the major allele frequency instead. If this is the case for your data, you can set
frq_is_maf = FALSE
when runningformat_sumstats()
. This will rename the FRQ column to MAJOR_ALLELE_FRQ if the frequency values appear to relate to the major allele, i.e. if they are greater than 0.5. See the documentation for more information: https://neurogenomics.github.io/MungeSumstats/reference/format_sumstats.htmlDoes this answer your question?