Skip to content

Commit

Permalink
Add support for alternative chromosome styles
Browse files Browse the repository at this point in the history
  • Loading branch information
MykMal committed Jul 13, 2023
1 parent 7bc7b59 commit b8a022f
Show file tree
Hide file tree
Showing 15 changed files with 309 additions and 265 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: MungeSumstats
Type: Package
Title: Standardise summary statistics from GWAS
Version: 1.9.10
Version: 1.9.11
Authors@R:
c(person(given = "Alan",
family = "Murphy",
Expand Down
10 changes: 10 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
## CHANGES IN VERSION 1.9.11

### New features
* The `rmv_chrPrefix` parameter in `format_sumstats()` has been replaced with
the new `chr_style` parameter, which allows users to specify their desired
chromosome name style. The supported chromosome styles are "NCBI", "UCSC", "dbSNP",
and "Ensembl" with "Ensembl" being the default.
* `check_chr()` now automatically removes all SNPs with nonstandard CHR entries
(anything other than 1-22, X, Y, and MT in the Ensembl naming style).

## CHANGES IN VERSION 1.9.10

### Bug fix
Expand Down
152 changes: 79 additions & 73 deletions R/check_chr.R
Original file line number Diff line number Diff line change
@@ -1,84 +1,90 @@
#' Ensure all SNPs on specified chromosomes are removed
#' Standardize the CHR column
#'
#' Maps chromosome names to the default Ensembl/NCBI naming style and removes
#' SNPs with nonstandard CHR entries. Optionally, also removes SNPs on
#' user-specified chromosomes.
#'
#' @param sumstats_dt data.table with summary statistics
#' @param log_files list of locations for all log files
#' @param check_save_out list of parameters for saved files
#' @inheritParams format_sumstats
#' @param log_files list of log file locations
#' @return list containing sumstats_dt, the modified summary statistics data
#' table object and the log file list
#' @return list containing the updated summary statistics data.table and the
#' updated log file locations list
#' @keywords internal
check_chr <- function(sumstats_dt,
path,
rmv_chr,
log_folder_ind,
log_files,
check_save_out,
tabix_index,
rmv_chr,
nThread,
log_files,
make_uppercase = TRUE,
rmv_chrPrefix = TRUE) {
CHR <- NULL
# If CHR present and user specified chromosome to have SNPs removed
col_headers <- names(sumstats_dt)
if ("CHR" %in% col_headers && !is.null(rmv_chr)) {
tabix_index,
log_folder_ind) {
CHR <- NULL

### Sometimes X is labeled as 23
sumstats_dt[, CHR := gsub("23", "X", CHR)]
# The CHR column needs to be a character vector for gsub substitution to work
sumstats_dt[, CHR := as.character(CHR)]

#### Remove chr prefix uppercase ####
if (rmv_chrPrefix) {
message("Removing 'chr' prefix from CHR.")
sumstats_dt[, CHR := gsub("chr", "", CHR, ignore.case = TRUE)]
rmv_chr <- gsub("chr", "", rmv_chr, ignore.case = TRUE)
}
#### Make all CHR uppercase ####
if (make_uppercase) {
message("Making X/Y/MT CHR uppercase.")
sumstats_dt[, CHR := gsub("x|23", "X", CHR)]
sumstats_dt[, CHR := gsub("y", "Y", CHR)]
sumstats_dt[, CHR := gsub("mt", "MT", CHR)]
}
### Reformat chromosome names according to the default style (Ensembl/NCBI)
# Remove the "chr" prefix
sumstats_dt[, CHR := gsub("chr", "", CHR, ignore.case = TRUE)]
# Remove the "ch" prefix
sumstats_dt[, CHR := gsub("ch", "", CHR, ignore.case = TRUE)]
# Rename "23" to "X"
sumstats_dt[, CHR := gsub("23", "X", CHR)]
# Rename "M" to "MT"
sumstats_dt[, CHR := gsub("M", "MT", CHR, ignore.case = TRUE)]
# Make all chromosome names uppercase
sumstats_dt[, CHR := toupper(CHR)]

# check for chromosomes to be removed
### Standardise chromosomes specified
rmv_chr <- toupper(rmv_chr)
if (any(rmv_chr %in% unique(sumstats_dt$CHR))) {
num_bad_ids <- nrow(sumstats_dt[CHR %in% rmv_chr, ])
msg <- paste0(
formatC(num_bad_ids, big.mark = ","),
" SNPs are on chromosomes ",
paste(rmv_chr, collapse = ", "),
" and will be removed"
)
message(msg)
# If user wants log, save it to there
if (log_folder_ind) {
name <- "chr_excl"
name <- get_unique_name_log_file(
name = name,
log_files = log_files
)
write_sumstats(
sumstats_dt = sumstats_dt[CHR %in% (rmv_chr), ],
save_path =
paste0(
check_save_out$log_folder,
"/", name,
check_save_out$extension
),
sep = check_save_out$sep,
tabix_index = tabix_index,
nThread = nThread
)
log_files[[name]] <-
paste0(
check_save_out$log_folder, "/", name,
check_save_out$extension
)
}
# remove rows on these chromosomes
sumstats_dt <- sumstats_dt[!CHR %in% (rmv_chr), ]
}
return(list("sumstats_dt" = sumstats_dt, "log_files" = log_files))
} else {
return(list("sumstats_dt" = sumstats_dt, "log_files" = log_files))
### Remove rows with nonstandard CHR entries
standard_chrs <- c(1:22, "X", "Y", "MT")
nonstandard_rows <- which(!(sumstats_dt$CHR %in% standard_chrs))
if (length(nonstandard_rows) > 0L) {
message("Removing ",
formatC(length(nonstandard_rows), big.mark = ","),
" SNPs with nonstandard CHR entries.")
}

### Remove SNPs on user-specified chromosomes, if requested
rmv_chr_rows <- c()
if (!is.null(rmv_chr)) {
# Check for chromosomes to be removed
rmv_chr_rows <- which(sumstats_dt$CHR %in% rmv_chr)

if (length(rmv_chr_rows) > 0L) {
message(
formatC(length(rmv_chr_rows), big.mark = ","),
" SNPs are on chromosomes ",
paste(rmv_chr, collapse = ", "),
" and will be removed."
)
}
}

# Vector of row numbers for all removed SNPs
all_removed_rows <- sort(unique(c(nonstandard_rows, rmv_chr_rows)))

### Save a log of removed SNPs if the user wants it
if (log_folder_ind && (length(all_removed_rows) > 0L)) {
name <- "chr_excl"
name <- get_unique_name_log_file(name = name,
log_files = log_files)
save_path <- paste0(check_save_out$log_folder,
"/",
name,
check_save_out$extension)

write_sumstats(
sumstats_dt = sumstats_dt[all_removed_rows],
save_path = save_path,
sep = check_save_out$sep,
tabix_index = tabix_index,
nThread = nThread
)
log_files[[name]] <- save_path
}

# Remove the SNPs identified above, if any
sumstats_dt <- sumstats_dt[!all_removed_rows]

return(list(sumstats_dt = sumstats_dt, log_files = log_files))
}
Loading

2 comments on commit b8a022f

@u119-1
Copy link

@u119-1 u119-1 commented on b8a022f Oct 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

frq= eaf or maf?

@mykmal
Copy link
Contributor

@mykmal mykmal commented on b8a022f Oct 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

frq= eaf or maf?

In a formatted summary statistics file, the FRQ column denotes the effect allele frequency (EAF). Conventionally, this is the same as the minor allele frequency (MAF).

However, some GWAS summary statistics files report the major allele frequency instead. If this is the case for your data, you can set frq_is_maf = FALSE when running format_sumstats(). This will rename the FRQ column to MAJOR_ALLELE_FRQ if the frequency values appear to relate to the major allele, i.e. if they are greater than 0.5. See the documentation for more information: https://neurogenomics.github.io/MungeSumstats/reference/format_sumstats.html

Does this answer your question?

Please sign in to comment.