Skip to content

Commit

Permalink
v1.2.5
Browse files Browse the repository at this point in the history
Updates
- Minor bug fixes
- 30-40% faster in phosphopeptide searches
- 10% faster on global data searches
- more hits in modified phosphopeptides
- removed dependency to package proteoCpp
- temporarily disable MGF mass calibration (to be bring back in a next version)
  • Loading branch information
qzhang503 committed May 14, 2023
1 parent 8c8248d commit c3c54a4
Show file tree
Hide file tree
Showing 55 changed files with 3,957 additions and 3,276 deletions.
10 changes: 2 additions & 8 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: mzion
Type: Package
Title: Database Searches of Proteomic Mass-spectrometrirc Data
Version: 1.2.4.1
Version: 1.2.5
Authors@R:
person(given = "Qiang",
family = "Zhang",
Expand Down Expand Up @@ -37,14 +37,8 @@ Imports:
fs,
magrittr,
parallel,
proteoCpp,
proxyC,
qs,
xml2,
Matrix,
Rcpp
Matrix
RoxygenNote: 7.2.3
Remotes:
qzhang503/proteoCpp
LinkingTo:
Rcpp
32 changes: 20 additions & 12 deletions R/mapMS2ions.R
Original file line number Diff line number Diff line change
Expand Up @@ -313,8 +313,8 @@ find_psm_rows1 <- function (file_t1, file_t2, file_t3, scan, raw_file,
psms_3 <- NULL

cols_excl <- c("pep_ms2_moverzs", "pep_ms2_ints", "pep_ms2_theos",
"pep_ms2_theos2", "pep_ms2_deltas", "pep_ms2_ideltas",
"pep_ms2_deltas2", "pep_ms2_ideltas2")
"pep_ms2_theos2", "pep_ms2_deltas", "pep_ms2_ideltas",
"pep_ms2_deltas2", "pep_ms2_ideltas2")

if (!is.null(psms_1)) psms_1 <- psms_1 %>% dplyr::select(-which(names(.) %in% cols_excl))
if (!is.null(psms_2)) psms_2 <- psms_2 %>% dplyr::select(-which(names(.) %in% cols_excl))
Expand Down Expand Up @@ -377,6 +377,7 @@ find_psm_rows2 <- function (file_t0, scan, raw_file, rank = 1L,
#' @inheritParams mapMS2ions
find_theoexpt_pair <- function (psm, out_path, scan, raw_id, is_decoy = FALSE)
{
tempdir <- file.path(out_path, "temp")
col_nms <- names(psm)

lapply(c("pep_seq", "pep_ivmod", "pep_mod_group"), function (x) {
Expand All @@ -400,22 +401,28 @@ find_theoexpt_pair <- function (psm, out_path, scan, raw_id, is_decoy = FALSE)
.list_table <- get(nm2, envir = .GlobalEnv)
}
else {
file <- file.path(out_path, "temp", paste0("ion_matches_", mod, ".rds"))
file2 <- file.path(out_path, "temp", paste0("list_table_", mod, ".rds"))
file <- file.path(tempdir, paste0("ion_matches_", mod, ".rds"))
file2 <- file.path(tempdir, paste0("list_table_", mod, ".rds"))

if (!file.exists(file))
stop("Ion matches not found: ", file, call. = FALSE)

if (!file.exists(file2))
stop("Secondary ion matches not found: ", file, call. = FALSE)

if (!file.exists(file2)) {
fs_mod <- order_fracs("list_table", tempdir)[[mod]]

if (!length(fs_mod))
stop("Secondary ion matches not found: ", file)
else
combine_fracs(fs_mod, tempdir, tempdir)
}

.ion_matches <-
qs::qread(file) %>%
dplyr::mutate(scan_num = as.character(scan_num))
dplyr::mutate(pep_scan_num = as.character(pep_scan_num))

.list_table <-
qs::qread(file2) %>%
dplyr::mutate(scan_num = as.character(scan_num))
dplyr::mutate(pep_scan_num = as.character(pep_scan_num))

if (! "pep_mod_group" %in% names(.list_table))
.list_table$pep_mod_group <- mod
Expand All @@ -425,12 +432,13 @@ find_theoexpt_pair <- function (psm, out_path, scan, raw_id, is_decoy = FALSE)
}

ion_match <- .ion_matches %>%
dplyr::filter(scan_num == scan,
dplyr::filter(pep_scan_num == scan,
raw_file == raw_id,
pep_isdecoy == is_decoy)
# pep_isdecoy == is_decoy,
)

list_table <- .list_table %>%
dplyr::filter(scan_num == scan,
dplyr::filter(pep_scan_num == scan,
raw_file == raw_id, )

nrow <- nrow(ion_match)
Expand Down
211 changes: 113 additions & 98 deletions R/mgfs.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,15 @@ load_mgfs <- function (out_path, mgf_path, min_mass = 200L, max_mass = 4500L,

args_except <- c("out_path")
args <- names(formals(fun))
args_must <- args[! args %in% args_except]
args_must <- args[!args %in% args_except]

cache_pars <- find_callarg_vals(
time = NULL,
path = file.path(out_path, "Calls"),
fun = paste0(fun, ".rda"),
args = args_must,
new_args = unlist(formals(load_mgfs)[c("enzyme")])
)

new_args = unlist(formals(load_mgfs)[c("enzyme")]))

cache_pars <- cache_pars[sort(names(cache_pars))]
call_pars <- mget(args_must, envir = fun_env, inherits = FALSE)
call_pars <- call_pars[sort(names(call_pars))]
Expand All @@ -63,100 +62,111 @@ load_mgfs <- function (out_path, mgf_path, min_mass = 200L, max_mass = 4500L,
if ((!ok_pars) && enzyme == "noenzyme")
ok_pars <- TRUE

rds <- file.path(mgf_path, "mgf_queries.rds")
# checks processed mgfs
raws_indexes <- file.path(mgf_path, "raw_indexes.rds")

if (ok_pars && file.exists(rds)) {
message("Found cached MGFs: `", rds, "`.")
.savecall <- FALSE
}
if (file.exists(raws_indexes)) {
raws <- qs::qread(raws_indexes)
ques <- list.files(mgf_path, pattern = "^mgf_queries_\\d+\\.rds$")
ok_mgfs <- if (length(raws) == length(ques)) TRUE else FALSE
rm(list = c("raws", "ques", "raws_indexes"))
}
else {
message("Processing raw MGFs.")

ppm_ms1_new <- if (is_ms1_three_frame)
as.integer(ceiling(ppm_ms1 * .5))
else
ppm_ms1
ok_mgfs <- FALSE
rm(list = c("raws_indexes"))
}

ppm_ms2_new <- if (is_ms2_three_frame)
as.integer(ceiling(ppm_ms2 * .5))
else
ppm_ms2

delete_files(
out_path,
ignores = c("\\.[Rr]$", "\\.(mgf|MGF)$", "\\.xlsx$",
"\\.xls$", "\\.csv$", "\\.txt$", "\\.tsv$",
"^mgf$", "^mgfs$", "Calls",

# in case calling from proteoQ with MSGF workflows
"fraction_scheme.rda", "label_scheme.rda",
"label_scheme_full.rda"))

fi_mgf <- list.files(path = file.path(mgf_path), pattern = "^.*\\.mgf$")
fi_mzml <- list.files(path = file.path(mgf_path), pattern = "^.*\\.mzML$")
len_mgf <- length(fi_mgf)
len_mzml <- length(fi_mzml)

if (len_mgf && len_mzml)
stop("Peak lists need to be in either MGF or mzML, but not both.")

filelist <- if (len_mgf) fi_mgf else fi_mzml
if (ok_pars && ok_mgfs) {
message("Found cached MGFs,")
.savecall <- FALSE

if (len_mgf) {
readMGF(filepath = mgf_path,
filelist = filelist,
min_mass = min_mass,
max_mass = max_mass,
min_ms2mass = min_ms2mass,
max_ms2mass = max_ms2mass,
topn_ms2ions = topn_ms2ions,
quant = quant,
ms1_charge_range = c(min_ms1_charge, max_ms1_charge),
ms1_scan_range = c(min_scan_num, max_scan_num),
ret_range = c(min_ret_time, max_ret_time),
ppm_ms1 = ppm_ms1_new,
ppm_ms2 = ppm_ms2_new,
tmt_reporter_lower = tmt_reporter_lower,
tmt_reporter_upper = tmt_reporter_upper,
exclude_reporter_region = exclude_reporter_region,
index_mgf_ms2 = index_mgf_ms2,
mgf_cutmzs = mgf_cutmzs,
mgf_cutpercs = mgf_cutpercs,
out_path = rds,
digits = digits)
}
else if (len_mzml) {
warning("Please uncheck \"Use zlib compression\" with mzML from MSConvert.",
call. = FALSE)

readmzML(filepath = mgf_path,
filelist = filelist,
min_mass = min_mass,
max_mass = max_mass,
min_ms2mass = min_ms2mass,
max_ms2mass = max_ms2mass,
topn_ms2ions = topn_ms2ions,
quant = quant,
ms1_charge_range = c(min_ms1_charge, max_ms1_charge),
ms1_scan_range = c(min_scan_num, max_scan_num),
ret_range = c(min_ret_time, max_ret_time),
ppm_ms1 = ppm_ms1_new,
ppm_ms2 = ppm_ms2_new,
tmt_reporter_lower = tmt_reporter_lower,
tmt_reporter_upper = tmt_reporter_upper,
exclude_reporter_region = exclude_reporter_region,
index_mgf_ms2 = index_mgf_ms2,
mgf_cutmzs = mgf_cutmzs,
mgf_cutpercs = mgf_cutpercs,
out_path = rds,
digits = digits)
}
else {
stop("No files of peak lists found.")
}
return(NULL)
}

message("Processing raw MGFs.")

ppm_ms1_new <- if (is_ms1_three_frame)
as.integer(ceiling(ppm_ms1 * .5))
else
ppm_ms1

ppm_ms2_new <- if (is_ms2_three_frame)
as.integer(ceiling(ppm_ms2 * .5))
else
ppm_ms2

delete_files(
out_path,
ignores = c("\\.[Rr]$", "\\.(mgf|MGF)$", "\\.xlsx$",
"\\.xls$", "\\.csv$", "\\.txt$", "\\.tsv$",
"^mgf$", "^mgfs$", "Calls",
# in case of reprocessing after proteoQ
"fraction_scheme.rda", "label_scheme.rda",
"label_scheme_full.rda"))

fi_mgf <- list.files(path = file.path(mgf_path), pattern = "^.*\\.mgf$")
fi_mzml <- list.files(path = file.path(mgf_path), pattern = "^.*\\.mzML$")
len_mgf <- length(fi_mgf)
len_mzml <- length(fi_mzml)

if (len_mgf && len_mzml)
stop("Peak lists need to be in either MGF or mzML, but not both.")

filelist <- if (len_mgf) fi_mgf else fi_mzml

if (len_mgf) {
readMGF(filepath = mgf_path,
filelist = filelist,
min_mass = min_mass,
max_mass = max_mass,
min_ms2mass = min_ms2mass,
max_ms2mass = max_ms2mass,
topn_ms2ions = topn_ms2ions,
quant = quant,
ms1_charge_range = c(min_ms1_charge, max_ms1_charge),
ms1_scan_range = c(min_scan_num, max_scan_num),
ret_range = c(min_ret_time, max_ret_time),
ppm_ms1 = ppm_ms1_new,
ppm_ms2 = ppm_ms2_new,
tmt_reporter_lower = tmt_reporter_lower,
tmt_reporter_upper = tmt_reporter_upper,
exclude_reporter_region = exclude_reporter_region,
index_mgf_ms2 = index_mgf_ms2,
mgf_cutmzs = mgf_cutmzs,
mgf_cutpercs = mgf_cutpercs,
out_path = out_path,
digits = digits)
}
else if (len_mzml) {
warning("Please uncheck \"Use zlib compression\" with mzML from MSConvert.")

.savecall <- TRUE
readmzML(filepath = mgf_path,
filelist = filelist,
min_mass = min_mass,
max_mass = max_mass,
min_ms2mass = min_ms2mass,
max_ms2mass = max_ms2mass,
topn_ms2ions = topn_ms2ions,
quant = quant,
ms1_charge_range = c(min_ms1_charge, max_ms1_charge),
ms1_scan_range = c(min_scan_num, max_scan_num),
ret_range = c(min_ret_time, max_ret_time),
ppm_ms1 = ppm_ms1_new,
ppm_ms2 = ppm_ms2_new,
tmt_reporter_lower = tmt_reporter_lower,
tmt_reporter_upper = tmt_reporter_upper,
exclude_reporter_region = exclude_reporter_region,
index_mgf_ms2 = index_mgf_ms2,
mgf_cutmzs = mgf_cutmzs,
mgf_cutpercs = mgf_cutpercs,
out_path = out_path,
digits = digits)
}
else {
stop("No files of peak lists found.")
}

.savecall <- TRUE

invisible(NULL)
}
Expand Down Expand Up @@ -316,7 +326,7 @@ readMGF <- function (filepath = NULL, filelist = NULL,
out <- dplyr::bind_rows(out)

post_readmgf(out, min_mass = min_mass, max_mass = max_mass, ppm_ms1 = ppm_ms1,
filepath = filepath, out_path = out_path)
filepath = filepath)
}


Expand All @@ -327,7 +337,7 @@ readMGF <- function (filepath = NULL, filelist = NULL,
#' @param df A data frame of processed peak lists.
#' @inheritParams readMGF
post_readmgf <- function (df, min_mass = 200L, max_mass = 4500L, ppm_ms1 = 10L,
filepath, out_path)
filepath)
{
df <- dplyr::arrange(df, ms1_mass)
# df <- dplyr::filter(df, ms1_mass >= min_mass, ms1_mass <= max_mass)
Expand All @@ -346,8 +356,15 @@ post_readmgf <- function (df, min_mass = 200L, max_mass = 4500L, ppm_ms1 = 10L,
qs::qsave(inds2, file.path(filepath, "scan_indexes.rds"), preset = "fast")
df$scan_title <- unname(inds2[scans])

qs::qsave(df, out_path, preset = "fast")
df <- split(df, df$raw_file)
nms <- names(df)

for (i in seq_along(df)) {
qs::qsave(df[[i]], file.path(filepath, paste0("mgf_queries_", nms[i], ".rds")),
preset = "fast")
}


invisible(NULL)
}

Expand All @@ -367,8 +384,6 @@ readlineMGFs <- function (i, file, filepath, raw_file)
writeLines(x, nm)
}

message("Loading '", file, "'.")

temp_i <- paste0("temp_", i)

# refresh at every `i`
Expand Down Expand Up @@ -1376,7 +1391,7 @@ readmzML <- function (filepath = NULL, filelist = NULL,

## frame number
post_readmgf(out, min_mass = min_mass, max_mass = max_mass, ppm_ms1 = ppm_ms1,
filepath = filepath, out_path = out_path)
filepath = filepath)
}


Expand Down
Loading

0 comments on commit c3c54a4

Please sign in to comment.