v1.2.5

Updates - Minor bug fixes - 30-40% faster in phosphopeptide searches - 10% faster on global data searches - more hits in modified phosphopeptides - removed dependency to package proteoCpp - temporarily disable MGF mass calibration (to be bring back in a next version)
qzhang503 · May 14, 2023 · c3c54a4 · c3c54a4
1 parent 8c8248d
commit c3c54a4
Show file tree

Hide file tree

Showing 55 changed files with 3,957 additions and 3,276 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: mzion
 Type: Package
 Title: Database Searches of Proteomic Mass-spectrometrirc Data
-Version: 1.2.4.1
+Version: 1.2.5
 Authors@R: 
     person(given = "Qiang",
            family = "Zhang",
@@ -37,14 +37,8 @@ Imports:
     fs,
     magrittr,
     parallel,
-    proteoCpp, 
     proxyC,
     qs,
     xml2, 
-    Matrix,
-    Rcpp
+    Matrix
 RoxygenNote: 7.2.3
-Remotes:
-    qzhang503/proteoCpp
-LinkingTo: 
-    Rcpp
diff --git a/R/mapMS2ions.R b/R/mapMS2ions.R
@@ -313,8 +313,8 @@ find_psm_rows1 <- function (file_t1, file_t2, file_t3, scan, raw_file,
       psms_3 <- NULL
 
     cols_excl <- c("pep_ms2_moverzs", "pep_ms2_ints", "pep_ms2_theos", 
-                  "pep_ms2_theos2", "pep_ms2_deltas", "pep_ms2_ideltas", 
-                  "pep_ms2_deltas2", "pep_ms2_ideltas2")
+                   "pep_ms2_theos2", "pep_ms2_deltas", "pep_ms2_ideltas", 
+                   "pep_ms2_deltas2", "pep_ms2_ideltas2")
 
     if (!is.null(psms_1)) psms_1 <- psms_1 %>% dplyr::select(-which(names(.) %in% cols_excl))
     if (!is.null(psms_2)) psms_2 <- psms_2 %>% dplyr::select(-which(names(.) %in% cols_excl))
@@ -377,6 +377,7 @@ find_psm_rows2 <- function (file_t0, scan, raw_file, rank = 1L,
 #' @inheritParams mapMS2ions
 find_theoexpt_pair <- function (psm, out_path, scan, raw_id, is_decoy = FALSE) 
 {
+  tempdir <- file.path(out_path, "temp")
   col_nms <- names(psm)
 
   lapply(c("pep_seq", "pep_ivmod", "pep_mod_group"), function (x) {
@@ -400,22 +401,28 @@ find_theoexpt_pair <- function (psm, out_path, scan, raw_id, is_decoy = FALSE)
     .list_table <- get(nm2, envir = .GlobalEnv)
   }
   else {
-    file <- file.path(out_path, "temp", paste0("ion_matches_", mod, ".rds"))
-    file2 <- file.path(out_path, "temp", paste0("list_table_", mod, ".rds"))
+    file  <- file.path(tempdir, paste0("ion_matches_", mod, ".rds"))
+    file2 <- file.path(tempdir, paste0("list_table_",  mod, ".rds"))
 
     if (!file.exists(file))
       stop("Ion matches not found: ", file, call. = FALSE)
 
-    if (!file.exists(file2))
-      stop("Secondary ion matches not found: ", file, call. = FALSE)
-
+    if (!file.exists(file2)) {
+      fs_mod <- order_fracs("list_table", tempdir)[[mod]]
+
+      if (!length(fs_mod))
+        stop("Secondary ion matches not found: ", file)
+      else
+        combine_fracs(fs_mod, tempdir, tempdir)
+    }
+
     .ion_matches <- 
       qs::qread(file) %>% 
-      dplyr::mutate(scan_num = as.character(scan_num))
+      dplyr::mutate(pep_scan_num = as.character(pep_scan_num))
 
     .list_table <- 
       qs::qread(file2) %>%
-      dplyr::mutate(scan_num = as.character(scan_num))
+      dplyr::mutate(pep_scan_num = as.character(pep_scan_num))
 
     if (! "pep_mod_group" %in% names(.list_table))
       .list_table$pep_mod_group <- mod
@@ -425,12 +432,13 @@ find_theoexpt_pair <- function (psm, out_path, scan, raw_id, is_decoy = FALSE)
   }
 
   ion_match <- .ion_matches %>% 
-    dplyr::filter(scan_num == scan, 
+    dplyr::filter(pep_scan_num == scan, 
                   raw_file == raw_id, 
-                  pep_isdecoy == is_decoy)
+                  # pep_isdecoy == is_decoy, 
+                  )
 
   list_table <- .list_table %>% 
-    dplyr::filter(scan_num == scan, 
+    dplyr::filter(pep_scan_num == scan, 
                   raw_file == raw_id, )
 
   nrow <- nrow(ion_match)

diff --git a/R/mgfs.R b/R/mgfs.R
@@ -44,16 +44,15 @@ load_mgfs <- function (out_path, mgf_path, min_mass = 200L, max_mass = 4500L,
 
   args_except <- c("out_path")
   args <- names(formals(fun))
-  args_must <- args[! args %in% args_except]
+  args_must <- args[!args %in% args_except]
 
   cache_pars <- find_callarg_vals(
     time = NULL, 
     path = file.path(out_path, "Calls"), 
     fun = paste0(fun, ".rda"), 
     args = args_must, 
-    new_args = unlist(formals(load_mgfs)[c("enzyme")])
-  ) 
-
+    new_args = unlist(formals(load_mgfs)[c("enzyme")]))
+
   cache_pars <- cache_pars[sort(names(cache_pars))]
   call_pars <- mget(args_must, envir = fun_env, inherits = FALSE)
   call_pars <- call_pars[sort(names(call_pars))]
@@ -63,100 +62,111 @@ load_mgfs <- function (out_path, mgf_path, min_mass = 200L, max_mass = 4500L,
   if ((!ok_pars) && enzyme == "noenzyme") 
     ok_pars <- TRUE
 
-  rds <- file.path(mgf_path, "mgf_queries.rds")
+  # checks processed mgfs
+  raws_indexes <- file.path(mgf_path, "raw_indexes.rds")
 
-  if (ok_pars && file.exists(rds)) {
-    message("Found cached MGFs: `", rds, "`.")
-    .savecall <- FALSE
-  } 
+  if (file.exists(raws_indexes)) {
+    raws <- qs::qread(raws_indexes)
+    ques <- list.files(mgf_path, pattern = "^mgf_queries_\\d+\\.rds$")
+    ok_mgfs <- if (length(raws) == length(ques)) TRUE else FALSE
+    rm(list = c("raws", "ques", "raws_indexes"))
+  }
   else {
-    message("Processing raw MGFs.")
-
-    ppm_ms1_new <- if (is_ms1_three_frame) 
-      as.integer(ceiling(ppm_ms1 * .5))
-    else 
-      ppm_ms1
+    ok_mgfs <- FALSE
+    rm(list = c("raws_indexes"))
+  }
 
-    ppm_ms2_new <- if (is_ms2_three_frame) 
-      as.integer(ceiling(ppm_ms2 * .5))
-    else 
-      ppm_ms2
-
-    delete_files(
-      out_path, 
-      ignores = c("\\.[Rr]$", "\\.(mgf|MGF)$", "\\.xlsx$", 
-                  "\\.xls$", "\\.csv$", "\\.txt$", "\\.tsv$", 
-                  "^mgf$", "^mgfs$", "Calls", 
-
-                  # in case calling from proteoQ with MSGF workflows
-                  "fraction_scheme.rda", "label_scheme.rda", 
-                  "label_scheme_full.rda"))
-
-    fi_mgf <- list.files(path = file.path(mgf_path), pattern = "^.*\\.mgf$")
-    fi_mzml <- list.files(path = file.path(mgf_path), pattern = "^.*\\.mzML$")
-    len_mgf <- length(fi_mgf)
-    len_mzml <- length(fi_mzml)
-
-    if (len_mgf && len_mzml)
-      stop("Peak lists need to be in either MGF or mzML, but not both.")
-
-    filelist <- if (len_mgf) fi_mgf else fi_mzml
+  if (ok_pars && ok_mgfs) {
+    message("Found cached MGFs,")
+    .savecall <- FALSE
 
-    if (len_mgf) {
-      readMGF(filepath = mgf_path,
-              filelist = filelist, 
-              min_mass = min_mass,
-              max_mass = max_mass, 
-              min_ms2mass = min_ms2mass,
-              max_ms2mass = max_ms2mass, 
-              topn_ms2ions = topn_ms2ions,
-              quant = quant, 
-              ms1_charge_range = c(min_ms1_charge, max_ms1_charge), 
-              ms1_scan_range = c(min_scan_num, max_scan_num), 
-              ret_range = c(min_ret_time, max_ret_time),
-              ppm_ms1 = ppm_ms1_new,
-              ppm_ms2 = ppm_ms2_new,
-              tmt_reporter_lower = tmt_reporter_lower, 
-              tmt_reporter_upper = tmt_reporter_upper, 
-              exclude_reporter_region = exclude_reporter_region, 
-              index_mgf_ms2 = index_mgf_ms2, 
-              mgf_cutmzs = mgf_cutmzs, 
-              mgf_cutpercs = mgf_cutpercs, 
-              out_path = rds, 
-              digits = digits)
-    }
-    else if (len_mzml) {
-      warning("Please uncheck \"Use zlib compression\" with mzML from MSConvert.", 
-              call. = FALSE)
-
-      readmzML(filepath = mgf_path,
-               filelist = filelist, 
-               min_mass = min_mass,
-               max_mass = max_mass, 
-               min_ms2mass = min_ms2mass,
-               max_ms2mass = max_ms2mass, 
-               topn_ms2ions = topn_ms2ions,
-               quant = quant, 
-               ms1_charge_range = c(min_ms1_charge, max_ms1_charge), 
-               ms1_scan_range = c(min_scan_num, max_scan_num), 
-               ret_range = c(min_ret_time, max_ret_time),
-               ppm_ms1 = ppm_ms1_new,
-               ppm_ms2 = ppm_ms2_new,
-               tmt_reporter_lower = tmt_reporter_lower, 
-               tmt_reporter_upper = tmt_reporter_upper, 
-               exclude_reporter_region = exclude_reporter_region, 
-               index_mgf_ms2 = index_mgf_ms2, 
-               mgf_cutmzs = mgf_cutmzs, 
-               mgf_cutpercs = mgf_cutpercs, 
-               out_path = rds, 
-               digits = digits)
-    }
-    else {
-      stop("No files of peak lists found.")
-    }
+    return(NULL)
+  }
+
+  message("Processing raw MGFs.")
+
+  ppm_ms1_new <- if (is_ms1_three_frame) 
+    as.integer(ceiling(ppm_ms1 * .5))
+  else 
+    ppm_ms1
+
+  ppm_ms2_new <- if (is_ms2_three_frame) 
+    as.integer(ceiling(ppm_ms2 * .5))
+  else 
+    ppm_ms2
+
+  delete_files(
+    out_path, 
+    ignores = c("\\.[Rr]$", "\\.(mgf|MGF)$", "\\.xlsx$", 
+                "\\.xls$", "\\.csv$", "\\.txt$", "\\.tsv$", 
+                "^mgf$", "^mgfs$", "Calls", 
+                # in case of reprocessing after proteoQ
+                "fraction_scheme.rda", "label_scheme.rda", 
+                "label_scheme_full.rda"))
+
+  fi_mgf   <- list.files(path = file.path(mgf_path), pattern = "^.*\\.mgf$")
+  fi_mzml  <- list.files(path = file.path(mgf_path), pattern = "^.*\\.mzML$")
+  len_mgf  <- length(fi_mgf)
+  len_mzml <- length(fi_mzml)
+
+  if (len_mgf && len_mzml)
+    stop("Peak lists need to be in either MGF or mzML, but not both.")
+
+  filelist <- if (len_mgf) fi_mgf else fi_mzml
+
+  if (len_mgf) {
+    readMGF(filepath = mgf_path,
+            filelist = filelist, 
+            min_mass = min_mass,
+            max_mass = max_mass, 
+            min_ms2mass = min_ms2mass,
+            max_ms2mass = max_ms2mass, 
+            topn_ms2ions = topn_ms2ions,
+            quant = quant, 
+            ms1_charge_range = c(min_ms1_charge, max_ms1_charge), 
+            ms1_scan_range = c(min_scan_num, max_scan_num), 
+            ret_range = c(min_ret_time, max_ret_time),
+            ppm_ms1 = ppm_ms1_new,
+            ppm_ms2 = ppm_ms2_new,
+            tmt_reporter_lower = tmt_reporter_lower, 
+            tmt_reporter_upper = tmt_reporter_upper, 
+            exclude_reporter_region = exclude_reporter_region, 
+            index_mgf_ms2 = index_mgf_ms2, 
+            mgf_cutmzs = mgf_cutmzs, 
+            mgf_cutpercs = mgf_cutpercs, 
+            out_path = out_path, 
+            digits = digits)
+  }
+  else if (len_mzml) {
+    warning("Please uncheck \"Use zlib compression\" with mzML from MSConvert.")
 
-    .savecall <- TRUE
+    readmzML(filepath = mgf_path,
+             filelist = filelist, 
+             min_mass = min_mass,
+             max_mass = max_mass, 
+             min_ms2mass = min_ms2mass,
+             max_ms2mass = max_ms2mass, 
+             topn_ms2ions = topn_ms2ions,
+             quant = quant, 
+             ms1_charge_range = c(min_ms1_charge, max_ms1_charge), 
+             ms1_scan_range = c(min_scan_num, max_scan_num), 
+             ret_range = c(min_ret_time, max_ret_time),
+             ppm_ms1 = ppm_ms1_new,
+             ppm_ms2 = ppm_ms2_new,
+             tmt_reporter_lower = tmt_reporter_lower, 
+             tmt_reporter_upper = tmt_reporter_upper, 
+             exclude_reporter_region = exclude_reporter_region, 
+             index_mgf_ms2 = index_mgf_ms2, 
+             mgf_cutmzs = mgf_cutmzs, 
+             mgf_cutpercs = mgf_cutpercs, 
+             out_path = out_path, 
+             digits = digits)
   }
+  else {
+    stop("No files of peak lists found.")
+  }
+
+  .savecall <- TRUE
 
   invisible(NULL)
 }
@@ -316,7 +326,7 @@ readMGF <- function (filepath = NULL, filelist = NULL,
   out <- dplyr::bind_rows(out)
 
   post_readmgf(out, min_mass = min_mass, max_mass = max_mass, ppm_ms1 = ppm_ms1, 
-               filepath = filepath, out_path = out_path)
+               filepath = filepath)
 }
 
 
@@ -327,7 +337,7 @@ readMGF <- function (filepath = NULL, filelist = NULL,
 #' @param df A data frame of processed peak lists.
 #' @inheritParams readMGF
 post_readmgf <- function (df, min_mass = 200L, max_mass = 4500L, ppm_ms1 = 10L, 
-                          filepath, out_path) 
+                          filepath) 
 {
   df <- dplyr::arrange(df, ms1_mass)
   # df <- dplyr::filter(df, ms1_mass >= min_mass, ms1_mass <= max_mass)
@@ -346,8 +356,15 @@ post_readmgf <- function (df, min_mass = 200L, max_mass = 4500L, ppm_ms1 = 10L,
   qs::qsave(inds2, file.path(filepath, "scan_indexes.rds"), preset = "fast")
   df$scan_title <- unname(inds2[scans])
 
-  qs::qsave(df, out_path, preset = "fast")
+  df  <- split(df, df$raw_file)
+  nms <- names(df)
 
+  for (i in seq_along(df)) {
+    qs::qsave(df[[i]], file.path(filepath, paste0("mgf_queries_", nms[i], ".rds")), 
+              preset = "fast")
+  }
+
+
   invisible(NULL)
 }
 
@@ -367,8 +384,6 @@ readlineMGFs <- function (i, file, filepath, raw_file)
     writeLines(x, nm)
   }
 
-  message("Loading '", file, "'.")
-
   temp_i <- paste0("temp_", i)
 
   # refresh at every `i`
@@ -1376,7 +1391,7 @@ readmzML <- function (filepath = NULL, filelist = NULL,
 
   ## frame number
   post_readmgf(out, min_mass = min_mass, max_mass = max_mass, ppm_ms1 = ppm_ms1, 
-               filepath = filepath, out_path = out_path)
+               filepath = filepath)
 }