From 597bde1f1806ff9bc4542249754202afc0879c8f Mon Sep 17 00:00:00 2001 From: qzhang503 Date: Tue, 14 Nov 2023 16:48:44 -0600 Subject: [PATCH] v1.3.4 --- DESCRIPTION | 2 +- R/bin_masses.R | 2 + R/mapMS2ions.R | 115 +++++++------- R/mgfs.R | 316 ++++++++++++++++++-------------------- R/ms2frames.R | 50 +++--- R/msmsmatches.R | 92 +++++++---- R/msmsmatches2.R | 48 +++--- R/mztab.R | 59 ++++--- R/scores.R | 58 +++---- R/zzz.R | 11 +- README.md | 4 +- man/add_primatches.Rd | 17 +- man/calc_pepprobs_i.Rd | 15 -- man/calc_pepscores.Rd | 15 -- man/calc_probi.Rd | 15 -- man/calc_probi_bypep.Rd | 15 -- man/calc_probi_byvmods.Rd | 15 -- man/calcpepsc.Rd | 15 -- man/calib_mgf.Rd | 21 +-- man/find_ms2_bypep.Rd | 17 +- man/frames_adv.Rd | 15 -- man/hadd_primatches.Rd | 17 +- man/hms2match.Rd | 15 -- man/hms2match_one.Rd | 15 -- man/load_mgfs.Rd | 18 +-- man/make_mztab.Rd | 13 +- man/map_raw_n_scan_old.Rd | 24 +++ man/matchMS.Rd | 15 -- man/match_ex2th2.Rd | 18 +-- man/mframes_adv.Rd | 17 +- man/ms2match.Rd | 15 -- man/ms2match_all.Rd | 15 -- man/ms2match_one.Rd | 15 -- man/post_readmgf.Rd | 13 +- man/proc_dda.Rd | 17 +- man/proc_mgf_chunks.Rd | 16 -- man/proc_mgfs.Rd | 16 -- man/proc_mzml.Rd | 22 +-- man/readMGF.Rd | 22 +-- man/read_mgf_chunks.Rd | 24 +-- man/read_mzml.Rd | 16 -- man/readmzML.Rd | 16 -- man/scalc_pepprobs.Rd | 15 -- man/search_mgf.Rd | 15 -- vignettes/README.Rmd | 2 - 45 files changed, 456 insertions(+), 852 deletions(-) create mode 100644 man/map_raw_n_scan_old.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 86410f8..a3c94ab 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: mzion Type: Package Title: Proteomics Database Searches of Mass-spectrometrirc Data. 
-Version: 1.3.3.3 +Version: 1.3.4 Authors@R: person(given = "Qiang", family = "Zhang", diff --git a/R/bin_masses.R b/R/bin_masses.R index 952aab5..5f96c58 100644 --- a/R/bin_masses.R +++ b/R/bin_masses.R @@ -61,6 +61,8 @@ bin_ms1masses <- function (res = NULL, min_mass = 200L, max_mass = 4500L, if (len_bts && use_ms1_cache) { .path_bin <- file.path(.path_ms1masses, .time_stamp, fun, .time_bin) + # in the next version... + # .path_bin <- fs::fs_path(.path_bin) bins <- list.files(path = .path_bin, pattern = "binned_theopeps_\\d+\\.rds$") if (length(bins) == len_m) { diff --git a/R/mapMS2ions.R b/R/mapMS2ions.R index a9bc0f9..788e42e 100644 --- a/R/mapMS2ions.R +++ b/R/mapMS2ions.R @@ -76,29 +76,33 @@ mapMS2ions <- function (out_path = NULL, in_name = "psmQ.txt", type_ms2ions = "by", width = 12.5, height = 6) { if (is.null(out_path) || is.na(out_path) || out_path == "") { - warning("\"out_path\" cannot be empty.", call. = FALSE) + warning("\"out_path\" cannot be empty.") return(NULL) } if (is.null(raw_file) || is.na(raw_file) || raw_file == "" ) { - warning("\"raw_file\" cannot be empty.", call. = FALSE) + warning("\"raw_file\" cannot be empty.") return(NULL) } if (is.null(in_name) || is.na(in_name) || in_name == "" ) { - warning("\"in_name\" is empty; assume `psmQ.txt`.", call. = FALSE) + warning("\"in_name\" is empty; assume `psmQ.txt`.") in_name <- "psmQ.txt" } if (is.null(out_name) || out_name == "") { - warning("\"out_name\" is empty; use `bar.png`.", call. 
= FALSE) + warning("\"out_name\" is empty; use `bar.png`.") out_name <- "bar.png" } out_name <- check_ggname(out_name) # MGF - fi_psm <- file.path(out_path, in_name) + if (!file.exists(fi_psm <- file.path(out_path, in_name))) { + warning("PSM file not found: ", fi_psm) + return(NULL) + } + mgf_path <- match_mgf_path(out_path) raw_id <- match_raw_id(raw_file, mgf_path) scan <- as.character(scan) @@ -110,8 +114,16 @@ mapMS2ions <- function (out_path = NULL, in_name = "psmQ.txt", return(NULL) } - mgf <- data.frame(ms2_moverz = mgf_ok$ms2_moverz[[1]], - ms2_int = mgf_ok$ms2_int[[1]]) + req_cols <- c("ms2_moverzs", "ms2_ints") + + if (!all(oks <- req_cols %in% names(mgf_ok))) { + warning("Developer: missing PSM columns ", + paste(req_cols[!oks], collapse = ", ")) + return(NULL) + } + + mgf <- data.frame(ms2_moverz = mgf_ok$ms2_moverzs[[1]], + ms2_int = mgf_ok$ms2_ints[[1]]) mgf$iex <- seq_len(nrow(mgf)) ## PSMs @@ -119,41 +131,42 @@ mapMS2ions <- function (out_path = NULL, in_name = "psmQ.txt", "pep_ms2_theos2", "pep_ms2_exptints", "pep_ms2_exptints2", "pep_n_matches", "pep_n_matches2") - if (file.exists(fi_psm)) { - gl_vals <- ls(all.names = TRUE, envir = .GlobalEnv) - ok_psms <- any(gl_vals == ".psms") - - ok_file <- if (any(gl_vals == ".psm_file")) - identical(get(".psm_file", envir = .GlobalEnv), fi_psm) - else - FALSE - - if (ok_psms && ok_file) { - .psms <- get(".psms", envir = .GlobalEnv) - } - else { - # some columns in psmQ.txt not in psmC.txt - .psms <- suppressWarnings( - readr::read_tsv(fi_psm, show_col_types = FALSE, - col_types = get_mzion_coltypes())) - .psms <- .psms[, -which(names(.psms) %in% cols_excl), drop = FALSE] - assign(".psms", .psms, envir = .GlobalEnv) - assign(".psm_file", file.path(out_path, in_name), envir = .GlobalEnv) - } - - psm <- .psms |> - dplyr::filter(pep_scan_num == scan, - .data$raw_file == .env$raw_file, - pep_rank == rank, - pep_isdecoy == is_decoy) - psm <- psm[, -grep("^prot_", names(psm)), drop = FALSE] - # can be duplicated 
by prot_accs - psm <- unique(psm) + gl_vals <- ls(all.names = TRUE, envir = .GlobalEnv) + ok_psms <- any(gl_vals == ".psms") + + ok_file <- if (any(gl_vals == ".psm_file")) + identical(get(".psm_file", envir = .GlobalEnv), fi_psm) + else + FALSE + + if (ok_psms && ok_file) { + .psms <- get(".psms", envir = .GlobalEnv) } else { - warning("PSM file not found: ", fi_psm) + # some columns in psmQ.txt not in psmC.txt + .psms <- suppressWarnings( + readr::read_tsv(fi_psm, show_col_types = FALSE, + col_types = get_mzion_coltypes())) + .psms <- .psms[, -which(names(.psms) %in% cols_excl), drop = FALSE] + assign(".psms", .psms, envir = .GlobalEnv) + assign(".psm_file", file.path(out_path, in_name), envir = .GlobalEnv) + } + + req_psmcols <- c("pep_scan_num", "raw_file", "pep_rank", "pep_isdecoy") + + if (!all(oks <- req_psmcols %in% names(.psms))) { + warning("Developer: missing PSM columns ", + paste(req_psmcols[!oks], collapse = ", ")) return(NULL) } + + psm <- .psms |> + dplyr::filter(pep_scan_num == scan, + .data$raw_file == .env$raw_file, + pep_rank == rank, + pep_isdecoy == is_decoy) + psm <- psm[, -grep("^prot_", names(psm)), drop = FALSE] + psm <- unique(psm) # can be duplicated by prot_accs if (!(nrow <- nrow(psm))) { warning("PSM entry not found. Check the correctness of scan number etc.") @@ -174,14 +187,13 @@ mapMS2ions <- function (out_path = NULL, in_name = "psmQ.txt", ion_types <- unlist(strsplit(type_ms2ions, "")) if (length(ion_types) != 2L) - stop("Not a two-character `type_ms2ions = ", type_ms2ions, "`.", - call. 
= FALSE) + stop("Not a two-character `type_ms2ions = ", type_ms2ions, "`.") cols_pri <- c("pep_ms2_deltas", "pep_ms2_ideltas", "pep_ms2_iexs") if (!all(oks <- cols_pri %in% names(psm))) { - warning("PSM columns not found: ", paste(cols_pri[!oks], collapse = ", "), - "\nPlease use the latest version of mzion.") + warning("Developer: missing PSM columns ", + paste(cols_pri[!oks], collapse = ", ")) return(NULL) } @@ -211,8 +223,8 @@ mapMS2ions <- function (out_path = NULL, in_name = "psmQ.txt", cols_sec <- c("pep_ms2_deltas2", "pep_ms2_ideltas2", "pep_ms2_iexs2") if (!all(oks <- cols_sec %in% names(psm))) { - warning("PSM columns not found: ", paste(cols_sec[!oks], collapse = ", "), - "\nPlease use the latest version of mzion.") + warning("Developer: missing PSM columns ", + paste(cols_sec[!oks], collapse = ", ")) return(NULL) } @@ -327,7 +339,7 @@ match_mgf_path <- function (out_path) rda <- file.path(out_path, "Calls", "matchMS.rda") if (!file.exists(rda)) - stop("Parameter file not found: ", rda, call. = FALSE) + stop("Parameter file not found: ", rda) load(rda) @@ -346,14 +358,13 @@ match_raw_id <- function (raw_file, mgf_path) if (!file.exists(file)) stop("File not found ", file) - raw_lookup <- qs::qread(file) - raw_id <- unname(raw_lookup[raw_file]) + raw_map <- qs::qread(file) + raw_id <- unname(raw_map[raw_file]) if (is.na(raw_id)) { stop(raw_file, " not found in ", file, ".\n", "Aside from the possibility of incorrect `raw_file`, ", - "have the folder name been changed?", - call. = FALSE) + "have the folder name been changed?") } raw_id @@ -416,10 +427,10 @@ find_mgf_query <- function (mgf_path, raw_id, scan, to_global = TRUE) } else { files <- list.files(path = file.path(mgf_path), - pattern = "mgf_queries[_]*[0-9]*\\.rds$") + pattern = "mgf_queries_.*\\.rds$") if (!length(files)) { - warning("No parsed `mgf_queries.rds` under ", mgf_path, call. 
= FALSE) + warning("No parsed `mgf_queries.rds` under ", mgf_path) return(NULL) } @@ -521,7 +532,7 @@ make_speclib <- function (out_path = NULL, in_name = "psmQ.txt", score_co = 15, if (all(is.na(df$pep_ms2_moverzs)) || all(is.na(df$pep_ms2_ints))) { mgf_path <- match_mgf_path(out_path) - mgf_files <- list.files(file.path(mgf_path), "mgf_queries[_]*[0-9]*\\.rds$") + mgf_files <- list.files(file.path(mgf_path), "mgf_queries_.*\\.rds$") if (!length(mgf_files)) { warning("Processed `mgf_queries.rds` not found under ", mgf_path) diff --git a/R/mgfs.R b/R/mgfs.R index 9f3171e..565833f 100644 --- a/R/mgfs.R +++ b/R/mgfs.R @@ -19,14 +19,14 @@ load_mgfs <- function (out_path, mgf_path, min_mass = 200L, max_mass = 4500L, min_ret_time = 0, max_ret_time = Inf, ppm_ms1 = 20L, ppm_ms2 = 20L, tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, - exclude_reporter_region = FALSE, index_mgf_ms2 = FALSE, + exclude_reporter_region = FALSE, is_ms1_three_frame = TRUE, is_ms2_three_frame = TRUE, mgf_cutmzs = numeric(), mgf_cutpercs = numeric(), enzyme = "trypsin_p", - is_mdda = FALSE, deisotope_ms2 = TRUE, + deisotope_ms2 = TRUE, grad_isotope = 2.5, fct_iso2 = 3.0, max_ms2_charge = 3L, use_defpeaks = FALSE, - maxn_dia_precurs = 300L, maxn_mdda_precurs = 5L, + maxn_dia_precurs = 300L, maxn_mdda_precurs = 1L, n_mdda_flanks = 6L, ppm_ms1_deisotope = 10L, ppm_ms2_deisotope = 10L, quant = "none", digits = 4L) { @@ -67,20 +67,29 @@ load_mgfs <- function (out_path, mgf_path, min_mass = 200L, max_mass = 4500L, if ((!ok_pars) && isTRUE(enzyme == "noenzyme")) ok_pars <- TRUE - # checks processed mgfs - raws_indexes <- file.path(mgf_path, "raw_indexes.rds") + scns <- list.files(mgf_path, pattern = "^scan_map_.*\\.rds$") + ques <- list.files(mgf_path, pattern = "^mgf_queries_.*\\.rds$") + n_scns <- length(scns) + n_ques <- length(ques) - if (file.exists(raws_indexes)) { - raws <- qs::qread(raws_indexes) - ques <- list.files(mgf_path, pattern = "^mgf_queries_\\d+\\.rds$") - ok_mgfs <- if 
(length(raws) == length(ques)) TRUE else FALSE - rm(list = c("raws", "ques", "raws_indexes")) + if (n_scns) { + ok_mgfs <- if (n_scns == n_ques) TRUE else FALSE } else { - ok_mgfs <- FALSE - rm(list = c("raws_indexes")) + # backward compatible + if (file.exists(raws_indexes <- file.path(mgf_path, "raw_indexes.rds"))) { + raws <- qs::qread(raws_indexes) + ques <- list.files(mgf_path, pattern = "^mgf_queries_\\d+\\.rds$") + ok_mgfs <- if (length(raws) == length(ques)) TRUE else FALSE + rm(list = c("raws", "raws_indexes")) + } + else { + ok_mgfs <- FALSE + } } + rm(list = c("scns", "ques", "n_scns", "n_ques")) + if (ok_pars && ok_mgfs) { message("Found cached MGFs.") .savecall <- FALSE @@ -108,8 +117,8 @@ load_mgfs <- function (out_path, mgf_path, min_mass = 200L, max_mass = 4500L, "fraction_scheme.rda", "label_scheme.rda", "label_scheme_full.rda")) - fi_mgf <- list.files(path = file.path(mgf_path), pattern = "^.*\\.mgf$") - fi_mzml <- list.files(path = file.path(mgf_path), pattern = "^.*\\.mzML$") + fi_mgf <- list.files(path = mgf_path, pattern = "^.*\\.(mgf|MGF)$") + fi_mzml <- list.files(path = mgf_path, pattern = "^.*\\.(mzML|mzml)$") len_mgf <- length(fi_mgf) len_mzml <- length(fi_mzml) @@ -120,8 +129,8 @@ load_mgfs <- function (out_path, mgf_path, min_mass = 200L, max_mass = 4500L, if (len_mgf) { readMGF(filepath = mgf_path, - out_path = out_path, filelist = filelist, + out_path = out_path, min_mass = min_mass, max_mass = max_mass, min_ms2mass = min_ms2mass, @@ -135,10 +144,8 @@ load_mgfs <- function (out_path, mgf_path, min_mass = 200L, max_mass = 4500L, tmt_reporter_lower = tmt_reporter_lower, tmt_reporter_upper = tmt_reporter_upper, exclude_reporter_region = exclude_reporter_region, - index_mgf_ms2 = index_mgf_ms2, mgf_cutmzs = mgf_cutmzs, mgf_cutpercs = mgf_cutpercs, - is_mdda = FALSE, use_defpeaks = use_defpeaks, deisotope_ms2 = deisotope_ms2, max_ms2_charge = max_ms2_charge, @@ -152,8 +159,8 @@ load_mgfs <- function (out_path, mgf_path, min_mass = 200L, 
max_mass = 4500L, } else if (len_mzml) { readmzML(filepath = mgf_path, - out_path = out_path, filelist = filelist, + out_path = out_path, min_mass = min_mass, max_mass = max_mass, min_ms2mass = min_ms2mass, @@ -167,15 +174,11 @@ load_mgfs <- function (out_path, mgf_path, min_mass = 200L, max_mass = 4500L, tmt_reporter_lower = tmt_reporter_lower, tmt_reporter_upper = tmt_reporter_upper, exclude_reporter_region = exclude_reporter_region, - index_mgf_ms2 = index_mgf_ms2, mgf_cutmzs = mgf_cutmzs, mgf_cutpercs = mgf_cutpercs, - deisotope_ms2 = deisotope_ms2, max_ms2_charge = max_ms2_charge, maxn_dia_precurs = maxn_dia_precurs, - - is_mdda = is_mdda, use_defpeaks = use_defpeaks, maxn_mdda_precurs = maxn_mdda_precurs, n_mdda_flanks = n_mdda_flanks, @@ -214,25 +217,24 @@ load_mgfs <- function (out_path, mgf_path, min_mass = 200L, max_mass = 4500L, #' @import stringi #' @import readr #' @import fs -readMGF <- function (filepath = NULL, filelist = NULL, +readMGF <- function (filepath = NULL, filelist = NULL, out_path = NULL, min_mass = 200L, max_mass = 4500L, min_ms2mass = 115L, max_ms2mass = 4500L, topn_ms2ions = 100L, ms1_charge_range = c(2L, 4L), ms1_scan_range = c(1L, .Machine$integer.max), ret_range = c(0, Inf), ppm_ms1 = 10L, ppm_ms2 = 10L, tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, - exclude_reporter_region = FALSE, index_mgf_ms2 = FALSE, + exclude_reporter_region = FALSE, mgf_cutmzs = numeric(), mgf_cutpercs = numeric(), - out_path = file.path(filepath, "mgf_queries_1.rds"), - is_mdda = FALSE, deisotope_ms2 = TRUE, max_ms2_charge = 3L, + deisotope_ms2 = TRUE, max_ms2_charge = 3L, use_defpeaks = FALSE, maxn_dia_precurs = 300L, maxn_mdda_precurs = 1L, n_mdda_flanks = 6L, ppm_ms1_deisotope = 8L, ppm_ms2_deisotope = 8L, quant = "none", digits = 4L) { - if (is_mdda) { + if (maxn_mdda_precurs >= 1L) { warning("No multi-precursor DDA with MGF. 
Use mzML to enable the feature.") - is_mdda <- FALSE + maxn_mdda_precurs <- 0L } ## Parsing rules @@ -296,6 +298,9 @@ readMGF <- function (filepath = NULL, filelist = NULL, } ## Reads from chunks + warning("An mzML or MGF with multiple RAWs not supported since v1.3.4. ", + "Each peaklist file need contain exactly one RAW file.") + out <- vector("list", len) for (i in seq_along(filelist)) { @@ -304,7 +309,9 @@ readMGF <- function (filepath = NULL, filelist = NULL, message("Loading '", file, "'.") - out[[i]] <- read_mgf_chunks(filepath = temp_dir, + out[[i]] <- read_mgf_chunks(filepath = filepath, + temp_dir = temp_dir, + raw_id = i, topn_ms2ions = topn_ms2ions, ms1_charge_range = ms1_charge_range, ms1_scan_range = ms1_scan_range, @@ -334,8 +341,6 @@ readMGF <- function (filepath = NULL, filelist = NULL, tmt_reporter_lower = tmt_reporter_lower, tmt_reporter_upper = tmt_reporter_upper, exclude_reporter_region = exclude_reporter_region, - index_mgf_ms2 = index_mgf_ms2, - is_mdda = is_mdda, use_defpeaks = use_defpeaks, deisotope_ms2 = deisotope_ms2, max_ms2_charge = max_ms2_charge, @@ -359,11 +364,10 @@ readMGF <- function (filepath = NULL, filelist = NULL, }) } - ## Clean up - out <- dplyr::bind_rows(out) - - post_readmgf(out, min_mass = min_mass, max_mass = max_mass, ppm_ms1 = ppm_ms1, - filepath = filepath) + raws <- unlist(out, recursive = FALSE, use.names = TRUE) + qs::qsave(raws, file.path(filepath, "raw_indexes.rds"), preset = "fast") + + invisible(NULL) } @@ -372,37 +376,31 @@ readMGF <- function (filepath = NULL, filelist = NULL, #' Calculates mass \code{frame}s etc. #' #' @param df A data frame of processed peak lists. +#' @param raw_id An ID to replace the original RAW file name. 
#' @inheritParams readMGF -post_readmgf <- function (df, min_mass = 200L, max_mass = 4500L, ppm_ms1 = 10L, - filepath) +post_readmgf <- function (df, raw_id, mgf_path, min_mass = 200L, + max_mass = 4500L, ppm_ms1 = 10L) { - if (is.atomic(df[1, "ms1_charge", drop = TRUE])) { - df <- dplyr::arrange(df, ms1_mass) - # df <- dplyr::filter(df, ms1_mass >= min_mass, ms1_mass <= max_mass) - } - - raws_files <- df$raw_file - raws <- raws_files[!duplicated.default(raws_files)] - inds <- seq_along(raws) - names(inds) <- raws - qs::qsave(inds, file.path(filepath, "raw_indexes.rds"), preset = "fast") - df$raw_file <- unname(inds[raws_files]) + raw <- unique(df$raw_file) - scans <- df$scan_title - inds2 <- seq_along(scans) - names(inds2) <- scans - qs::qsave(inds2, file.path(filepath, "scan_indexes.rds"), preset = "fast") - df$scan_title <- unname(inds2[scans]) + if (length(raw) > 1L) + stop("An mzML or MGF with multiple RAWs not supported since v1.3.4. ", + "Each peaklist file need contain exactly one RAW file.") - df <- split(df, df$raw_file) - nms <- names(df) + raw_map <- raw_id + names(raw_map) <- raw + df$raw_file <- raw_id - for (i in seq_along(df)) { - qs::qsave(df[[i]], file.path(filepath, paste0("mgf_queries_", nms[i], ".rds")), - preset = "fast") - } + scans <- df$scan_title + scans_map <- df$scan_title <- seq_along(scans) + names(scans_map) <- scans + + qs::qsave(df, file.path(mgf_path, paste0("mgf_queries_", raw, ".rds")), + preset = "fast") + qs::qsave(scans_map, file.path(mgf_path, paste0("scan_map_", raw, ".rds")), + preset = "fast") - invisible(NULL) + invisible(raw_map) } @@ -453,6 +451,8 @@ readlineMGFs <- function (i, file, filepath, raw_file) #' Reads mgfs in chunks. #' +#' @param temp_dir A temporary path of MGFs. +#' @param raw_id An ID to RAW file name. #' @param type_mgf The type of MGF format. #' @param n_bf_begin The number of lines before \code{BEGIN IONS}. Zero for PD #' and MSConvert. 
@@ -478,7 +478,7 @@ readlineMGFs <- function (i, file, filepath, raw_file) #' @param raw_file The raw file name. Is NULL for PD and MSConvert. #' @inheritParams readMGF #' @inheritParams matchMS -read_mgf_chunks <- function (filepath = "~/mzion/mgf/temp_1", +read_mgf_chunks <- function (filepath, temp_dir, raw_id = 1L, topn_ms2ions = 100L, ms1_charge_range = c(2L, 6L), ms1_scan_range = c(1L, .Machine$integer.max), ret_range = c(0, Inf), min_mass = 200L, @@ -492,22 +492,22 @@ read_mgf_chunks <- function (filepath = "~/mzion/mgf/temp_1", sep_pepmass = " ", nfields_pepmass = 2L, raw_file = NULL, tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, - exclude_reporter_region = FALSE, index_mgf_ms2 = FALSE, + exclude_reporter_region = FALSE, deisotope_ms2 = TRUE, max_ms2_charge = 3L, maxn_dia_precurs = 300L, - is_mdda = FALSE, use_defpeaks = FALSE, + use_defpeaks = FALSE, maxn_mdda_precurs = 5L, n_mdda_flanks = 6L, ppm_ms1_deisotope = 10L, ppm_ms2_deisotope = 10L, quant = "none", digits = 4L) { - filelist <- list.files(path = file.path(filepath), pattern = "^.*\\.mgf$") - + filelist <- list.files(path = temp_dir, pattern = "^.*\\.mgf$") + if (!(len <- length(filelist))) - stop("No mgf files under ", filepath) + stop("No mgf files under ", temp_dir) if (len == 1L) { out <- proc_mgf_chunks( - file.path(filepath, filelist), + file.path(temp_dir, filelist), topn_ms2ions = topn_ms2ions, ms1_charge_range = ms1_charge_range, ms1_scan_range = ms1_scan_range, @@ -537,8 +537,6 @@ read_mgf_chunks <- function (filepath = "~/mzion/mgf/temp_1", tmt_reporter_lower = tmt_reporter_lower, tmt_reporter_upper = tmt_reporter_upper, exclude_reporter_region = exclude_reporter_region, - index_mgf_ms2 = index_mgf_ms2, - is_mdda = is_mdda, deisotope_ms2 = deisotope_ms2, max_ms2_charge = max_ms2_charge, use_defpeaks = use_defpeaks, @@ -575,7 +573,7 @@ read_mgf_chunks <- function (filepath = "~/mzion/mgf/temp_1", envir = environment(mzion::matchMS) ) - out <- parallel::clusterApply(cl, 
file.path(filepath, filelist), + out <- parallel::clusterApply(cl, file.path(temp_dir, filelist), proc_mgf_chunks, topn_ms2ions = topn_ms2ions, ms1_charge_range = ms1_charge_range, @@ -606,8 +604,6 @@ read_mgf_chunks <- function (filepath = "~/mzion/mgf/temp_1", tmt_reporter_lower = tmt_reporter_lower, tmt_reporter_upper = tmt_reporter_upper, exclude_reporter_region = exclude_reporter_region, - index_mgf_ms2 = index_mgf_ms2, - is_mdda = is_mdda, deisotope_ms2 = deisotope_ms2, max_ms2_charge = max_ms2_charge, use_defpeaks = use_defpeaks, @@ -623,40 +619,40 @@ read_mgf_chunks <- function (filepath = "~/mzion/mgf/temp_1", out <- dplyr::bind_rows(out) } - + # adds back broken mgf entries afs <- local({ - afs <- list.files(path = file.path(filepath), pattern = "^.*\\_af.mgf$") + afs <- list.files(path = temp_dir, pattern = "^.*\\_af.mgf$") idxes <- sort(as.integer(gsub("^chunk_(\\d+)_af\\.mgf", "\\1", afs))) afs <- paste0("chunk_", idxes, "_af.mgf") afs <- afs[-length(afs)] }) - + bfs <- local({ - bfs <- list.files(path = file.path(filepath), pattern = "^.*\\_bf.mgf$") + bfs <- list.files(path = temp_dir, pattern = "^.*\\_bf.mgf$") idxes <- sort(as.integer(gsub("^chunk_(\\d+)_bf\\.mgf", "\\1", bfs))) bfs <- paste0("chunk_", idxes, "_bf.mgf") bfs <- bfs[-1] }) - + # stopifnot(length(afs) == length(bfs)) - + gaps <- purrr::map2(afs, bfs, function (x, y) { - af <- stringi::stri_read_lines(file.path(filepath, x)) - bf <- stringi::stri_read_lines(file.path(filepath, y)) + af <- stringi::stri_read_lines(file.path(temp_dir, x)) + bf <- stringi::stri_read_lines(file.path(temp_dir, y)) ab <- append(af, bf) # perfect case of no gaps: two lines of "" and "" if (length(ab) > 2L) ab else NULL }) gaps <- unlist(gaps, use.names = FALSE) - write(gaps, file.path(filepath, "gaps.mgf")) - + write(gaps, file.path(temp_dir, "gaps.mgf")) + local({ - nms <- list.files(path = file.path(filepath), pattern = "^.*\\_[ab]f.mgf$") - if (length(nms)) suppressMessages(file.remove(file.path(filepath, 
nms))) + nms <- list.files(path = file.path(temp_dir), pattern = "^.*\\_[ab]f.mgf$") + if (length(nms)) suppressMessages(file.remove(file.path(temp_dir, nms))) }) - + if (!is.null(gaps)) { out <- dplyr::bind_rows( out, @@ -689,8 +685,6 @@ read_mgf_chunks <- function (filepath = "~/mzion/mgf/temp_1", tmt_reporter_lower = tmt_reporter_lower, tmt_reporter_upper = tmt_reporter_upper, exclude_reporter_region = exclude_reporter_region, - index_mgf_ms2 = index_mgf_ms2, - is_mdda = is_mdda, deisotope_ms2 = deisotope_ms2, max_ms2_charge = max_ms2_charge, use_defpeaks = use_defpeaks, @@ -703,13 +697,14 @@ read_mgf_chunks <- function (filepath = "~/mzion/mgf/temp_1", digits = digits) ) } - + if (type_mgf == "default_pasef") { out <- dplyr::mutate(out, scan_id = as.character(scan_num), scan_num = as.character(row_number())) - } - - invisible(out) + } + + post_readmgf(out, raw_id = raw_id, mgf_path = filepath, min_mass = min_mass, + max_mass = max_mass, ppm_ms1 = ppm_ms1) } @@ -734,7 +729,6 @@ proc_mgf_chunks <- function (file, topn_ms2ions = 100L, tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, exclude_reporter_region = FALSE, - index_mgf_ms2 = FALSE, is_mdda = FALSE, deisotope_ms2 = TRUE, max_ms2_charge = 3L, use_defpeaks = FALSE, maxn_dia_precurs = 300L, maxn_mdda_precurs = 5L, n_mdda_flanks = 6L, @@ -802,8 +796,6 @@ proc_mgf_chunks <- function (file, topn_ms2ions = 100L, tmt_reporter_lower = tmt_reporter_lower, tmt_reporter_upper = tmt_reporter_upper, exclude_reporter_region = exclude_reporter_region, - index_mgf_ms2 = index_mgf_ms2, - is_mdda = is_mdda, deisotope_ms2 = deisotope_ms2, max_ms2_charge = max_ms2_charge, use_defpeaks = use_defpeaks, @@ -833,8 +825,8 @@ proc_mgfs <- function (lines, topn_ms2ions = 100L, ms1_charge_range = c(2L, 6L), n_to_charge = 4L, sep_ms2s = " ", nfields_ms2s = 2L, sep_pepmass = " ", nfields_pepmass = 2L, raw_file = NULL, tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, - exclude_reporter_region = FALSE, index_mgf_ms2 = FALSE, 
- is_mdda = FALSE, deisotope_ms2 = TRUE, max_ms2_charge = 3L, + exclude_reporter_region = FALSE, + deisotope_ms2 = TRUE, max_ms2_charge = 3L, use_defpeaks = FALSE, maxn_dia_precurs = 300L, maxn_mdda_precurs = 5L, n_mdda_flanks = 6L, ppm_ms1_deisotope = 10L, ppm_ms2_deisotope = 10L, @@ -984,20 +976,11 @@ proc_mgfs <- function (lines, topn_ms2ions = 100L, ms1_charge_range = c(2L, 6L), max_ms2mass = max_ms2mass) ms2_moverzs <- mz_n_int[["ms2_moverzs"]] + # ms2_moverzs <- lapply(ms2_moverzs, round, digits = digits) ms2_ints <- mz_n_int[["ms2_ints"]] ms2_charges <- mz_n_int[["ms2_charges"]] lens <- mz_n_int[["lens"]] rm(list = "mz_n_int") - - if (index_mgf_ms2) { - ms2_moverzs <- lapply(ms2_moverzs, index_mz, min_ms2mass, ppm_ms2/1E6) - # dups <- lapply(ms2_moverzs, duplicated.default) - # ms2_moverzs <- mapply(function (x, y) x[!y], ms2_moverzs, dups, SIMPLIFY = FALSE, USE.NAMES = FALSE) - # ms2_ints <- mapply(function (x, y) x[!y], ms2_ints, dups, SIMPLIFY = FALSE, USE.NAMES = FALSE) - # rm(list = "dups") - } - else - ms2_moverzs <- lapply(ms2_moverzs, round, digits = digits) df <- tibble::tibble( scan_title = scan_titles, @@ -1468,9 +1451,8 @@ readmzML <- function (filepath = NULL, filelist = NULL, out_path = NULL, ret_range = c(0, Inf), ppm_ms1 = 10L, ppm_ms2 = 10L, tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, exclude_reporter_region = FALSE, - index_mgf_ms2 = FALSE, mgf_cutmzs = numeric(), mgf_cutpercs = numeric(), - is_mdda = FALSE, deisotope_ms2 = TRUE, max_ms2_charge = 3L, + deisotope_ms2 = TRUE, max_ms2_charge = 3L, use_defpeaks = FALSE, maxn_dia_precurs = 300L, maxn_mdda_precurs = 5L, n_mdda_flanks = 6L, ppm_ms1_deisotope = 10L, ppm_ms2_deisotope = 10L, @@ -1493,8 +1475,9 @@ readmzML <- function (filepath = NULL, filelist = NULL, out_path = NULL, n_cores <- max(1L, n_cores) if (n_cores == 1L) { - for (i in 1:len) - out[[i]] <- proc_mzml(files[[i]], + for (i in 1:len) { + out[[i]] <- proc_mzml(file = files[[i]], raw_id = i, + filepath = filepath, 
topn_ms2ions = topn_ms2ions, ms1_charge_range = ms1_charge_range, ret_range = ret_range, @@ -1509,8 +1492,6 @@ readmzML <- function (filepath = NULL, filelist = NULL, out_path = NULL, tmt_reporter_lower = tmt_reporter_lower, tmt_reporter_upper = tmt_reporter_upper, exclude_reporter_region = exclude_reporter_region, - index_mgf_ms2 = index_mgf_ms2, - is_mdda = is_mdda, deisotope_ms2 = deisotope_ms2, max_ms2_charge = max_ms2_charge, use_defpeaks = use_defpeaks, @@ -1523,45 +1504,50 @@ readmzML <- function (filepath = NULL, filelist = NULL, out_path = NULL, fct_iso2 = fct_iso2, quant = quant, digits = digits) + } } else { cl <- parallel::makeCluster(getOption("cl.cores", n_cores)) - out <- parallel::clusterApply(cl, files, proc_mzml, - topn_ms2ions = topn_ms2ions, - ms1_charge_range = ms1_charge_range, - ret_range = ret_range, - min_mass = min_mass, - max_mass = max_mass, - ppm_ms1 = ppm_ms1, - ppm_ms2 = ppm_ms2, - min_ms2mass = min_ms2mass, - max_ms2mass = max_ms2mass, - mgf_cutmzs = mgf_cutmzs, - mgf_cutpercs = mgf_cutpercs, - tmt_reporter_lower = tmt_reporter_lower, - tmt_reporter_upper = tmt_reporter_upper, - exclude_reporter_region = exclude_reporter_region, - index_mgf_ms2 = index_mgf_ms2, - is_mdda = is_mdda, - deisotope_ms2 = deisotope_ms2, - max_ms2_charge = max_ms2_charge, - use_defpeaks = use_defpeaks, - maxn_dia_precurs = maxn_dia_precurs, - maxn_mdda_precurs = maxn_mdda_precurs, - n_mdda_flanks = n_mdda_flanks, - ppm_ms1_deisotope = ppm_ms1_deisotope, - ppm_ms2_deisotope = ppm_ms2_deisotope, - grad_isotope = grad_isotope, - fct_iso2 = fct_iso2, - quant = quant, - digits = digits) + out <- parallel::clusterMap( + cl, proc_mzml, + files, seq_along(files), + MoreArgs = list(filepath = filepath, + topn_ms2ions = topn_ms2ions, + ms1_charge_range = ms1_charge_range, + ret_range = ret_range, + min_mass = min_mass, + max_mass = max_mass, + ppm_ms1 = ppm_ms1, + ppm_ms2 = ppm_ms2, + min_ms2mass = min_ms2mass, + max_ms2mass = max_ms2mass, + mgf_cutmzs = mgf_cutmzs, 
+ mgf_cutpercs = mgf_cutpercs, + tmt_reporter_lower = tmt_reporter_lower, + tmt_reporter_upper = tmt_reporter_upper, + exclude_reporter_region = exclude_reporter_region, + deisotope_ms2 = deisotope_ms2, + max_ms2_charge = max_ms2_charge, + use_defpeaks = use_defpeaks, + maxn_dia_precurs = maxn_dia_precurs, + maxn_mdda_precurs = maxn_mdda_precurs, + n_mdda_flanks = n_mdda_flanks, + ppm_ms1_deisotope = ppm_ms1_deisotope, + ppm_ms2_deisotope = ppm_ms2_deisotope, + grad_isotope = grad_isotope, + fct_iso2 = fct_iso2, + quant = quant, + digits = digits), + SIMPLIFY = FALSE, USE.NAMES = FALSE, + .scheduling = "dynamic" + ) parallel::stopCluster(cl) } - out <- dplyr::bind_rows(out) + raws <- unlist(out, recursive = FALSE, use.names = TRUE) + qs::qsave(raws, file.path(filepath, "raw_indexes.rds"), preset = "fast") - post_readmgf(out, min_mass = min_mass, max_mass = max_mass, ppm_ms1 = ppm_ms1, - filepath = filepath) + invisible(NULL) } @@ -1570,15 +1556,18 @@ readmzML <- function (filepath = NULL, filelist = NULL, out_path = NULL, #' No scan range subsetting with PASEF timsTOF. #' #' @param file A file name to mzML with a prepending path. +#' @param raw_id A RAW file ID. +#' @param filepath A file path of MGF. 
#' @inheritParams readmzML -proc_mzml <- function (file, topn_ms2ions = 100L, ms1_charge_range = c(2L, 4L), +proc_mzml <- function (file, raw_id, filepath, topn_ms2ions = 100L, + ms1_charge_range = c(2L, 4L), ret_range = c(0, Inf), min_mass = 200L, max_mass = 4500L, ppm_ms1 = 10L, ppm_ms2 = 10L, min_ms2mass = 115L, max_ms2mass = 4500L, mgf_cutmzs = numeric(), mgf_cutpercs = numeric(), tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, - exclude_reporter_region = FALSE, index_mgf_ms2 = FALSE, - is_mdda = FALSE, deisotope_ms2 = TRUE, max_ms2_charge = 3L, + exclude_reporter_region = FALSE, + deisotope_ms2 = TRUE, max_ms2_charge = 3L, use_defpeaks = FALSE, maxn_dia_precurs = 300L, maxn_mdda_precurs = 5L, n_mdda_flanks = 6L, ppm_ms1_deisotope = 10L, ppm_ms2_deisotope = 10L, @@ -1592,10 +1581,9 @@ proc_mzml <- function (file, topn_ms2ions = 100L, ms1_charge_range = c(2L, 4L), tmt_reporter_lower = tmt_reporter_lower, tmt_reporter_upper = tmt_reporter_upper, exclude_reporter_region = exclude_reporter_region, - index_mgf_ms2 = index_mgf_ms2, ppm_ms1 = ppm_ms1, - ppm_ms2 = ppm_ms2, min_ms2mass = min_ms2mass, - max_ms2mass = max_ms2mass, max_ms1_charge = max_ms1_charge, - is_mdda = is_mdda, + ppm_ms1 = ppm_ms1, ppm_ms2 = ppm_ms2, + min_ms2mass = min_ms2mass, max_ms2mass = max_ms2mass, + max_ms1_charge = max_ms1_charge, deisotope_ms2 = deisotope_ms2, max_ms2_charge = max_ms2_charge, use_defpeaks = use_defpeaks, @@ -1608,7 +1596,7 @@ proc_mzml <- function (file, topn_ms2ions = 100L, ms1_charge_range = c(2L, 4L), fct_iso2 = fct_iso2, quant = quant, digits = digits) - if (is_mdda) { + if (maxn_mdda_precurs) { rows <- lapply(df$ms1_mass, is.null) rows <- unlist(rows, recursive = FALSE, use.names = FALSE) df <- df[!rows, ] @@ -1647,12 +1635,12 @@ proc_mzml <- function (file, topn_ms2ions = 100L, ms1_charge_range = c(2L, 4L), if (is.atomic(df[1, "ms1_charge", drop = TRUE])) { df <- df[with(df, !is.na(ms1_mass)), ] - df <- dplyr::filter(df, ms1_charge >= min_ms1_charge, 
ms1_charge <= max_ms1_charge, ret_time >= ret_range[1], ret_time <= ret_range[2], ms1_mass >= min_mass, ms1_mass <= max_mass, ) + df <- dplyr::arrange(df, ms1_mass) } # subsets by top-n and min_ms2mass @@ -1666,13 +1654,13 @@ proc_mzml <- function (file, topn_ms2ions = 100L, ms1_charge_range = c(2L, 4L), min_ms2mass = min_ms2mass, max_ms2mass = max_ms2mass) - # can be integers if "index_mgf_ms2 = TRUE" df[["ms2_moverzs"]] <- mz_n_int[["ms2_moverzs"]] df[["ms2_ints"]] <- mz_n_int[["ms2_ints"]] df[["ms2_charges"]] <- mz_n_int[["ms2_charges"]] df[["ms2_n"]] <- mz_n_int[["lens"]] - invisible(df) + post_readmgf(df, raw_id = raw_id, mgf_path = filepath, min_mass = min_mass, + max_mass = max_mass, ppm_ms1 = ppm_ms1) } @@ -1684,10 +1672,10 @@ proc_mzml <- function (file, topn_ms2ions = 100L, ms1_charge_range = c(2L, 4L), #' @inheritParams matchMS read_mzml <- function (xml_file, topn_ms2ions = 100L, tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, - exclude_reporter_region = FALSE, index_mgf_ms2 = FALSE, + exclude_reporter_region = FALSE, ppm_ms1 = 10L, ppm_ms2 = 10L, min_ms2mass = 115L, max_ms2mass = 4500L, max_ms1_charge = 4L, - is_mdda = FALSE, deisotope_ms2 = TRUE, max_ms2_charge = 3L, + deisotope_ms2 = TRUE, max_ms2_charge = 3L, use_defpeaks = FALSE, maxn_dia_precurs = 300L, maxn_mdda_precurs = 5L, n_mdda_flanks = 6L, ppm_ms1_deisotope = 10L, ppm_ms2_deisotope = 10L, @@ -1887,7 +1875,7 @@ read_mzml <- function (xml_file, topn_ms2ions = 100L, # ms1_: ms1 only # ms0_: by other peak-pickings, e.g., MSConvert - if (is_mdda) { + if (maxn_mdda_precurs) { df <- proc_mdda(spec, raw_file = raw_file, idx_sc = idx_sc, idx_osc = idx_osc, idx_mslev = idx_mslev, idx_title = idx_title, idx_scanList_2 = idx_scanList_2, idx_rt_2 = idx_rt_2, @@ -1958,8 +1946,7 @@ read_mzml <- function (xml_file, topn_ms2ions = 100L, topn_ms2ions = topn_ms2ions, quant = quant, tmt_reporter_lower = tmt_reporter_lower, tmt_reporter_upper = tmt_reporter_upper, - exclude_reporter_region = 
exclude_reporter_region, - index_mgf_ms2 = index_mgf_ms2) + exclude_reporter_region = exclude_reporter_region) } } @@ -2473,7 +2460,7 @@ proc_dda <- function (spec, raw_file, idx_sc = 3L, idx_osc = 3L, max_ms2_charge = 3L, ppm_ms2_deisotope = 10L, topn_ms2ions = 100L, quant = "none", tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, - exclude_reporter_region = FALSE, index_mgf_ms2 = FALSE) + exclude_reporter_region = FALSE) { len <- length(spec) ret_times <- orig_scans <- scan_nums <- scan_titles <- character(len) @@ -2582,10 +2569,7 @@ proc_dda <- function (spec, raw_file, idx_sc = 3L, idx_osc = 3L, msx_ints <- restmt[["yvals"]] rptr_moverzs <- restmt[["rptr_moverzs"]] rptr_ints <- restmt[["rptr_ints"]] - - if (index_mgf_ms2) - msx_moverzs <- lapply(msx_moverzs, index_mz, min_ms2mass, ppm_ms2/1E6) - + df <- tibble::tibble( scan_title = scan_titles, raw_file = raw_file, diff --git a/R/ms2frames.R b/R/ms2frames.R index b4a75f2..17d8315 100644 --- a/R/ms2frames.R +++ b/R/ms2frames.R @@ -26,7 +26,7 @@ pair_mgftheos <- function (mgf_path, n_modules, ms1_offsets = 0, quant = "none", if (length(tempfiles)) unlink(tempfiles) - mgf_files <- list.files(mgf_path, pattern = "^mgf_queries_\\d+\\.rds$", + mgf_files <- list.files(mgf_path, pattern = "^mgf_queries_.*\\.rds$", full.names = TRUE) mgfs <- lapply(mgf_files, qs::qread) @@ -301,8 +301,7 @@ hms2match <- function (aa_masses_all, funs_ms2, ms1vmods_all, ms2vmods_all, maxn_vnl_per_seq = 3L, maxn_vmods_sitescombi_per_pep = 64L, minn_ms2 = 6L, ppm_ms1 = 10L, ppm_ms2 = 10L, - min_ms2mass = 115L, index_mgf_ms2 = FALSE, - by_modules = FALSE, df0 = NULL) + min_ms2mass = 115L, by_modules = FALSE, df0 = NULL) { pth <- if (by_modules) "theo" else "mgftheo" pex <- if (by_modules) "expt" else "mgftheo" @@ -357,7 +356,7 @@ hms2match <- function (aa_masses_all, funs_ms2, ms1vmods_all, ms2vmods_all, maxn_vnl_per_seq = maxn_vnl_per_seq, maxn_vmods_sitescombi_per_pep = maxn_vmods_sitescombi_per_pep, minn_ms2 = minn_ms2, ppm_ms1 = 
ppm_ms1, ppm_ms2 = ppm_ms2, - min_ms2mass = min_ms2mass, index_mgf_ms2 = index_mgf_ms2, + min_ms2mass = min_ms2mass, df0 = df0) } } @@ -385,7 +384,6 @@ hms2match <- function (aa_masses_all, funs_ms2, ms1vmods_all, ms2vmods_all, ppm_ms1 = ppm_ms1, ppm_ms2 = ppm_ms2, min_ms2mass = min_ms2mass, - index_mgf_ms2 = index_mgf_ms2, df0 = df0) } @@ -411,7 +409,7 @@ ms2match_all <- function (mgth, aa_masses_all, funs_ms2, ms1vmods_all, maxn_vnl_per_seq = 3L, maxn_vmods_sitescombi_per_pep = 64L, minn_ms2 = 6L, ppm_ms1 = 10L, ppm_ms2 = 10L, - min_ms2mass = 115L, index_mgf_ms2 = FALSE, + min_ms2mass = 115L, df0 = NULL) { msg <- paste0("Matching expt-theo pair: ", mgth) @@ -445,8 +443,7 @@ ms2match_all <- function (mgth, aa_masses_all, funs_ms2, ms1vmods_all, minn_ms2 = minn_ms2, ppm_ms1 = ppm_ms1, ppm_ms2 = ppm_ms2, - min_ms2mass = min_ms2mass, - index_mgf_ms2 = index_mgf_ms2) + min_ms2mass = min_ms2mass) if (!dir.exists(tempdir <- file.path(out_path, "temp"))) create_dir(tempdir) @@ -505,7 +502,7 @@ mframes_adv <- function (mgf_frames = NULL, theopeps = NULL, maxn_vnl_per_seq = 3L, maxn_vmods_sitescombi_per_pep = 64L, minn_ms2 = 6L, ppm_ms1 = 10L, ppm_ms2 = 10L, - min_ms2mass = 115L, index_mgf_ms2 = FALSE) + min_ms2mass = 115L) { lenm <- length(mgf_frames) frames <- as.integer(names(mgf_frames)) @@ -690,7 +687,6 @@ mframes_adv <- function (mgf_frames = NULL, theopeps = NULL, ppm_ms1 = ppm_ms1, ppm_ms2 = ppm_ms2, min_ms2mass = min_ms2mass, - index_mgf_ms2 = index_mgf_ms2, by_modules = FALSE ), SIMPLIFY = FALSE, @@ -1076,8 +1072,7 @@ mframes_adv <- function (mgf_frames = NULL, theopeps = NULL, #' #' @return Lists of (1) theo, (2) expt, (3) ith, (4) iex and (5) m. 
find_ms2_bypep <- function (theos = NULL, expts = NULL, ex = NULL, d = NULL, - ppm_ms2 = 10L, min_ms2mass = 115L, minn_ms2 = 6L, - index_mgf_ms2 = FALSE) + ppm_ms2 = 10L, min_ms2mass = 115L, minn_ms2 = 6L) { # `theos` # the same pep_seq at different applicable ivmods and NLs @@ -1259,8 +1254,7 @@ search_mgf <- function (expt_mass_ms1 = NULL, expt_moverz_ms2 = NULL, exptcharges_ms2 = NULL, theomasses_ms1 = NULL, theomasses_ms2 = NULL, pep_mod_groups = NULL, minn_ms2 = 6L, ppm_ms1 = 10L, ppm_ms2 = 10L, - min_ms2mass = 115L, index_mgf_ms2 = FALSE, - by_modules = FALSE) + min_ms2mass = 115L, by_modules = FALSE) { # don't flip the order of cdn1 & cdn2: FALSE & NA -> FALSE; TRUE & NA <- NA if (!is.null(exptcharges_ms2)) @@ -1268,12 +1262,8 @@ search_mgf <- function (expt_mass_ms1 = NULL, expt_moverz_ms2 = NULL, # --- find MS2 matches --- d2 <- ppm_ms2/1E6 - - ex <- if (index_mgf_ms2) # already indexed - expt_moverz_ms2 - else - index_mz(expt_moverz_ms2, min_ms2mass, d2) - + ex <- index_mz(expt_moverz_ms2, min_ms2mass, d2) + # lapply by the same pep_seq at different ivmods and/or NLs ans <- if (length(theomasses_ms2)) lapply(theomasses_ms2, find_ms2_bypep, @@ -1282,8 +1272,7 @@ search_mgf <- function (expt_mass_ms1 = NULL, expt_moverz_ms2 = NULL, d = d2, ppm_ms2 = ppm_ms2, min_ms2mass = min_ms2mass, - minn_ms2 = minn_ms2, - index_mgf_ms2 = index_mgf_ms2) + minn_ms2 = minn_ms2) else theomasses_ms2 @@ -1391,8 +1380,8 @@ hms2match_one <- function (pep_mod_group, nms_theo, nms_expt, aa_masses, FUN, maxn_fnl_per_seq = 3L, maxn_vnl_per_seq = 3L, maxn_vmods_sitescombi_per_pep = 64L, minn_ms2 = 6L, ppm_ms1 = 10L, ppm_ms2 = 10L, - min_ms2mass = 115L, index_mgf_ms2 = FALSE, - df0 = NULL) + min_ms2mass = 115L, df0 = NULL) + { nm_fmods <- attr(aa_masses, "fmods", exact = TRUE) nm_vmods <- attr(aa_masses, "vmods", exact = TRUE) @@ -1448,7 +1437,7 @@ hms2match_one <- function (pep_mod_group, nms_theo, nms_expt, aa_masses, FUN, deisotope_ms2 = deisotope_ms2, 
maxn_vmods_sitescombi_per_pep = maxn_vmods_sitescombi_per_pep, minn_ms2 = minn_ms2, ppm_ms1 = ppm_ms1, ppm_ms2 = ppm_ms2, - min_ms2mass = min_ms2mass, index_mgf_ms2 = index_mgf_ms2, + min_ms2mass = min_ms2mass, df0 = df0) } else { @@ -1468,7 +1457,7 @@ hms2match_one <- function (pep_mod_group, nms_theo, nms_expt, aa_masses, FUN, deisotope_ms2 = deisotope_ms2, maxn_vmods_sitescombi_per_pep = maxn_vmods_sitescombi_per_pep, minn_ms2 = minn_ms2, ppm_ms1 = ppm_ms1, ppm_ms2 = ppm_ms2, - min_ms2mass = min_ms2mass, index_mgf_ms2 = index_mgf_ms2, + min_ms2mass = min_ms2mass, df0 = df0) # neutral losses: maxn_neulosses_vnl and maxn_neulosses_fnl @@ -1487,7 +1476,7 @@ hms2match_one <- function (pep_mod_group, nms_theo, nms_expt, aa_masses, FUN, deisotope_ms2 = deisotope_ms2, maxn_vmods_sitescombi_per_pep = maxn_vmods_sitescombi_per_pep, minn_ms2 = minn_ms2, ppm_ms1 = ppm_ms1, ppm_ms2 = ppm_ms2, - min_ms2mass = min_ms2mass, index_mgf_ms2 = index_mgf_ms2, + min_ms2mass = min_ms2mass, df0 = df0) } @@ -1551,7 +1540,7 @@ ms2match_one <- function (nms_theo, nms_expt, pep_mod_group, aa_masses, FUN, deisotope_ms2 = TRUE, maxn_vmods_sitescombi_per_pep = 64L, minn_ms2 = 6L, ppm_ms1 = 10L, ppm_ms2 = 10L, - min_ms2mass = 115L, index_mgf_ms2 = FALSE, + min_ms2mass = 115L, df0 = NULL) { out_name <- gsub("^theo", "ion_matches", nms_theo) @@ -1606,7 +1595,6 @@ ms2match_one <- function (nms_theo, nms_expt, pep_mod_group, aa_masses, FUN, ppm_ms1 = ppm_ms1, ppm_ms2 = ppm_ms2, min_ms2mass = min_ms2mass, - index_mgf_ms2 = index_mgf_ms2, FUN = FUN), .scheduling = "dynamic") @@ -1655,8 +1643,7 @@ frames_adv <- function (mgf_frames = NULL, theopeps = NULL, deisotope_ms2 = TRUE, maxn_vmods_sitescombi_per_pep = 64L, minn_ms2 = 6L, ppm_ms1 = 10L, ppm_ms2 = 10L, - min_ms2mass = 115L, index_mgf_ms2 = FALSE, - FUN) + min_ms2mass = 115L, FUN) { len <- length(mgf_frames) @@ -1791,7 +1778,6 @@ frames_adv <- function (mgf_frames = NULL, theopeps = NULL, ppm_ms1 = ppm_ms1, ppm_ms2 = ppm_ms2, min_ms2mass = 
min_ms2mass, - index_mgf_ms2 = index_mgf_ms2, by_modules = TRUE ), SIMPLIFY = FALSE, diff --git a/R/msmsmatches.R b/R/msmsmatches.R index af10745..7a52b5c 100644 --- a/R/msmsmatches.R +++ b/R/msmsmatches.R @@ -263,20 +263,6 @@ #' The default is \eqn{126.1}. #' @param tmt_reporter_upper The upper bound of the region of TMT reporter ions. #' The default is \eqn{135.2}. -#' @param index_mgf_ms2 Depreciated. A low-priority feature. Logical; if TRUE, -#' converts up-frontly MS2 m-over-z values from numeric to integers as opposed -#' to \emph{on-the-fly} conversion during ion matches. The default is FALSE. -#' The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by -#' reducing RAM footprints. -#' -#' At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between -#' theoretical and experimental MS2 m-over-z values is limited by the -#' \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For -#' instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = -#' 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, -#' the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, -#' pep_ms2_deltas_sd} are nullified in the outputs. -#' #' @param min_ms1_charge A positive integer; the minimum MS1 charge state for #' considerations. The default is 2. 
#' @param max_ms1_charge A positive integer; the maximum MS1 charge state for @@ -750,8 +736,7 @@ matchMS <- function (out_path = "~/mzion/outs", tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, exclude_reporter_region = FALSE, - index_mgf_ms2 = FALSE, - + ppm_reporters = 10L, quant = c("none", "tmt6", "tmt10", "tmt11", "tmt16", "tmt18"), @@ -926,7 +911,7 @@ matchMS <- function (out_path = "~/mzion/outs", # logical types stopifnot(vapply(c(soft_secions, combine_tier_three, calib_ms1mass, use_ms1_cache, add_ms2theos, add_ms2theos2, add_ms2moverzs, - add_ms2ints, exclude_reporter_region, index_mgf_ms2, + add_ms2ints, exclude_reporter_region, svm_reproc, svm_cv, rm_dup_term_anywhere, make_speclib, deisotope_ms2, use_defpeaks), is.logical, logical(1L))) @@ -1341,8 +1326,6 @@ matchMS <- function (out_path = "~/mzion/outs", exclude_reporter_region = exclude_reporter_region, tmt_reporter_lower = tmt_reporter_lower, tmt_reporter_upper = tmt_reporter_upper, - index_mgf_ms2 = index_mgf_ms2, - is_mdda = if (maxn_mdda_precurs) TRUE else FALSE, deisotope_ms2 = deisotope_ms2, max_ms2_charge = max_ms2_charge, use_defpeaks = use_defpeaks, @@ -1392,7 +1375,7 @@ matchMS <- function (out_path = "~/mzion/outs", reframe_mgfs = reframe_mgfs, ppm_ms2 = ppm_ms2, min_mass = min_mass, max_mass = max_mass, min_ms2mass = min_ms2mass, quant = quant, - ppm_reporters = ppm_reporters, index_mgf_ms2 = index_mgf_ms2, + ppm_reporters = ppm_reporters, by_modules = by_modules, fasta = fasta, acc_type = acc_type, acc_pattern = acc_pattern, topn_ms2ions = topn_ms2ions, fixedmods = fixedmods, varmods = NULL, # the first search @@ -1424,7 +1407,6 @@ matchMS <- function (out_path = "~/mzion/outs", min_ms2mass = min_ms2mass, quant = quant, ppm_reporters = ppm_reporters, - index_mgf_ms2 = index_mgf_ms2, by_modules = by_modules, ms1_offsets = ms1_offsets, ms1_neulosses = ms1_neulosses, @@ -1470,7 +1452,6 @@ matchMS <- function (out_path = "~/mzion/outs", soft_secions = soft_secions, out_path = 
out_path, min_ms2mass = min_ms2mass, - index_mgf_ms2 = index_mgf_ms2, tally_ms2ints = tally_ms2ints, # dummies @@ -1509,8 +1490,7 @@ matchMS <- function (out_path = "~/mzion/outs", add_ms2theos2 = add_ms2theos2, add_ms2moverzs = add_ms2moverzs, add_ms2ints = add_ms2ints, - by_modules = by_modules, - index_mgf_ms2 = index_mgf_ms2) + by_modules = by_modules) ## Peptide FDR if (is.null(bypass_pepfdr <- dots$bypass_pepfdr)) @@ -1632,9 +1612,15 @@ matchMS <- function (out_path = "~/mzion/outs", ## Clean-ups # (raw_file etc. already mapped if `from_group_search`) - if (!isTRUE(from_group_search <- dots$from_group_search)) - df <- map_raw_n_scan(df, mgf_path) - + if (!isTRUE(from_group_search <- dots$from_group_search)) { + if (file.exists(file.path(mgf_path, "scan_indexes.rds"))) { + df <- map_raw_n_scan_old(df, mgf_path) # backward-compatible + } + else { + df <- map_raw_n_scan(df, mgf_path) + } + } + df <- dplyr::mutate(df, pep_expect = 10^((pep_score_co - pep_score)/10) * target_fdr) df[["pep_score_co"]] <- NULL df$pep_delta <- df$pep_exp_mr - df$pep_calc_mr @@ -2205,6 +2191,58 @@ check_locmods <- function (locmods, fixedmods, varmods, ms1_neulosses = NULL) #' @param df A data frame. 
#' @inheritParams matchMS
 map_raw_n_scan <- function (df, mgf_path)
+{
+  file_raw <- file.path(mgf_path, "raw_indexes.rds")
+  # `raws`: values are matched against `df$raw_file`; names replace them
+  if (file.exists(file_raw)) {
+    raws <- qs::qread(file_raw)
+    pos <- match(as.character(df$raw_file), as.character(raws))
+    df$raw_file <- names(raws)[pos]
+  }
+  else {
+    stop("File not found: ", file_raw)
+  }
+
+  files_scan <- list.files(mgf_path, pattern = "^scan_map_.*\\.rds$")
+
+  if (!(len_sc <- length(files_scan))) {
+    stop("No `scan_map` files found.")
+  }
+
+  # one `scan_map_*.rds` file is required for each entry in `raws`
+  if (len_sc != length(raws))
+    stop("The number of `scan_map` files is different to the number of RAWs.")
+
+  dfs <- split(df, df$raw_file)
+  raws_in_df <- names(dfs)
+  ids <- match(raws_in_df, gsub("^scan_map_(.*)\\.rds$", "\\1", files_scan))
+
+  if (any(bads <- is.na(ids))) {
+    stop("Files do not have matched `scan_map`: ",
+         paste(raws_in_df[bads], collapse = ", "))
+  }
+  # reorder so that `files_scan[[i]]` is the scan map of `dfs[[i]]`
+  files_scan <- files_scan[ids]
+  # iterate by position: `ids` index the full file listing, not `dfs`
+  for (i in seq_along(dfs)) {
+    scans <- qs::qread(file.path(mgf_path, files_scan[[i]]))
+    pos <- match(dfs[[i]]$pep_scan_title, as.character(scans))
+    dfs[[i]]$pep_scan_title <- names(scans)[pos]
+  }
+
+  df <- dplyr::bind_rows(dfs)
+
+  invisible(df)
+}
+
+
+#' Maps raw_file and scan_title from indexes to real values.
+#'
+#' For backward compatibility.
+#'
+#' @param df A data frame.
+#' @inheritParams matchMS +map_raw_n_scan_old <- function (df, mgf_path) { file_raw <- file.path(mgf_path, "raw_indexes.rds") file_scan <- file.path(mgf_path, "scan_indexes.rds") diff --git a/R/msmsmatches2.R b/R/msmsmatches2.R index a89834c..cb19103 100644 --- a/R/msmsmatches2.R +++ b/R/msmsmatches2.R @@ -32,7 +32,7 @@ ms2match <- function (mgf_path, aa_masses_all, out_path, .path_bin, # dummies fasta, acc_type, acc_pattern, topn_ms2ions, fixedmods, varmods, enzyme, maxn_fasta_seqs, maxn_vmods_setscombi, - min_len, max_len, max_miss, index_mgf_ms2 = FALSE, + min_len, max_len, max_miss, first_search = FALSE, .savecall = TRUE) { @@ -65,6 +65,19 @@ ms2match <- function (mgf_path, aa_masses_all, out_path, .path_bin, call_pars <- mget(fml_incl, envir = fun_env, inherits = FALSE) call_pars <- call_pars[sort(names(call_pars))] + # temporary fix, update find_dir on the next ver and delete this + if (".path_bin" %in% names(cache_pars) && ".path_bin" %in% names(call_pars)) { + if (!(is.null(cache_pars$.path_bin) || + "fs_path" %in% class(cache_pars$.path_bin))) { + cache_pars$.path_bin <- fs::fs_path(cache_pars$.path_bin) + } + + if (!(is.null(call_pars$.path_bin) || + "fs_path" %in% class(call_pars$.path_bin))) { + call_pars$.path_bin <- fs::fs_path(call_pars$.path_bin) + } + } + if (identical(cache_pars, call_pars)) { fions <- list.files(path = file.path(out_path, "temp"), pattern = "ion_matches_[0-9]+\\.rds$") @@ -178,7 +191,6 @@ ms2match <- function (mgf_path, aa_masses_all, out_path, .path_bin, ppm_ms1 = ppm_ms1_bin, ppm_ms2 = ppm_ms2_bin, min_ms2mass = min_ms2mass, - index_mgf_ms2 = index_mgf_ms2, by_modules = by_modules, df0 = df0) @@ -244,15 +256,15 @@ reverse_seqs <- function (seqs) #' @param .path_bin The file path to binned precursor masses. 
#' @param reframe_mgfs Logical; if TRUE, recalculates the frame indexes of MGFs #' @inheritParams matchMS -calib_mgf <- function (mgf_path = NULL, aa_masses_all = NULL, out_path = NULL, - .path_bin, mod_indexes = NULL, type_ms2ions = "by", +calib_mgf <- function (mgf_path, aa_masses_all, out_path, .path_bin, + mod_indexes = NULL, type_ms2ions = "by", maxn_vmods_per_pep = 5L,maxn_sites_per_vmod = 3L, maxn_fnl_per_seq = 3L, maxn_vnl_per_seq = 3L, maxn_vmods_sitescombi_per_pep = 64L, minn_ms2 = 6L, ppm_ms1 = 20L, reframe_mgfs = TRUE, ppm_ms2 = 20L, min_mass = 200L, max_mass = 4500L, min_ms2mass = 115L, quant = "none", - ppm_reporters = 10L, index_mgf_ms2 = FALSE, + ppm_reporters = 10L, by_modules = TRUE, fasta = NULL, acc_type = NULL, acc_pattern = NULL, topn_ms2ions = 100L, fixedmods = NULL, varmods = NULL, enzyme = "trypsin_p", @@ -262,9 +274,8 @@ calib_mgf <- function (mgf_path = NULL, aa_masses_all = NULL, out_path = NULL, on.exit( if (exists(".savecall", envir = fun_env)) { if (.savecall) save_call2(path = file.path(out_path, "Calls"), fun = fun) - }, add = TRUE - ) - + }, add = TRUE) + fun <- as.character(match.call()[[1]]) fun_env <- environment() args <- names(formals(fun)) @@ -339,7 +350,6 @@ calib_mgf <- function (mgf_path = NULL, aa_masses_all = NULL, out_path = NULL, quant = "none", ppm_reporters = ppm_reporters, reframe_mgfs = reframe_mgfs, - index_mgf_ms2 = index_mgf_ms2, by_modules = by_modules, fasta = fasta, acc_type = acc_type, @@ -365,7 +375,6 @@ calib_mgf <- function (mgf_path = NULL, aa_masses_all = NULL, out_path = NULL, soft_secions = FALSE, out_path = out_path, min_ms2mass = min_ms2mass, - index_mgf_ms2 = index_mgf_ms2, tally_ms2ints = TRUE, # dummies @@ -396,7 +405,7 @@ calib_mgf <- function (mgf_path = NULL, aa_masses_all = NULL, out_path = NULL, file.rename(fi_mi2, fi_mi) ## mass calibration - fs_mgf <- list.files(mgf_path, "^mgf_queries.*\\.rds$") + fs_mgf <- list.files(mgf_path, "^mgf_queries_.*\\.rds$") fi_ion <- file.path(out_path, 
"temp", "prescores_1_1.rds") if (!length(fs_mgf)) @@ -411,11 +420,13 @@ calib_mgf <- function (mgf_path = NULL, aa_masses_all = NULL, out_path = NULL, stop("Column not found in search results: `raw_file`") dfs <- split(df, df[["raw_file"]]) - ord <- sort(as.integer(gsub("^mgf_queries_(\\d+)\\.rds", "\\1", fs_mgf))) - dfs <- dfs[ord] - fs_mgf <- fs_mgf[ord] - rm(list = c("df", "ord")) - + raws <- qs::qread(file.path(mgf_path, "raw_indexes.rds")) + ord <- match(names(dfs), as.character(raws)) + names(dfs) <- names(raws)[ord] + + ord_mgf <- match(gsub("^mgf_queries_(.*)\\.rds$", "\\1", fs_mgf), names(dfs)) + fs_mgf <- fs_mgf[ord_mgf] + len <- length(dfs) n_cores <- min(len, detect_cores(32L)) @@ -483,7 +494,7 @@ calib_ms1 <- function (filename, df = NULL, mgf_path = NULL, out_path = NULL, } diff_ms1 <- (df[["pep_exp_mr"]] - df[["theo_ms1"]])/df[["theo_ms1"]] * 1E6 - mdiff <- median(diff_ms1, na.rm = TRUE)/1E6 + mdiff <- median(diff_ms1, na.rm = TRUE)/1E6 if (n_row <= 100L || mdiff <= 1e-6) { mgfs[["ms1_mass"]] <- mgfs[["ms1_mass"]] - mdiff @@ -494,8 +505,7 @@ calib_ms1 <- function (filename, df = NULL, mgf_path = NULL, out_path = NULL, else { cvs <- lapply(range, cv_ms1err, k = 10, df = df) cvs <- unlist(cvs, recursive = FALSE, use.names = FALSE) - - stopifnot(length(cvs) == length(range)) + # stopifnot(length(cvs) == length(range)) if (all(is.na(cvs))) { mgfs[["ms1_mass"]] <- mgfs[["ms1_mass"]] - mdiff diff --git a/R/mztab.R b/R/mztab.R index d2004f0..e0b9f79 100644 --- a/R/mztab.R +++ b/R/mztab.R @@ -1,14 +1,16 @@ #' Makes an mzTab file. #' -#' With \code{mzion} searches and proteoQ preprocessing. +#' With \code{mzion} searches and \code{proteoQ} post-processing. #' -#' @param out_path A parent path where the outputs of \code{PSM}, \code{Peptide} -#' and \code{Protein} files and folders are. +#' @param mzion_path The parent parthwhere \code{mzion} search was performed. +#' @param proteoq_path A parent path where the \code{proteoQ} post-processing +#' was performed. 
#' @import dplyr -make_mztab <- function (out_path = stop("Provide the path.", call. = FALSE)) +make_mztab <- function (mzion_path = stop("Provide the path.", call. = FALSE), + proteoq_path = mzion_path) { ## MTD - load(file.path(out_path, "Calls", "matchMS.rda")) + load(file.path(mzion_path, "Calls", "matchMS.rda")) # Header hdrs <- local({ @@ -20,14 +22,26 @@ make_mztab <- function (out_path = stop("Provide the path.", call. = FALSE)) # Instrument and MGF format ans_mgfs <- local({ mgf_path <- call_pars$mgf_path - info_mgfs <- qs::qread(file.path(mgf_path, "info_format.rds")) + fi_fmt <- file.path(mgf_path, "info_format.rds") + + if (file.exists(fi_fmt)) { + info_mgfs <- qs::qread(fi_fmt) + } + else { + # the only non-MGF is MSConvert-mzML + info_mgfs <- list(data_format = "Thermo-RAW", mgf_format = "MSconvert") + } + data_format <- info_mgfs$data_format val_data_format <- paste0("[MS, , ", data_format, ", ]") mgf_format <- info_mgfs$mgf_format val_mgf_format <- paste0("[MS, , ", mgf_format, ", ]") - mgf_queries <- qs::qread(file.path(mgf_path, "mgf_queries.rds")) + query_files <- list.files(mgf_path, pattern = "^mgf_queries_.*\\.rds", + full.names = TRUE) + mgf_queries <- lapply(query_files, qs::qread) |> + dplyr::bind_rows() raw_files <- names(qs::qread(file.path(mgf_path, "raw_indexes.rds"))) ans_mgfs <- vector("list", length(raw_files)) @@ -36,9 +50,7 @@ make_mztab <- function (out_path = stop("Provide the path.", call. = FALSE)) nm_format <- paste0("ms_run[", i, "]-format") nm_location <- paste0("ms_run[", i, "]-location") nm_id_format <- paste0("ms_run[", i, "]-id_format") - val_location <- raw_files[i] - nms <- c(nm_format, nm_location, nm_id_format) vals <- c(val_data_format, val_location, val_mgf_format) @@ -49,18 +61,17 @@ make_mztab <- function (out_path = stop("Provide the path.", call. 
= FALSE)) }) # Software settings - load(file.path(out_path, "Calls", "mzion.rda")) - - proteom_info <- devtools::session_info$otherPkgs[[1]] - proteom_ver <- proteom_info$Version + load(file.path(mzion_path, "Calls", "mzion.rda")) + pkgs <- devtools::session_info() + mzion_ver <- pkgs$packages["mzion", "loadedversion"] ans_software_1 <- local({ ln_software_1 <- data.frame(nm = "software[1]", val = paste0("[MS, MS:0000000, mzion,", - proteom_ver, "]")) + mzion_ver, "]")) idxes <- which(unlist(lapply(call_pars, is.null))) - call_pars[[idxes]] <- "NULL" + call_pars[idxes] <- "NULL" rm(list = "idxes") fixedmods <- call_pars$fixedmods @@ -132,10 +143,10 @@ make_mztab <- function (out_path = stop("Provide the path.", call. = FALSE)) mtd <- cbind(field = "MTD", mtd) ## Proteins - df_prots <- readr::read_tsv(file.path(out_path, "Protein", "Protein.txt"), + df_prots <- readr::read_tsv(file.path(proteoq_path, "Protein", "Protein.txt"), show_col_types = FALSE) - df_peps <- readr::read_tsv(file.path(out_path, "Peptide", "Peptide.txt"), + df_peps <- readr::read_tsv(file.path(proteoq_path, "Peptide", "Peptide.txt"), show_col_types = FALSE) df_shared_prot_accs <- local({ @@ -220,7 +231,7 @@ make_mztab <- function (out_path = stop("Provide the path.", call. = FALSE)) prt <- local({ df <- df_prots[, grepl("^I[0-9]+", names(df_prots))] colnames(df) <- paste0("protein_abundance_study_variable[", 1:ncol(df), "]") - data.frame(cbind(prot_acc = df_prots$prot_acc, df), check.names = FALSE) + df <- data.frame(cbind(prot_acc = df_prots$prot_acc, df), check.names = FALSE) dplyr::left_join(prt, df, by = c("accession" = "prot_acc")) }) @@ -255,7 +266,7 @@ make_mztab <- function (out_path = stop("Provide the path.", call. 
= FALSE)) database_version = "null", search_engine = "mzion", "best_search_engine_score[1]" = df_peps$pep_score, - modifications = df_peps$pep_vmod, + modifications = df_peps$pep_seq_mod, opt_global_missed_cleavages = df_peps$pep_miss, reliability = 1L, ) @@ -267,12 +278,12 @@ make_mztab <- function (out_path = stop("Provide the path.", call. = FALSE)) }) ## PSMs - psm_files <- list.files(path = file.path(out_path, "PSM"), + psm_files <- list.files(path = file.path(proteoq_path, "PSM"), pattern = "TMTset[0-9]+_LCMSinj[0-9]+_PSM_N\\.txt$", all.files = TRUE) df_psms <- lapply(psm_files, - function (x) readr::read_tsv(file.path(out_path, "PSM", x), + function (x) readr::read_tsv(file.path(proteoq_path, "PSM", x), show_col_types = FALSE)) |> dplyr::bind_rows() @@ -348,13 +359,13 @@ make_mztab <- function (out_path = stop("Provide the path.", call. = FALSE)) lines_psm <- paste(lines_psm, collapse = "\n") lines_psm <- paste(lines_psm, "\n") - dir.create(file.path(out_path, "mzTab"), showWarnings = FALSE, recursive = TRUE) - out_file <- file.path(out_path, "mzTab", "mztab.mzTab") + dir.create(file.path(proteoq_path, "mzTab"), showWarnings = FALSE, recursive = TRUE) + out_file <- file.path(proteoq_path, "mzTab", "mztab.mzTab") out <- Reduce(append, list(lines_mtd, lines_prt, lines_pep, lines_psm)) writeLines(out, out_file) - out + invisible(NULL) } diff --git a/R/scores.R b/R/scores.R index 61ddea2..d026c72 100644 --- a/R/scores.R +++ b/R/scores.R @@ -254,7 +254,7 @@ calc_probi_byvmods <- function (df, nms, expt_moverzs, expt_ints, # expt_charges N = 500L, type_ms2ions = "by", topn_ms2ions = 100L, ppm_ms2 = 20L, soft_secions = FALSE, burn_ins = 1:2, - min_ms2mass = 115L, d2 = 1E-5, index_mgf_ms2 = FALSE, + min_ms2mass = 115L, d2 = 1E-5, tally_ms2ints = TRUE, digits = 4L) { df_theo <- df[["theo"]] @@ -295,11 +295,11 @@ calc_probi_byvmods <- function (df, nms, expt_moverzs, expt_ints, # expt_charges nna <- !is.na(expt_charges) expt_one[nna & (expt_charges > 1L)] <- 
NA_integer_ expt_mul[nna & (expt_charges == 1L)] <- NA_integer_ - df2_one <- match_ex2th2(expt_one, tt2_one, min_ms2mass, d2, index_mgf_ms2) - df2_mul <- match_ex2th2(expt_mul, tt2_mul, min_ms2mass, d2, index_mgf_ms2) + df2_one <- match_ex2th2(expt_one, tt2_one, min_ms2mass, d2) + df2_mul <- match_ex2th2(expt_mul, tt2_mul, min_ms2mass, d2) } - df2 <- match_ex2th2(expt_moverzs, tt2, min_ms2mass, d2, index_mgf_ms2) + df2 <- match_ex2th2(expt_moverzs, tt2, min_ms2mass, d2) ith2 <- df2[["ith"]] iex2 <- df2[["iex"]] @@ -416,7 +416,7 @@ calc_probi_bypep <- function (mts, nms, expt_moverzs, expt_ints, # expt_charges, N = 500L, type_ms2ions = "by", topn_ms2ions = 100L, ppm_ms2 = 20L, soft_secions = FALSE, min_ms2mass = 115L, d2 = 1E-5, - index_mgf_ms2 = FALSE, tally_ms2ints = TRUE, + tally_ms2ints = TRUE, digits = 4L) { ## for different positions: $TNLAMMR$`0000500`, $TNLAMMR$`0000050` @@ -438,7 +438,6 @@ calc_probi_bypep <- function (mts, nms, expt_moverzs, expt_ints, # expt_charges, burn_ins = c(1:2), min_ms2mass = min_ms2mass, d2 = d2, - index_mgf_ms2 = index_mgf_ms2, tally_ms2ints = tally_ms2ints, digits = digits ), @@ -481,7 +480,7 @@ calc_probi_bypep <- function (mts, nms, expt_moverzs, expt_ints, # expt_charges, calc_probi <- function (mts, expt_moverzs, expt_ints, # expt_charges, N = 500L, type_ms2ions = "by", topn_ms2ions = 100L, ppm_ms2 = 20L, soft_secions = FALSE, - min_ms2mass = 115L, d2 = 1E-5, index_mgf_ms2 = FALSE, + min_ms2mass = 115L, d2 = 1E-5, tally_ms2ints = TRUE, digits = 4L) { out <- mapply( @@ -498,7 +497,6 @@ calc_probi <- function (mts, expt_moverzs, expt_ints, # expt_charges, ppm_ms2 = ppm_ms2, min_ms2mass = min_ms2mass, d2 = d2, - index_mgf_ms2 = index_mgf_ms2, tally_ms2ints = tally_ms2ints, digits = digits ), @@ -520,7 +518,7 @@ calc_probi <- function (mts, expt_moverzs, expt_ints, # expt_charges, #' @import purrr scalc_pepprobs <- function (entry, topn_ms2ions = 100L, type_ms2ions = "by", ppm_ms2 = 20L, soft_secions = FALSE, - min_ms2mass = 
115L, d2 = 1E-5, index_mgf_ms2 = FALSE, + min_ms2mass = 115L, d2 = 1E-5, tally_ms2ints = TRUE, digits = 4L) { # only one experimental set of values and thus `[[1]]` @@ -573,7 +571,6 @@ scalc_pepprobs <- function (entry, topn_ms2ions = 100L, type_ms2ions = "by", soft_secions = soft_secions, min_ms2mass = min_ms2mass, d2 = d2, - index_mgf_ms2 = index_mgf_ms2, tally_ms2ints = tally_ms2ints, digits = digits) @@ -598,7 +595,7 @@ calc_pepprobs_i <- function (df, topn_ms2ions = 100L, type_ms2ions = "by", ppm_ms2 = 20L, soft_secions = FALSE, out_path = "~/mzion/outs", min_ms2mass = 115L, d2 = 1E-5, - index_mgf_ms2 = FALSE, tally_ms2ints = TRUE, + tally_ms2ints = TRUE, digits = 4L) { n_rows <- nrow(df) @@ -622,7 +619,6 @@ calc_pepprobs_i <- function (df, topn_ms2ions = 100L, type_ms2ions = "by", soft_secions = soft_secions, min_ms2mass = min_ms2mass, d2 = d2, - index_mgf_ms2 = index_mgf_ms2, tally_ms2ints = tally_ms2ints, digits = digits) @@ -644,7 +640,7 @@ calc_pepscores <- function (topn_ms2ions = 100L, type_ms2ions = "by", min_len = 7L, max_len = 40L, ppm_ms2 = 20L, soft_secions = FALSE, out_path = "~/mzion/outs", - min_ms2mass = 115L, index_mgf_ms2 = FALSE, + min_ms2mass = 115L, tally_ms2ints = TRUE, mgf_path, maxn_vmods_per_pep = 5L, maxn_sites_per_vmod = 3L, maxn_vmods_sitescombi_per_pep = 64L, minn_ms2 = 6L, @@ -790,7 +786,6 @@ calc_pepscores <- function (topn_ms2ions = 100L, type_ms2ions = "by", out_path = out_path, min_ms2mass = min_ms2mass, d2 = d2, - index_mgf_ms2 = index_mgf_ms2, tally_ms2ints = tally_ms2ints, add_ms2theos = add_ms2theos, add_ms2theos2 = add_ms2theos2, @@ -984,7 +979,7 @@ find_targets <- function (out_path, pattern = "^ion_matches_") calcpepsc <- function (file, im_path, pep_fmod_all, pep_vmod_all, topn_ms2ions = 100L, type_ms2ions = "by", ppm_ms2 = 20L, soft_secions = FALSE, out_path = NULL, - min_ms2mass = 115L, d2 = 1E-5, index_mgf_ms2 = FALSE, + min_ms2mass = 115L, d2 = 1E-5, tally_ms2ints = TRUE, add_ms2theos = FALSE, add_ms2theos2 = 
FALSE, add_ms2moverzs = FALSE, add_ms2ints = FALSE, quant = "none", ppm_reporters = 10, @@ -1053,7 +1048,6 @@ calcpepsc <- function (file, im_path, pep_fmod_all, pep_vmod_all, out_path = out_path, min_ms2mass = min_ms2mass, d2 = d2, - index_mgf_ms2 = index_mgf_ms2, tally_ms2ints = tally_ms2ints, digits = digits) @@ -1130,7 +1124,7 @@ calcpepsc <- function (file, im_path, pep_fmod_all, pep_vmod_all, hadd_primatches <- function (out_path = NULL, is_notched = FALSE, add_ms2theos = FALSE, add_ms2theos2 = FALSE, add_ms2moverzs = FALSE, add_ms2ints = FALSE, - by_modules = TRUE, index_mgf_ms2 = FALSE) + by_modules = TRUE) { # the same as those in calcpepsc cols_sc <- c("pep_seq", "pep_n_ms2", "pep_scan_title", "pep_exp_mz", "pep_exp_mr", @@ -1164,8 +1158,7 @@ hadd_primatches <- function (out_path = NULL, is_notched = FALSE, add_ms2theos = add_ms2theos, add_ms2theos2 = add_ms2theos2, add_ms2moverzs = add_ms2moverzs, - add_ms2ints = add_ms2ints, - index_mgf_ms2 = index_mgf_ms2) + add_ms2ints = add_ms2ints) parallel::stopCluster(cl) ms_files <- order_fracs(type = "ms2info", tempdir, by_modules) @@ -1204,7 +1197,7 @@ hadd_primatches <- function (out_path = NULL, is_notched = FALSE, #' @inheritParams matchMS add_primatches <- function (file = NULL, tempdir = NULL, add_ms2theos = FALSE, add_ms2theos2 = FALSE, add_ms2moverzs = FALSE, - add_ms2ints = FALSE, index_mgf_ms2 = FALSE) + add_ms2ints = FALSE) { df <- qs::qread(file.path(tempdir, file)) @@ -1286,20 +1279,10 @@ add_primatches <- function (file = NULL, tempdir = NULL, add_ms2theos = FALSE, p1s.[[i]] <- ps1 } - if (index_mgf_ms2) { - # need to convert theoretical m/z to integers; - # even so the resolution is limited by bin with (e.g. 
10 ppm) - pep_ms2_deltas <- NA_character_ - pep_ms2_deltas2 <- NA_character_ - pep_ms2_deltas_mean <- NA_real_ - pep_ms2_deltas_sd <- NA_real_ - } - else { - df[["pep_ms2_deltas"]] <- do.call(rbind, d1s) - df[["pep_ms2_deltas2"]] <- do.call(rbind, d2s) - df[["pep_ms2_deltas_mean"]] <- do.call(rbind, me1s) - df[["pep_ms2_deltas_sd"]] <- do.call(rbind, sd1s) - } + df[["pep_ms2_deltas"]] <- do.call(rbind, d1s) + df[["pep_ms2_deltas2"]] <- do.call(rbind, d2s) + df[["pep_ms2_deltas_mean"]] <- do.call(rbind, me1s) + df[["pep_ms2_deltas_sd"]] <- do.call(rbind, sd1s) df[["pep_ms2_ideltas"]] <- do.call(rbind, p1s) df[["pep_ms2_ideltas2"]] <- do.call(rbind, p2s) @@ -2757,17 +2740,14 @@ find_ppm_outer_bycombi <- function (X, Y, ppm_ms2 = 20L) #' Matches between secondary experimentals and theoreticals. #' -#' At \code{index_mgf_ms2 = FALSE}. -#' #' @param expt A vector of experimental m-over-z values. #' @param theo A vector of theoretical m-over-z values. #' @param d Bin size, e.g., \eqn{20 ppm / 2 * 1E-6}. 
#' @inheritParams matchMS -match_ex2th2 <- function (expt, theo, min_ms2mass = 115L, d = 1E-5, - index_mgf_ms2 = FALSE) +match_ex2th2 <- function (expt, theo, min_ms2mass = 115L, d = 1E-5) { th <- index_mz(theo, from = min_ms2mass, d = d) - ex <- if (index_mgf_ms2) expt else index_mz(expt, from = min_ms2mass, d = d) + ex <- index_mz(expt, from = min_ms2mass, d = d) t2e <- fastmatch::fmatch(c(th, th - 1L, th + 1L), ex, nomatch = 0L) l <- length(th) diff --git a/R/zzz.R b/R/zzz.R index 8d09e49..9981176 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -2,15 +2,10 @@ .onAttach <- function(libname, pkgname) { packageStartupMessage("Welcome to mzion.\n\n", "============================================================================================\n", - # "NEW features (v1.2.4):\n", - # "[x] Incompatible with cached results from previous versions.\n\n", - "[x] For examples, enter \"?matchMS\".\n", - # "[x] Please delete cached \"\temp\pep_score.rds\" for reprocessing wither older versions.\n", + "NEW features (v1.3.3.4):\n", + "[x] MS1, MS2 de-isotoping and chimeric peptide searches.\n\n", + "[x] For documents, enter \"?matchMS\".\n", - # "[x] Added Percolator utility.\n", - # "[x] See also package `proteoQ` for downstream data QA and informatics.\n", - # "\n", - # "Notes:\n", "[x] Suggested configuration for large datasets: 32GB RAM and 8-cores.\n", # "[x] May need to remove previously cached results (or use a new .path_cache and .path_fasta).\n", diff --git a/README.md b/README.md index 67bbc43..255c070 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ Mzion ================ true -2023-09-13 +2023-11-12 - [Installation](#installation) - [Peaklist formats](#peaklist-formats) @@ -47,8 +47,6 @@ devtools::install_github("qzhang503/mzionShiny") 1) peakPicking: vendor msLevel = 1- 2) (optional) zeroSamples: removeExtra 1- - - - [x] `Proteome Discoverer mgf` - Bruker’s MS - [x] `DataAnalysis mgf` diff --git a/man/add_primatches.Rd b/man/add_primatches.Rd index 71c158f..4ad4316 
100644 --- a/man/add_primatches.Rd +++ b/man/add_primatches.Rd @@ -10,8 +10,7 @@ add_primatches( add_ms2theos = FALSE, add_ms2theos2 = FALSE, add_ms2moverzs = FALSE, - add_ms2ints = FALSE, - index_mgf_ms2 = FALSE + add_ms2ints = FALSE ) } \arguments{ @@ -40,20 +39,6 @@ add_primatches( \item{add_ms2ints}{Logical; if TRUE, adds the sequence of experimental MS2 intensity values (\code{pep_ms2_ints}).} - -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} } \description{ Applied to both targets and decoys as feature "pep_ms2_deltas_mean" may be diff --git a/man/calc_pepprobs_i.Rd b/man/calc_pepprobs_i.Rd index d13b992..e4fe8a1 100644 --- a/man/calc_pepprobs_i.Rd +++ b/man/calc_pepprobs_i.Rd @@ -13,7 +13,6 @@ calc_pepprobs_i( out_path = "~/mzion/outs", min_ms2mass = 115L, d2 = 1e-05, - index_mgf_ms2 = FALSE, tally_ms2ints = TRUE, digits = 4L ) @@ -45,20 +44,6 @@ interrogation. The default is 110.} \item{d2}{Bin width in ppm divided by 1E6.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. 
- The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{tally_ms2ints}{Logical; tally MS2 intensities or not.} \item{digits}{A non-negative integer; the number of decimal places to be diff --git a/man/calc_pepscores.Rd b/man/calc_pepscores.Rd index ecc03a7..aa8692b 100644 --- a/man/calc_pepscores.Rd +++ b/man/calc_pepscores.Rd @@ -14,7 +14,6 @@ calc_pepscores( soft_secions = FALSE, out_path = "~/mzion/outs", min_ms2mass = 115L, - index_mgf_ms2 = FALSE, tally_ms2ints = TRUE, mgf_path, maxn_vmods_per_pep = 5L, @@ -73,20 +72,6 @@ TRUE} on search performance has not yet been assessed.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation. The default is 110.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. 
Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{tally_ms2ints}{Logical; tally MS2 intensities or not.} \item{mgf_path}{A file path to a list of MGF files. The experimenter needs to diff --git a/man/calc_probi.Rd b/man/calc_probi.Rd index ca4be23..671c2eb 100644 --- a/man/calc_probi.Rd +++ b/man/calc_probi.Rd @@ -15,7 +15,6 @@ calc_probi( soft_secions = FALSE, min_ms2mass = 115L, d2 = 1e-05, - index_mgf_ms2 = FALSE, tally_ms2ints = TRUE, digits = 4L ) @@ -51,20 +50,6 @@ interrogation. The default is 110.} \item{d2}{Bin width in ppm divided by 1E6.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. 
Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{tally_ms2ints}{Logical; tally MS2 intensities or not.} \item{digits}{A non-negative integer; the number of decimal places to be diff --git a/man/calc_probi_bypep.Rd b/man/calc_probi_bypep.Rd index dfc37dc..4923215 100644 --- a/man/calc_probi_bypep.Rd +++ b/man/calc_probi_bypep.Rd @@ -16,7 +16,6 @@ calc_probi_bypep( soft_secions = FALSE, min_ms2mass = 115L, d2 = 1e-05, - index_mgf_ms2 = FALSE, tally_ms2ints = TRUE, digits = 4L ) @@ -54,20 +53,6 @@ interrogation. The default is 110.} \item{d2}{Bin width in ppm divided by 1E6.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. 
Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{tally_ms2ints}{Logical; tally MS2 intensities or not.} \item{digits}{A non-negative integer; the number of decimal places to be diff --git a/man/calc_probi_byvmods.Rd b/man/calc_probi_byvmods.Rd index 458ef82..4668871 100644 --- a/man/calc_probi_byvmods.Rd +++ b/man/calc_probi_byvmods.Rd @@ -17,7 +17,6 @@ calc_probi_byvmods( burn_ins = 1:2, min_ms2mass = 115L, d2 = 1e-05, - index_mgf_ms2 = FALSE, tally_ms2ints = TRUE, digits = 4L ) @@ -59,20 +58,6 @@ interrogation. The default is 110.} \item{d2}{Bin width in ppm divided by 1E6.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. 
Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{tally_ms2ints}{Logical; tally MS2 intensities or not.} \item{digits}{A non-negative integer; the number of decimal places to be diff --git a/man/calcpepsc.Rd b/man/calcpepsc.Rd index 487cb86..7c4c3e0 100644 --- a/man/calcpepsc.Rd +++ b/man/calcpepsc.Rd @@ -16,7 +16,6 @@ calcpepsc( out_path = NULL, min_ms2mass = 115L, d2 = 1e-05, - index_mgf_ms2 = FALSE, tally_ms2ints = TRUE, add_ms2theos = FALSE, add_ms2theos2 = FALSE, @@ -62,20 +61,6 @@ interrogation. The default is 110.} \item{d2}{Bin width in ppm divided by 1E6} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{tally_ms2ints}{Logical; tally MS2 intensities or not.} \item{add_ms2theos}{Logical. 
If true, adds the sequence of primary diff --git a/man/calib_mgf.Rd b/man/calib_mgf.Rd index 1c38a08..1ae2b6c 100644 --- a/man/calib_mgf.Rd +++ b/man/calib_mgf.Rd @@ -5,9 +5,9 @@ \title{MGF precursor mass calibration.} \usage{ calib_mgf( - mgf_path = NULL, - aa_masses_all = NULL, - out_path = NULL, + mgf_path, + aa_masses_all, + out_path, .path_bin, mod_indexes = NULL, type_ms2ions = "by", @@ -25,7 +25,6 @@ calib_mgf( min_ms2mass = 115L, quant = "none", ppm_reporters = 10L, - index_mgf_ms2 = FALSE, by_modules = TRUE, fasta = NULL, acc_type = NULL, @@ -129,20 +128,6 @@ plexes. For example, apply \code{tmt16} for \code{tmt12} provided a set of \item{ppm_reporters}{A positive integer; the mass tolerance of MS2 reporter ions. The default is 10.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{by_modules}{Not used. Logical. At the TRUE default, searches MS data by individual modules of combinatorial fixed and variable modifications. If FALSE, search all modules together. 
The later would probably need more than diff --git a/man/find_ms2_bypep.Rd b/man/find_ms2_bypep.Rd index 7ee104f..3967930 100644 --- a/man/find_ms2_bypep.Rd +++ b/man/find_ms2_bypep.Rd @@ -11,8 +11,7 @@ find_ms2_bypep( d = NULL, ppm_ms2 = 10L, min_ms2mass = 115L, - minn_ms2 = 6L, - index_mgf_ms2 = FALSE + minn_ms2 = 6L ) } \arguments{ @@ -33,20 +32,6 @@ interrogation. The default is 110.} \item{minn_ms2}{A positive integer; the minimum number of matched MS2 ions for consideration as a hit. Counts of secondary ions, e.g. b0, b* etc., are not part of the threshold.} - -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} } \value{ Lists of (1) theo, (2) expt, (3) ith, (4) iex and (5) m. diff --git a/man/frames_adv.Rd b/man/frames_adv.Rd index 8dc82a4..d4d58bf 100644 --- a/man/frames_adv.Rd +++ b/man/frames_adv.Rd @@ -30,7 +30,6 @@ frames_adv( ppm_ms1 = 10L, ppm_ms2 = 10L, min_ms2mass = 115L, - index_mgf_ms2 = FALSE, FUN ) } @@ -116,20 +115,6 @@ a hit.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation. The default is 110.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. 
Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{FUN}{A function pointer to, e.g., \link{gen_ms2ions_base}.} } \value{ diff --git a/man/hadd_primatches.Rd b/man/hadd_primatches.Rd index 9d45725..6a74ea8 100644 --- a/man/hadd_primatches.Rd +++ b/man/hadd_primatches.Rd @@ -11,8 +11,7 @@ hadd_primatches( add_ms2theos2 = FALSE, add_ms2moverzs = FALSE, add_ms2ints = FALSE, - by_modules = TRUE, - index_mgf_ms2 = FALSE + by_modules = TRUE ) } \arguments{ @@ -46,20 +45,6 @@ intensity values (\code{pep_ms2_ints}).} individual modules of combinatorial fixed and variable modifications. If FALSE, search all modules together. The later would probably need more than 32G RAM if the number of modules is over 96.} - -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. 
For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} } \description{ Helper of \link{add_primatches} diff --git a/man/hms2match.Rd b/man/hms2match.Rd index f594bff..057f3b8 100644 --- a/man/hms2match.Rd +++ b/man/hms2match.Rd @@ -26,7 +26,6 @@ hms2match( ppm_ms1 = 10L, ppm_ms2 = 10L, min_ms2mass = 115L, - index_mgf_ms2 = FALSE, by_modules = FALSE, df0 = NULL ) @@ -127,20 +126,6 @@ default is 20.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation. The default is 110.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{by_modules}{Not used. Logical. At the TRUE default, searches MS data by individual modules of combinatorial fixed and variable modifications. If FALSE, search all modules together. 
The later would probably need more than diff --git a/man/hms2match_one.Rd b/man/hms2match_one.Rd index d369689..6db7343 100644 --- a/man/hms2match_one.Rd +++ b/man/hms2match_one.Rd @@ -30,7 +30,6 @@ hms2match_one( ppm_ms1 = 10L, ppm_ms2 = 10L, min_ms2mass = 115L, - index_mgf_ms2 = FALSE, df0 = NULL ) } @@ -138,20 +137,6 @@ default is 20.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation. The default is 110.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. 
Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{df0}{An output template.} } \description{ diff --git a/man/load_mgfs.Rd b/man/load_mgfs.Rd index a29b895..83ea42f 100644 --- a/man/load_mgfs.Rd +++ b/man/load_mgfs.Rd @@ -23,20 +23,18 @@ load_mgfs( tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, exclude_reporter_region = FALSE, - index_mgf_ms2 = FALSE, is_ms1_three_frame = TRUE, is_ms2_three_frame = TRUE, mgf_cutmzs = numeric(), mgf_cutpercs = numeric(), enzyme = "trypsin_p", - is_mdda = FALSE, deisotope_ms2 = TRUE, grad_isotope = 2.5, fct_iso2 = 3, max_ms2_charge = 3L, use_defpeaks = FALSE, maxn_dia_precurs = 300L, - maxn_mdda_precurs = 5L, + maxn_mdda_precurs = 1L, n_mdda_flanks = 6L, ppm_ms1_deisotope = 10L, ppm_ms2_deisotope = 10L, @@ -107,20 +105,6 @@ region of TMT reporter ions. The default is FALSE. The corresponding range of TMT reporter ions is informed by \code{tmt_reporter_lower} and \code{tmt_reporter_upper}. The argument affects only TMT data.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. 
Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{is_ms1_three_frame}{Logical; is the searches by the three frames of preceding, current and following.} diff --git a/man/make_mztab.Rd b/man/make_mztab.Rd index 08adb2d..7ef9512 100644 --- a/man/make_mztab.Rd +++ b/man/make_mztab.Rd @@ -4,12 +4,17 @@ \alias{make_mztab} \title{Makes an mzTab file.} \usage{ -make_mztab(out_path = stop("Provide the path.", call. = FALSE)) +make_mztab( + mzion_path = stop("Provide the path.", call. = FALSE), + proteoq_path = mzion_path +) } \arguments{ -\item{out_path}{A parent path where the outputs of \code{PSM}, \code{Peptide} -and \code{Protein} files and folders are.} +\item{mzion_path}{The parent path where \code{mzion} search was performed.} + +\item{proteoq_path}{A parent path where the \code{proteoQ} post-processing +was performed.} } \description{ -With \code{mzion} searches and proteoQ preprocessing. +With \code{mzion} searches and \code{proteoQ} post-processing. } diff --git a/man/map_raw_n_scan_old.Rd b/man/map_raw_n_scan_old.Rd new file mode 100644 index 0000000..aca70e2 --- /dev/null +++ b/man/map_raw_n_scan_old.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/msmsmatches.R +\name{map_raw_n_scan_old} +\alias{map_raw_n_scan_old} +\title{Maps raw_file and scan_title from indexes to real values.} +\usage{ +map_raw_n_scan_old(df, mgf_path) +} +\arguments{ +\item{df}{A data frame.} + +\item{mgf_path}{A file path to a list of MGF files. The experimenter needs to + supply the files. + + The supported MGFs are in the formats of (1) MSConvert against \code{.raw} + from Thermo's Orbitrap or \code{.d} from Bruker's timsTOF Pro, (2) Thermo's + Proteome Discoverer or (3) Bruker's DataAnalysis. 
+ + With MSConvert, the default \code{titleMaker} is required for correct + parsing (don't think it can be altered by users, but just in case).} +} +\description{ +For backward compatibility. +} diff --git a/man/matchMS.Rd b/man/matchMS.Rd index 7ed8d3a..fee5af4 100644 --- a/man/matchMS.Rd +++ b/man/matchMS.Rd @@ -56,7 +56,6 @@ matchMS( tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, exclude_reporter_region = FALSE, - index_mgf_ms2 = FALSE, ppm_reporters = 10L, quant = c("none", "tmt6", "tmt10", "tmt11", "tmt16", "tmt18"), target_fdr = 0.01, @@ -355,20 +354,6 @@ region of TMT reporter ions. The default is FALSE. The corresponding range of TMT reporter ions is informed by \code{tmt_reporter_lower} and \code{tmt_reporter_upper}. The argument affects only TMT data.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{ppm_reporters}{A positive integer; the mass tolerance of MS2 reporter ions. 
The default is 10.} diff --git a/man/match_ex2th2.Rd b/man/match_ex2th2.Rd index 519afef..a340729 100644 --- a/man/match_ex2th2.Rd +++ b/man/match_ex2th2.Rd @@ -4,7 +4,7 @@ \alias{match_ex2th2} \title{Matches between secondary experimentals and theoreticals.} \usage{ -match_ex2th2(expt, theo, min_ms2mass = 115L, d = 1e-05, index_mgf_ms2 = FALSE) +match_ex2th2(expt, theo, min_ms2mass = 115L, d = 1e-05) } \arguments{ \item{expt}{A vector of experimental m-over-z values.} @@ -15,21 +15,7 @@ match_ex2th2(expt, theo, min_ms2mass = 115L, d = 1e-05, index_mgf_ms2 = FALSE) interrogation. The default is 110.} \item{d}{Bin size, e.g., \eqn{20 ppm / 2 * 1E-6}.} - -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} } \description{ -At \code{index_mgf_ms2 = FALSE}. +Matches between secondary experimentals and theoreticals. 
} diff --git a/man/mframes_adv.Rd b/man/mframes_adv.Rd index 41c7f48..a85a089 100644 --- a/man/mframes_adv.Rd +++ b/man/mframes_adv.Rd @@ -21,8 +21,7 @@ mframes_adv( minn_ms2 = 6L, ppm_ms1 = 10L, ppm_ms2 = 10L, - min_ms2mass = 115L, - index_mgf_ms2 = FALSE + min_ms2mass = 115L ) } \arguments{ @@ -92,20 +91,6 @@ a hit.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation. The default is 110.} - -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} } \value{ Matches to each MGF as a list elements. The length of the output is diff --git a/man/ms2match.Rd b/man/ms2match.Rd index 7cc7490..c7d609c 100644 --- a/man/ms2match.Rd +++ b/man/ms2match.Rd @@ -43,7 +43,6 @@ ms2match( min_len, max_len, max_miss, - index_mgf_ms2 = FALSE, first_search = FALSE, .savecall = TRUE ) @@ -215,20 +214,6 @@ for considerations. Longer peptides will be excluded. The default is 40.} \item{max_miss}{A non-negative integer; the maximum number of mis-cleavages per peptide sequence for considerations. The default is 2.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. 
Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{first_search}{Logical; is the first search (for MGF mass calibration) or not.} diff --git a/man/ms2match_all.Rd b/man/ms2match_all.Rd index f00a11d..e4f32ed 100644 --- a/man/ms2match_all.Rd +++ b/man/ms2match_all.Rd @@ -23,7 +23,6 @@ ms2match_all( ppm_ms1 = 10L, ppm_ms2 = 10L, min_ms2mass = 115L, - index_mgf_ms2 = FALSE, df0 = NULL ) } @@ -103,20 +102,6 @@ default is 20.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation. The default is 110.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. 
Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{df0}{An output template} } \description{ diff --git a/man/ms2match_one.Rd b/man/ms2match_one.Rd index 77efc13..549aaed 100644 --- a/man/ms2match_one.Rd +++ b/man/ms2match_one.Rd @@ -27,7 +27,6 @@ ms2match_one( ppm_ms1 = 10L, ppm_ms2 = 10L, min_ms2mass = 115L, - index_mgf_ms2 = FALSE, df0 = NULL ) } @@ -115,20 +114,6 @@ default is 20.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation. The default is 110.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. 
Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{df0}{An output template} } \description{ diff --git a/man/post_readmgf.Rd b/man/post_readmgf.Rd index 9a5c077..a13a7df 100644 --- a/man/post_readmgf.Rd +++ b/man/post_readmgf.Rd @@ -4,11 +4,20 @@ \alias{post_readmgf} \title{Post-processing of MGF or mzML} \usage{ -post_readmgf(df, min_mass = 200L, max_mass = 4500L, ppm_ms1 = 10L, filepath) +post_readmgf( + df, + raw_id, + mgf_path, + min_mass = 200L, + max_mass = 4500L, + ppm_ms1 = 10L +) } \arguments{ \item{df}{A data frame of processed peak lists.} +\item{raw_id}{An ID to replace the original RAW file name.} + \item{min_mass}{Numeric; the minimum mass of MS1 species. The value needs to match the one in \link{binTheoSeqs}.} @@ -16,8 +25,6 @@ match the one in \link{binTheoSeqs}.} \item{ppm_ms1}{A positive integer; the mass tolerance of MS1 species. The default is 20.} - -\item{filepath}{The file path to a list of MGF or mzML files.} } \description{ Calculates mass \code{frame}s etc. diff --git a/man/proc_dda.Rd b/man/proc_dda.Rd index 26d3d5c..27efd3a 100644 --- a/man/proc_dda.Rd +++ b/man/proc_dda.Rd @@ -28,8 +28,7 @@ proc_dda( quant = "none", tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, - exclude_reporter_region = FALSE, - index_mgf_ms2 = FALSE + exclude_reporter_region = FALSE ) } \arguments{ @@ -93,20 +92,6 @@ The default is \eqn{135.2}.} region of TMT reporter ions. The default is FALSE. The corresponding range of TMT reporter ions is informed by \code{tmt_reporter_lower} and \code{tmt_reporter_upper}. The argument affects only TMT data.} - -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. 
- The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} } \description{ Proc mzML data for DDA workflows. diff --git a/man/proc_mgf_chunks.Rd b/man/proc_mgf_chunks.Rd index 501fefc..d8c2619 100644 --- a/man/proc_mgf_chunks.Rd +++ b/man/proc_mgf_chunks.Rd @@ -35,8 +35,6 @@ proc_mgf_chunks( tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, exclude_reporter_region = FALSE, - index_mgf_ms2 = FALSE, - is_mdda = FALSE, deisotope_ms2 = TRUE, max_ms2_charge = 3L, use_defpeaks = FALSE, @@ -130,20 +128,6 @@ region of TMT reporter ions. The default is FALSE. The corresponding range of TMT reporter ions is informed by \code{tmt_reporter_lower} and \code{tmt_reporter_upper}. The argument affects only TMT data.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. 
Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{deisotope_ms2}{Logical; if TRUE, de-isotope MS2 features.} \item{max_ms2_charge}{Maximum charge states for consideration with MS2 diff --git a/man/proc_mgfs.Rd b/man/proc_mgfs.Rd index cb5be0c..28e3ed1 100644 --- a/man/proc_mgfs.Rd +++ b/man/proc_mgfs.Rd @@ -35,8 +35,6 @@ proc_mgfs( tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, exclude_reporter_region = FALSE, - index_mgf_ms2 = FALSE, - is_mdda = FALSE, deisotope_ms2 = TRUE, max_ms2_charge = 3L, use_defpeaks = FALSE, @@ -130,20 +128,6 @@ region of TMT reporter ions. The default is FALSE. The corresponding range of TMT reporter ions is informed by \code{tmt_reporter_lower} and \code{tmt_reporter_upper}. The argument affects only TMT data.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. 
Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{deisotope_ms2}{Logical; if TRUE, de-isotope MS2 features.} \item{max_ms2_charge}{Maximum charge states for consideration with MS2 diff --git a/man/proc_mzml.Rd b/man/proc_mzml.Rd index 0f17b58..45d5949 100644 --- a/man/proc_mzml.Rd +++ b/man/proc_mzml.Rd @@ -6,6 +6,8 @@ \usage{ proc_mzml( file, + raw_id, + filepath, topn_ms2ions = 100L, ms1_charge_range = c(2L, 4L), ret_range = c(0, Inf), @@ -20,8 +22,6 @@ proc_mzml( tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, exclude_reporter_region = FALSE, - index_mgf_ms2 = FALSE, - is_mdda = FALSE, deisotope_ms2 = TRUE, max_ms2_charge = 3L, use_defpeaks = FALSE, @@ -39,6 +39,10 @@ proc_mzml( \arguments{ \item{file}{A file name to mzML with a prepending path.} +\item{raw_id}{A RAW file ID.} + +\item{filepath}{A file path of MGF.} + \item{topn_ms2ions}{A non-negative integer; the top-n species for uses in MS2 ion searches. The default is to use the top-100 ions in an MS2 event.} @@ -78,20 +82,6 @@ region of TMT reporter ions. The default is FALSE. The corresponding range of TMT reporter ions is informed by \code{tmt_reporter_lower} and \code{tmt_reporter_upper}. The argument affects only TMT data.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. 
For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{deisotope_ms2}{Logical; if TRUE, de-isotope MS2 features.} \item{max_ms2_charge}{Maximum charge states for consideration with MS2 diff --git a/man/readMGF.Rd b/man/readMGF.Rd index baba9f1..9d67a8d 100644 --- a/man/readMGF.Rd +++ b/man/readMGF.Rd @@ -7,6 +7,7 @@ readMGF( filepath = NULL, filelist = NULL, + out_path = NULL, min_mass = 200L, max_mass = 4500L, min_ms2mass = 115L, @@ -20,11 +21,8 @@ readMGF( tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, exclude_reporter_region = FALSE, - index_mgf_ms2 = FALSE, mgf_cutmzs = numeric(), mgf_cutpercs = numeric(), - out_path = file.path(filepath, "mgf_queries_1.rds"), - is_mdda = FALSE, deisotope_ms2 = TRUE, max_ms2_charge = 3L, use_defpeaks = FALSE, @@ -42,6 +40,8 @@ readMGF( \item{filelist}{A list of MGF or mzML files.} +\item{out_path}{An output path.} + \item{min_mass}{Numeric; the minimum mass of MS1 species. The value needs to match the one in \link{binTheoSeqs}.} @@ -78,27 +78,11 @@ region of TMT reporter ions. The default is FALSE. The corresponding range of TMT reporter ions is informed by \code{tmt_reporter_lower} and \code{tmt_reporter_upper}. The argument affects only TMT data.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. 
- - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{mgf_cutmzs}{Cut points of MS1 m-over-z values in peak picking.} \item{mgf_cutpercs}{The counts of MS2 features in each region of \code{mgf_cutmzs}.} -\item{out_path}{An output path.} - \item{deisotope_ms2}{Logical; if TRUE, de-isotope MS2 features.} \item{max_ms2_charge}{Maximum charge states for consideration with MS2 diff --git a/man/read_mgf_chunks.Rd b/man/read_mgf_chunks.Rd index 63ed621..b2f5725 100644 --- a/man/read_mgf_chunks.Rd +++ b/man/read_mgf_chunks.Rd @@ -5,7 +5,9 @@ \title{Reads mgfs in chunks.} \usage{ read_mgf_chunks( - filepath = "~/mzion/mgf/temp_1", + filepath, + temp_dir, + raw_id = 1L, topn_ms2ions = 100L, ms1_charge_range = c(2L, 6L), ms1_scan_range = c(1L, .Machine$integer.max), @@ -35,11 +37,9 @@ read_mgf_chunks( tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, exclude_reporter_region = FALSE, - index_mgf_ms2 = FALSE, deisotope_ms2 = TRUE, max_ms2_charge = 3L, maxn_dia_precurs = 300L, - is_mdda = FALSE, use_defpeaks = FALSE, maxn_mdda_precurs = 5L, n_mdda_flanks = 6L, @@ -52,6 +52,10 @@ read_mgf_chunks( \arguments{ \item{filepath}{The file path to a list of MGF or mzML files.} +\item{temp_dir}{A temporary path of MGFs.} + +\item{raw_id}{An ID to RAW file name.} + \item{topn_ms2ions}{A non-negative integer; the top-n species for uses in MS2 ion searches. The default is to use the top-100 ions in an MS2 event.} @@ -130,20 +134,6 @@ region of TMT reporter ions. The default is FALSE. 
The corresponding range of TMT reporter ions is informed by \code{tmt_reporter_lower} and \code{tmt_reporter_upper}. The argument affects only TMT data.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{deisotope_ms2}{Logical; if TRUE, de-isotope MS2 features.} \item{max_ms2_charge}{Maximum charge states for consideration with MS2 diff --git a/man/read_mzml.Rd b/man/read_mzml.Rd index 3a1f2b4..a2c6fe8 100644 --- a/man/read_mzml.Rd +++ b/man/read_mzml.Rd @@ -10,13 +10,11 @@ read_mzml( tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, exclude_reporter_region = FALSE, - index_mgf_ms2 = FALSE, ppm_ms1 = 10L, ppm_ms2 = 10L, min_ms2mass = 115L, max_ms2mass = 4500L, max_ms1_charge = 4L, - is_mdda = FALSE, deisotope_ms2 = TRUE, max_ms2_charge = 3L, use_defpeaks = FALSE, @@ -48,20 +46,6 @@ region of TMT reporter ions. The default is FALSE. The corresponding range of TMT reporter ions is informed by \code{tmt_reporter_lower} and \code{tmt_reporter_upper}. The argument affects only TMT data.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. 
Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{ppm_ms1}{A positive integer; the mass tolerance of MS1 species. The default is 20.} diff --git a/man/readmzML.Rd b/man/readmzML.Rd index b7b31d8..c49ffbf 100644 --- a/man/readmzML.Rd +++ b/man/readmzML.Rd @@ -21,10 +21,8 @@ readmzML( tmt_reporter_lower = 126.1, tmt_reporter_upper = 135.2, exclude_reporter_region = FALSE, - index_mgf_ms2 = FALSE, mgf_cutmzs = numeric(), mgf_cutpercs = numeric(), - is_mdda = FALSE, deisotope_ms2 = TRUE, max_ms2_charge = 3L, use_defpeaks = FALSE, @@ -82,20 +80,6 @@ region of TMT reporter ions. The default is FALSE. The corresponding range of TMT reporter ions is informed by \code{tmt_reporter_lower} and \code{tmt_reporter_upper}. The argument affects only TMT data.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. 
- - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{mgf_cutmzs}{Cut points of MS1 m-over-z values in peak picking.} \item{mgf_cutpercs}{The counts of MS2 features in each region of diff --git a/man/scalc_pepprobs.Rd b/man/scalc_pepprobs.Rd index bd97905..a76900a 100644 --- a/man/scalc_pepprobs.Rd +++ b/man/scalc_pepprobs.Rd @@ -12,7 +12,6 @@ scalc_pepprobs( soft_secions = FALSE, min_ms2mass = 115L, d2 = 1e-05, - index_mgf_ms2 = FALSE, tally_ms2ints = TRUE, digits = 4L ) @@ -42,20 +41,6 @@ interrogation. The default is 110.} \item{d2}{Bin width in ppm divided by 1E6.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. 
Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{tally_ms2ints}{Logical; tally MS2 intensities or not.} \item{digits}{A non-negative integer; the number of decimal places to be diff --git a/man/search_mgf.Rd b/man/search_mgf.Rd index 4ed388b..b8e9008 100644 --- a/man/search_mgf.Rd +++ b/man/search_mgf.Rd @@ -15,7 +15,6 @@ search_mgf( ppm_ms1 = 10L, ppm_ms2 = 10L, min_ms2mass = 115L, - index_mgf_ms2 = FALSE, by_modules = FALSE ) } @@ -49,20 +48,6 @@ default is 20.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation. The default is 110.} -\item{index_mgf_ms2}{Depreciated. A low-priority feature. Logical; if TRUE, - converts up-frontly MS2 m-over-z values from numeric to integers as opposed - to \emph{on-the-fly} conversion during ion matches. The default is FALSE. - The \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by - reducing RAM footprints. - - At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between - theoretical and experimental MS2 m-over-z values is limited by the - \code{bin_width}, which is the ceiling half of the \code{ppm_ms2}. For - instance, the \code{bin_width} is 10 ppm at the default \code{ppm_ms2 = - 20}. Due to the low resolution in mass deltas at \code{index_mgf_ms2 = TRUE}, - the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, - pep_ms2_deltas_sd} are nullified in the outputs.} - \item{by_modules}{Not used. Logical. At the TRUE default, searches MS data by individual modules of combinatorial fixed and variable modifications. If FALSE, search all modules together. 
The later would probably need more than diff --git a/vignettes/README.Rmd b/vignettes/README.Rmd index c2b4894..deca831 100644 --- a/vignettes/README.Rmd +++ b/vignettes/README.Rmd @@ -73,8 +73,6 @@ devtools::install_github("qzhang503/mzionShiny") (1) peakPicking: vendor msLevel = 1- (2) (optional) zeroSamples: removeExtra 1- - - - [x] `Proteome Discoverer mgf` - Bruker's MS - [x] `DataAnalysis mgf`