diff --git a/DESCRIPTION b/DESCRIPTION index 4935ee0..245b279 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: mzion Type: Package -Title: Database Searches of Proteomic Mass-spectrometrirc Data -Version: 1.2.8.1 +Title: Proteomics Database Searches of Mass-spectrometrirc Data +Version: 1.2.8.2 Authors@R: person(given = "Qiang", family = "Zhang", diff --git a/R/mgfs.R b/R/mgfs.R index dd65fec..730a6b5 100644 --- a/R/mgfs.R +++ b/R/mgfs.R @@ -1598,13 +1598,49 @@ prepBrukerMGF <- function (file = NULL, begin_offset = 5L, charge_offset = 5L) message("Processing: ", file) lines <- readLines(file) + hd <- lines[1:100] + ### Some MGFs may have additional "SCANS=" lines + pat_sc <- "SCANS" + sc <- .Internal(which(stringi::stri_startswith_fixed(hd, pat_sc))) + + if (length(sc)) { + lscs <- .Internal(which(stringi::stri_startswith_fixed(lines, pat_sc))) + lines <- lines[-lscs] + rm(list = "lscs") + } + rm(list = c("pat_sc", "sc")) + + ###PrecursorID: + pat_pr <- "###PrecursorID" + pr <- .Internal(which(stringi::stri_startswith_fixed(hd, pat_pr))) + + if (length(pr)) { + lprs <- .Internal(which(stringi::stri_startswith_fixed(lines, pat_pr))) + lines <- lines[-lprs] + rm(list = "lprs") + } + rm(list = c("pat_pr", "pr")) + + ###CCS: + pat_ccs <- "###CCS" + ccs <- .Internal(which(stringi::stri_startswith_fixed(hd, pat_ccs))) + + if (length(ccs)) { + ccs <- .Internal(which(stringi::stri_startswith_fixed(lines, pat_ccs))) + lines <- lines[-ccs] + rm(list = "ccs") + } + rm(list = c("pat_ccs", "ccs")) + + ## Processing begins <- .Internal(which(stringi::stri_startswith_fixed(lines, "BEGIN IONS"))) ends <- .Internal(which(stringi::stri_endswith_fixed(lines, "END IONS"))) hdrs <- 1:(begins[1]-begin_offset-1L) zls <- lines[begins+5L] - oks <- grepl("^CHARGE", zls) & (zls != "CHARGE=1+") + # oks <- grepl("^CHARGE", zls) & (zls != "CHARGE=1+") + oks <- grepl("^CHARGE", zls) & (zls != "CHARGE=1+") & grepl("^###Mobility", lines[begins-1L]) rm(list = "zls") b_oks <- begins[oks] - begin_offset @@ -1623,7 +1659,7 @@ prepBrukerMGF <- function (file = NULL, begin_offset = 5L, charge_offset = 5L) #' @param filepath A file path to MGF. #' @param n_cores The number of CPU cores. #' @export -mprepBrukerMGF <- function (filepath, n_cores = 48L) +mprepBrukerMGF <- function (filepath, n_cores = 32L) { message("Preparing Bruker's MGFs.") diff --git a/R/ms2_gen.R b/R/ms2_gen.R index 70aa9b1..899a2d4 100644 --- a/R/ms2_gen.R +++ b/R/ms2_gen.R @@ -100,8 +100,6 @@ gen_ms2ions_base <- function (aa_seq = NULL, ms1_mass = NULL, mod_indexes = NULL, type_ms2ions = "by", maxn_vmods_per_pep = 5L, maxn_sites_per_vmod = 3L, - - # dummy maxn_fnl_per_seq = 3L, maxn_vnl_per_seq = 3L, maxn_vmods_sitescombi_per_pep = 64L) { @@ -182,11 +180,13 @@ gen_ms2ions_a0_vnl0_fnl1 <- function (aa_seq, ms1_mass = NULL, maxn_vmods_per_pep = 5L, maxn_sites_per_vmod = 3L, maxn_fnl_per_seq = 3L, - - # dummy maxn_vnl_per_seq = 3L, maxn_vmods_sitescombi_per_pep = 64L) { + # cannot distinguish a neutral loss is in-source or in-MS2: + # "+ms1_offset and +ms2_nl" versus "-ms1_offset and -ms2_nl" + # if (ms1_offset != 0) maxn_vnl_per_seq = 1L + if (maxn_fnl_per_seq < 2L) return( gen_ms2ions_base(aa_seq = aa_seq, ms1_mass = ms1_mass, @@ -478,10 +478,8 @@ gen_ms2ions_a1_vnl0_fnl0 <- function (aa_seq, ms1_mass = NULL, aa_masses = NULL, mod_indexes = NULL, type_ms2ions = "by", maxn_vmods_per_pep = 5L, maxn_sites_per_vmod = 3L, - - # dummy - maxn_fnl_per_seq = 3L, maxn_vnl_per_seq = 3L, - + maxn_fnl_per_seq = 3L, + maxn_vnl_per_seq = 3L, maxn_vmods_sitescombi_per_pep = 64L) { aas <- .Internal(strsplit(aa_seq, "", fixed = TRUE, perl = FALSE, useBytes = FALSE)) @@ -770,8 +768,6 @@ gen_ms2ions_a1_vnl0_fnl1 <- function (aa_seq = NULL, ms1_mass = NULL, maxn_sites_per_vmod = 3L, maxn_vmods_sitescombi_per_pep = 64L, maxn_fnl_per_seq = 3L, - - # dummy maxn_vnl_per_seq = 3L) { if (maxn_fnl_per_seq < 2L) @@ -1146,8 +1142,6 @@ gen_ms2ions_a1_vnl1_fnl0 <- function (aa_seq = NULL, ms1_mass = NULL, maxn_vmods_per_pep = 5L, maxn_sites_per_vmod = 3L, maxn_vmods_sitescombi_per_pep = 64L, - - # dummy maxn_fnl_per_seq = 3L, maxn_vnl_per_seq = 3L) { diff --git a/R/ms2frames.R b/R/ms2frames.R index 4a44468..8bcb8d7 100644 --- a/R/ms2frames.R +++ b/R/ms2frames.R @@ -19,7 +19,7 @@ pair_mgftheos <- function (mgf_path, n_modules, ms1_offsets = 0, message("Pairing experimental and theoretical data.") tempfiles <- if (by_modules) - list.files(mgf_path, pattern = "^expttheo_", full.names = TRUE) + list.files(mgf_path, pattern = "^expt|^theo", full.names = TRUE) else list.files(mgf_path, pattern = "^mgftheo_", full.names = TRUE) @@ -33,6 +33,9 @@ pair_mgftheos <- function (mgf_path, n_modules, ms1_offsets = 0, # data thinning for MGF calibrations if (first_search) { mgfs <- lapply(mgfs, function (x) { + if ((fct <- ceiling(nrow(x)/50000L)) == 1L) + return(x) + min_mgfmass <- min(x$ms1_mass, na.rm = TRUE) max_mgfmass <- max(x$ms1_mass, na.rm = TRUE) oks_min <- with(x, ms1_mass <= min_mgfmass + 10L) @@ -41,20 +44,24 @@ pair_mgftheos <- function (mgf_path, n_modules, ms1_offsets = 0, mgfa <- x[oks_max, ] mgfb <- x[oks_min, ] mgfc <- x[!(oks_max | oks_min), ] - rows <- (1:nrow(mgfc)) %% 10L == 1L + rows <- (1:nrow(mgfc)) %% fct == 1L dplyr::bind_rows(mgfa, mgfc[rows, ], mgfb) }) } - ## output expttheo_1_1.rds: expttheo_module[_notch].rds mgfs <- dplyr::bind_rows(mgfs) - mapply(hpair_mgths, ms1_offsets, seq_along(ms1_offsets), + notches <- seq_along(ms1_offsets) + + mapply(hpair_mgths, ms1_offsets, notches, MoreArgs = list( mgfs = mgfs, n_modules = n_modules, by_modules = by_modules, mgf_path = mgf_path, min_mass = min_mass, max_mass = max_mass, ppm_ms1_bin = ppm_ms1_bin, .path_bin = .path_bin), SIMPLIFY = FALSE, USE.NAMES = FALSE) + + qs::qsave(data.frame(ms1_offset = ms1_offsets, notch = notches), + file.path(mgf_path, "notches.rds")) invisible(NULL) } @@ -125,7 +132,7 @@ hpair_mgths <- function (ms1_offset = 0, notch = NULL, mgfs, n_modules, } # rm(list = c("theos", "mfrs", "thfrs", "mins", "maxs")) - # (2) removes mgfs[[i]] not be found in anstheo[[i]] + # (2) removes mgfs[[i]] not found in any anstheo[[i]] for (i in seq_along(mgfs)) { mi <- mgfs[[i]] fi <- names(mi) @@ -138,10 +145,9 @@ hpair_mgths <- function (ms1_offset = 0, notch = NULL, mgfs, n_modules, } # rm(list = c("mi", "ti", "fi", "oks")) - if (length(mgfs) == 1L && !length(mgfs[[1]])) { + if (length(mgfs) == 1L && !length(mgfs[[1]])) mgfs[1] <- list(NULL) - } - + # (3) removes unused frames of `anstheo` # (mgfs determines the length of each anstheo[[i]]; # more effective to first generate all bracketed mgfs indexes and apply @@ -158,9 +164,13 @@ hpair_mgths <- function (ms1_offset = 0, notch = NULL, mgfs, n_modules, # (5) outputs (in chunks) if (by_modules) { + qs::qsave(mgfs, + file.path(mgf_path, paste0("expt_", 0, "_", notch, ".rds")), + preset = "fast") + for (i in seq_len(n_modules)) - qs::qsave(list(mgf_frames = mgfs, theopeps = anstheo[[i]]), - file.path(mgf_path, paste0("expttheo_", i, "_", notch, ".rds")), + qs::qsave(anstheo[[i]], + file.path(mgf_path, paste0("theo_", i, "_", notch, ".rds")), preset = "fast") } else { @@ -173,24 +183,7 @@ hpair_mgths <- function (ms1_offset = 0, notch = NULL, mgfs, n_modules, # next version: note that mgfs are the same for different modules # also save by fractions -> avoid sendData in parallel by read from disk - if (FALSE) { - if (by_modules) ( - # expt_notch_fraction.rds - # theo_module_notch_fraction.rds - mapply(function (data, frc) { - fi <- file.path(mgf_path, paste0("expt_", 0, "_", notch, "_", frc, ".rds")) - qs::qsave(data, fi, preset = "fast") - }, mgfs, seq_along(mgfs)) - ) - else { - for (i in seq_along(mgfs)) - qs::qsave(list(mgf_frames = mgfs[[i]], - theopeps = lapply(anstheo, `[[`, i)), - file.path(mgf_path, paste0("mgftheo_", i, ".rds")), - preset = "fast") - } - } - + invisible(NULL) } @@ -212,6 +205,8 @@ hpair_mgths <- function (ms1_offset = 0, notch = NULL, mgfs, n_modules, #' @inheritParams matchMS #' @inheritParams pair_mgftheos hms2match <- function (aa_masses_all, funs_ms2, ms1vmods_all, ms2vmods_all, + ms1_neulosses = NULL, maxn_neulosses_fnl = 1L, + maxn_neulosses_vnl = 1L, mod_indexes, mgf_path, out_path, type_ms2ions = "by", maxn_vmods_per_pep = 5L, maxn_sites_per_vmod = 3L, maxn_fnl_per_seq = 3L, @@ -221,8 +216,10 @@ hms2match <- function (aa_masses_all, funs_ms2, ms1vmods_all, ms2vmods_all, min_ms2mass = 115L, index_mgf_ms2 = FALSE, by_modules = FALSE, df0 = NULL) { - pat <- if (by_modules) "^expttheo" else "^mgftheo" - mgths <- order_fracs(type = pat, tempdir = mgf_path, by_modules = by_modules) + pth <- if (by_modules) "theo" else "mgftheo" + pex <- if (by_modules) "expt" else "mgftheo" + theos <- order_fracs(type = pth, tempdir = mgf_path, by_modules = by_modules) + expts <- order_fracs(type = pex, tempdir = mgf_path, by_modules = by_modules) message("\n=== MS2 ion searches started at ", Sys.time(), ". ===\n") @@ -248,14 +245,18 @@ hms2match <- function (aa_masses_all, funs_ms2, ms1vmods_all, ms2vmods_all, if (by_modules) { parallel::clusterExport(cl, c("frames_adv"), envir = environment(mzion::matchMS)) - for (i in seq_along(aa_masses_all)) + for (i in seq_along(aa_masses_all)) { hms2match_one( pep_mod_group = i, - mgths = mgths[[i]], + nms_theo = theos[[i]], + nms_expt = expts[[1]], aa_masses = aa_masses_all[[i]], FUN = funs_ms2[[i]], ms1vmods = ms1vmods_all[[i]], ms2vmods = ms2vmods_all[[i]], + ms1_neulosses = ms1_neulosses, + maxn_neulosses_fnl = maxn_neulosses_fnl, + maxn_neulosses_vnl = maxn_neulosses_vnl, cl = cl, mod_indexes = mod_indexes, mgf_path = mgf_path, @@ -269,6 +270,7 @@ hms2match <- function (aa_masses_all, funs_ms2, ms1vmods_all, ms2vmods_all, minn_ms2 = minn_ms2, ppm_ms1 = ppm_ms1, ppm_ms2 = ppm_ms2, min_ms2mass = min_ms2mass, index_mgf_ms2 = index_mgf_ms2, df0 = df0) + } } else { message("Check search progress at: ", logs) @@ -334,7 +336,7 @@ ms2match_all <- function (mgth, aa_masses_all, funs_ms2, ms1vmods_all, if (!length(mgf_frames)) { qs::qsave(df0, file.path(out_path, "temp", out_name)) - return(df0) + return(NULL) } df <- mframes_adv( @@ -1275,7 +1277,8 @@ search_mgf <- function (expt_mass_ms1 = NULL, expt_moverz_ms2 = NULL, #' For a single module #' #' @param pep_mod_group The index of peptide modification groups. -#' @param mgths Pairs of experimental and theoretical data. +#' @param nms_expt Names of experimental files. +#' @param nms_theo Names of theoretical files. #' @param aa_masses An amino-acid look-up. #' @param FUN A function, e.g., \link{gen_ms2ions_base}, with an i-th module of #' \code{aa_masses}. @@ -1286,8 +1289,9 @@ search_mgf <- function (expt_mass_ms1 = NULL, expt_moverz_ms2 = NULL, #' @param cl The value of clusters for parallel processes. #' @param df0 An output template. #' @inheritParams hms2match -hms2match_one <- function (pep_mod_group, mgths, aa_masses, FUN, - ms1vmods, ms2vmods, cl, +hms2match_one <- function (pep_mod_group, nms_theo, nms_expt, aa_masses, FUN, + ms1vmods, ms2vmods, cl, ms1_neulosses = NULL, + maxn_neulosses_fnl = 1L, maxn_neulosses_vnl = 1L, mod_indexes, mgf_path, out_path, type_ms2ions = "by", maxn_vmods_per_pep = 5L, maxn_sites_per_vmod = 3L, maxn_fnl_per_seq = 3L, maxn_vnl_per_seq = 3L, @@ -1302,11 +1306,43 @@ hms2match_one <- function (pep_mod_group, mgths, aa_masses, FUN, message("Matching against: ", if (nchar(nm_vmods) == 0L) nm_fmods else paste0(nm_fmods, " | ", nm_vmods)) - df <- vector("list", length(mgths)) + # subset notches: c("Phospho (S)", "Phospho (T)", "Phospho (Y)") + if (length(ms1_neulosses)) { + if (length(vmods_nl <- attr(aa_masses, "vmods_nl", exact = TRUE))) { + nls <- extract_umods(ms1_neulosses) + oksnl <- lapply(nls, function (x) length(x[["nl"]]) > 1L) + nls <- nls[unlist(oksnl)] + + if (length(nls) && any(oks <- names(nls) %in% names(vmods_nl))) { + notches <- unique(unlist(lapply(nls[oks], `[[`, "nl"))) + notches <- -round(notches, digits = 4) + + df <- qs::qread(file.path(mgf_path, "notches.rds")) + allowed <- df[with(df, ms1_offset %in% notches), ]$notch # at least one + + univ <- as.integer(gsub("^theo_\\d+_(\\d+)\\.rds", "\\1", nms_theo)) + idxes <- univ %in% allowed + + nms_theo <- nms_theo[idxes] # at least length-1 of offset-0 + nms_expt <- nms_expt[idxes] + } + else { + # Oxidation (M) + nms_theo <- nms_theo[1] + nms_expt <- nms_expt[1] + } + } + else { + # Deamidated (N) + nms_theo <- nms_theo[1] + nms_expt <- nms_expt[1] + } + } - for (i in seq_along(mgths)) { - df[[i]] <- ms2match_one( - mgths[[i]], + if ((len <- length(nms_theo)) == 1L) { + df <- ms2match_one( + nms_theo = nms_theo[[1]], + nms_expt = nms_expt[[1]], pep_mod_group = pep_mod_group, aa_masses = aa_masses, FUN = FUN, ms1vmods = ms1vmods, ms2vmods = ms2vmods, cl = cl, mod_indexes = mod_indexes, mgf_path = mgf_path, out_path = out_path, @@ -1320,9 +1356,49 @@ hms2match_one <- function (pep_mod_group, mgths, aa_masses, FUN, min_ms2mass = min_ms2mass, index_mgf_ms2 = index_mgf_ms2, df0 = df0) } - - df <- dplyr::bind_rows(df) - + else { + df <- vector("list", len) + + df[[1]] <- ms2match_one( + nms_theo = nms_theo[[1]], + nms_expt = nms_expt[[1]], + pep_mod_group = pep_mod_group, aa_masses = aa_masses, FUN = FUN, + ms1vmods = ms1vmods, ms2vmods = ms2vmods, cl = cl, + mod_indexes = mod_indexes, mgf_path = mgf_path, out_path = out_path, + type_ms2ions = type_ms2ions, + maxn_vmods_per_pep = maxn_vmods_per_pep, + maxn_sites_per_vmod = maxn_sites_per_vmod, + maxn_fnl_per_seq = maxn_fnl_per_seq, + maxn_vnl_per_seq = maxn_vnl_per_seq, + maxn_vmods_sitescombi_per_pep = maxn_vmods_sitescombi_per_pep, + minn_ms2 = minn_ms2, ppm_ms1 = ppm_ms1, ppm_ms2 = ppm_ms2, + min_ms2mass = min_ms2mass, index_mgf_ms2 = index_mgf_ms2, + df0 = df0) + + # neutral losses: maxn_neulosses_vnl and maxn_neulosses_fnl + for (i in 2:len) { + df[[i]] <- ms2match_one( + nms_theo = nms_theo[[i]], + nms_expt = nms_expt[[i]], + pep_mod_group = pep_mod_group, aa_masses = aa_masses, FUN = FUN, + ms1vmods = ms1vmods, ms2vmods = ms2vmods, cl = cl, + mod_indexes = mod_indexes, mgf_path = mgf_path, out_path = out_path, + type_ms2ions = type_ms2ions, + maxn_vmods_per_pep = maxn_vmods_per_pep, + maxn_sites_per_vmod = maxn_sites_per_vmod, + maxn_fnl_per_seq = maxn_neulosses_fnl, + maxn_vnl_per_seq = maxn_neulosses_vnl, + maxn_vmods_sitescombi_per_pep = maxn_vmods_sitescombi_per_pep, + minn_ms2 = minn_ms2, ppm_ms1 = ppm_ms1, ppm_ms2 = ppm_ms2, + min_ms2mass = min_ms2mass, index_mgf_ms2 = index_mgf_ms2, + df0 = df0) + } + + empties <- unlist(lapply(df, is.null)) + df <- df[!empties] + df <- dplyr::bind_rows(df) + } + out_nm <- file.path(out_path, "temp", paste0("ion_matches_", pep_mod_group, ".rds")) if (is.null(df)) { @@ -1345,9 +1421,7 @@ hms2match_one <- function (pep_mod_group, mgths, aa_masses, FUN, pep_scan_num = scan_num, pep_exp_z = ms1_charge, pep_ms2_moverzs = ms2_moverz, - pep_ms2_ints = ms2_int, - # pep_frame = frame, - ) + pep_ms2_ints = ms2_int, ) # df[["pep_scan_num"]] <- as.character(df[["pep_scan_num"]]) df <- reloc_col_after(df, "raw_file", "scan_num") df <- reloc_col_after(df, "pep_mod_group", "raw_file") @@ -1355,11 +1429,12 @@ hms2match_one <- function (pep_mod_group, mgths, aa_masses, FUN, } -#' Matches experimentals and theoreticals +#' Matches experimentals and theoreticals. #' -#' For a single module +#' For a single module and single notch. #' -#' @param mgth MGF and theoretical pairs +#' @param nms_expt Names of experimental files. +#' @param nms_theo Names of theoretical files. #' @param pep_mod_group The index of peptide modification groups #' @param aa_masses An amino-acid look-up #' @param FUN A function, e.g., \link{gen_ms2ions_base}, with an i-th module of @@ -1371,7 +1446,7 @@ hms2match_one <- function (pep_mod_group, mgths, aa_masses, FUN, #' @param cl The value of clusters for parallel processes #' @param df0 An output template #' @inheritParams hms2match -ms2match_one <- function (mgth, pep_mod_group, aa_masses, FUN, +ms2match_one <- function (nms_theo, nms_expt, pep_mod_group, aa_masses, FUN, ms1vmods, ms2vmods, cl, mod_indexes, mgf_path, out_path, type_ms2ions = "by", maxn_vmods_per_pep = 5L, maxn_sites_per_vmod = 3L, @@ -1381,11 +1456,9 @@ ms2match_one <- function (mgth, pep_mod_group, aa_masses, FUN, min_ms2mass = 115L, index_mgf_ms2 = FALSE, df0 = NULL) { - out_name <- gsub("^expttheo", "ion_matches", mgth) - mgftheo <- qs::qread(file.path(mgf_path, mgth)) - mgf_frames <- mgftheo[["mgf_frames"]] - theopeps <- mgftheo[["theopeps"]] - rm(list = "mgftheo") + out_name <- gsub("^theo", "ion_matches", nms_theo) + mgf_frames <- qs::qread(file.path(mgf_path, nms_expt)) + theopeps <- qs::qread(file.path(mgf_path, nms_theo)) len <- length(mgf_frames) @@ -1438,6 +1511,8 @@ ms2match_one <- function (mgth, pep_mod_group, aa_masses, FUN, FUN = FUN), .scheduling = "dynamic") + empties <- unlist(lapply(df, is.null)) + df <- df[!empties] df <- dplyr::bind_rows(df) } diff --git a/R/msmsmatches.R b/R/msmsmatches.R index 113d4ef..36c26bc 100644 --- a/R/msmsmatches.R +++ b/R/msmsmatches.R @@ -171,8 +171,25 @@ #' @param n_13c Number(s) of 13C off-sets in precursor masses, for example, over #' the range of \code{-1:2}. The default is 0. #' @param ms1_notches A numeric vector; notches (off-sets) in precursor masses, -#' e.g., \code{c(-79.966331, -97.976896)} to account fo the loss of a phospho -#' group and phosphoric acid in precursor masses. +#' e.g., \code{c(-97.976896)} to account fo the loss of a phosphoric acid in +#' precursor masses. +#' @param ms1_neulosses Character string(s) specifying the neutral losses of +#' precursors. The nomenclature is the same as those in argument +#' \code{varmods}, e.g., \code{c("Phospho (S)", "Phospho (T)", "Phospho +#' (Y)")}. +#' +#' The argument is a simplified (narrower) usage of argument +#' \code{ms1_notches} with additional specificity in modifications and sites. +#' @param maxn_neulosses_fnl A positive integer used in conjunction with +#' arguments \code{ms1_notches} and \code{ms1_neulosses}. The maximum number +#' of fixed MS2 neutral losses to be considered when searching against peak +#' lists with MS1 mass off-sets. The default is two (one MS2 neutral loss in +#' addition to 0). +#' @param maxn_neulosses_vnl A positive integer used in conjunction with +#' arguments \code{ms1_notches} and \code{ms1_neulosses}. The maximum number +#' of variable MS2 neutral losses to be considered when searching against peak +#' lists with MS1 mass off-sets. The default is two (one MS2 neutral loss in +#' addition to 0). #' @param par_groups A low-priority feature. Parameter(s) of \code{matchMS} #' multiplied by sets of values in groups. Multiple searches will be performed #' separately against the parameter groups. For instance with one set of @@ -478,9 +495,9 @@ #' matchMS( #' fixedmods = c("TMTpro (N-term)", "TMTpro (K)", "Carbamidomethyl (C)"), #' varmods = c("Acetyl (Protein N-term)", "Oxidation (M)", -#' "Deamidated (N)", "Gln->pyro-Glu (N-term = Q)"), +#' "Deamidated (N)", "Deamidated (Q)", +#' "Gln->pyro-Glu (N-term = Q)"), #' quant = "tmt18", -#' fdr_type = "psm", #' out_path = "~/mzion/examples", #' ) #' @@ -659,6 +676,11 @@ matchMS <- function (out_path = "~/mzion/outs", "Oxidation (M)", "Deamidated (N)", "Gln->pyro-Glu (N-term = Q)"), rm_dup_term_anywhere = TRUE, + + ms1_neulosses = NULL, + maxn_neulosses_fnl = 2L, + maxn_neulosses_vnl = 2L, + fixedlabs = NULL, varlabs = NULL, locmods = c("Phospho (S)", "Phospho (T)", "Phospho (Y)"), @@ -732,7 +754,7 @@ matchMS <- function (out_path = "~/mzion/outs", min_ret_time = 0, max_ret_time = Inf, calib_ms1mass = FALSE, ppm_ms1calib = 20L, - + add_ms2theos = FALSE, add_ms2theos2 = FALSE, add_ms2moverzs = FALSE, add_ms2ints = FALSE, @@ -798,7 +820,7 @@ matchMS <- function (out_path = "~/mzion/outs", # modifications fixedmods <- sort(fixedmods) varmods <- sort(varmods) - locmods <- check_locmods(fixedmods, varmods, locmods) + locmods <- check_locmods(locmods, fixedmods, varmods, ms1_neulosses) db_ord <- order(fasta) fasta <- fasta[db_ord] @@ -820,6 +842,7 @@ matchMS <- function (out_path = "~/mzion/outs", # numeric types stopifnot(vapply(c(maxn_fasta_seqs, maxn_vmods_setscombi, maxn_vmods_per_pep, maxn_sites_per_vmod, maxn_fnl_per_seq, maxn_vnl_per_seq, + ms1_notches, maxn_neulosses_fnl, maxn_neulosses_vnl, maxn_vmods_sitescombi_per_pep, min_len, max_len, max_miss, topn_ms2ions, minn_ms2, min_mass, max_mass, min_ms2mass, max_ms2mass, n_13c, @@ -854,6 +877,8 @@ matchMS <- function (out_path = "~/mzion/outs", maxn_vmods_sitescombi_per_pep <- as.integer(maxn_vmods_sitescombi_per_pep) maxn_fnl_per_seq <- as.integer(maxn_fnl_per_seq) maxn_vnl_per_seq <- as.integer(maxn_vnl_per_seq) + maxn_neulosses_fnl <- as.integer(maxn_neulosses_fnl) + maxn_neulosses_vnl <- as.integer(maxn_neulosses_vnl) min_len <- as.integer(min_len) max_len <- as.integer(max_len) max_miss <- as.integer(max_miss) @@ -880,8 +905,9 @@ matchMS <- function (out_path = "~/mzion/outs", stopifnot(min_len >= 1L, max_len >= min_len, max_miss <= 10L, minn_ms2 >= 2L, min_mass >= 1L, max_mass >= min_mass, min_ms2mass >= 1L, max_ms2mass > min_ms2mass, - maxn_vmods_sitescombi_per_pep >= 2L, - noenzyme_maxn >= 0L, + maxn_vmods_sitescombi_per_pep >= 2L, noenzyme_maxn >= 0L, + maxn_fnl_per_seq >= 0L, maxn_vnl_per_seq >= 0L, + maxn_neulosses_fnl >= 0L, maxn_neulosses_vnl >= 0L, maxn_vmods_per_pep >= maxn_sites_per_vmod, max_n_prots > 1000L, min_ms1_charge >= 1L, max_ms1_charge >= min_ms1_charge, min_scan_num >= 1L, max_scan_num >= min_scan_num, @@ -1034,9 +1060,19 @@ matchMS <- function (out_path = "~/mzion/outs", rm(list = c("oks")) # TMT - check_tmt_pars(fixedmods, varmods, quant) + check_tmt_pars(fixedmods, varmods, quant) + + # MS1 off-sets + if (!all(ms1_neulosses %in% varmods)) + stop("Not all `ms1_neulosses` found in `varmods`.") + + check_notches(ms1_notches = ms1_notches, ms1_neulosses = ms1_neulosses) + ms1_offsets <- find_ms1_offsets(n_13c = n_13c, ms1_notches = ms1_notches) + is_notched <- length(unique(c(ms1_offsets, ms1_neulosses))) > 1L # system paths + homedir <- find_dir("~") + if (is.null(.path_cache)) { .path_cache <- "~/mzion/.MSearches/Cache/Calls/" } @@ -1049,6 +1085,9 @@ matchMS <- function (out_path = "~/mzion/outs", .path_fasta <- create_dir(.path_fasta) .path_ms1masses <- create_dir(file.path(.path_fasta, "ms1masses")) + + fasta <- lapply(fasta, function (x) if (grepl("~", x)) gsub("~", homedir, x) else x) + fasta <- unlist(fasta, recursive = FALSE, use.names = FALSE) # Output path out_path <- create_dir(out_path) @@ -1218,19 +1257,20 @@ matchMS <- function (out_path = "~/mzion/outs", enzyme = enzyme, out_path = out_path) - if (reframe_mgfs) - .path_bin_calib <- - bin_ms1masses(res = res, - min_mass = min_mass, - max_mass = max_mass, - min_len = min_len, - max_len = max_len, - ppm_ms1 = ppm_ms1calib, - use_ms1_cache = use_ms1_cache, - .path_cache = .path_cache, - .path_ms1masses = .path_ms1masses, - enzyme = enzyme, - out_path = out_path) + .path_bin_calib <- if (reframe_mgfs) + bin_ms1masses(res = res, + min_mass = min_mass, + max_mass = max_mass, + min_len = min_len, + max_len = max_len, + ppm_ms1 = ppm_ms1calib, + use_ms1_cache = use_ms1_cache, + .path_cache = .path_cache, + .path_ms1masses = .path_ms1masses, + enzyme = enzyme, + out_path = out_path) + else + .path_bin if (exists("res")) rm(list = "res") @@ -1269,10 +1309,8 @@ matchMS <- function (out_path = "~/mzion/outs", ## MSMS matches if (is.null(bypass_ms2match <- dots$bypass_ms2match)) bypass_ms2match <- FALSE - - .time_stamp <- find_ms1_times(out_path) - - if (length(.time_stamp) == 1L) { + + if (length(.time_stamp <- find_ms1_times(out_path)) == 1L) { path_time <- file.path(.path_ms1masses, .time_stamp) file_aams <- file.path(path_time, "aa_masses_all.rds") file_mods <- file.path(path_time, "mod_indexes.txt") @@ -1311,10 +1349,8 @@ matchMS <- function (out_path = "~/mzion/outs", enzyme = enzyme, maxn_fasta_seqs = maxn_fasta_seqs, maxn_vmods_setscombi = maxn_vmods_setscombi, min_len = min_len, max_len = max_len, max_miss = max_miss, - knots = 50L) + knots = 5L) - ms1_offsets <- find_ms1_offsets(n_13c = n_13c, ms1_notches = ms1_notches) - if (!bypass_ms2match) { if (min_ms2mass < 5L) warning("Maybe out of RAM at \"min_ms2mass < 5L\".") @@ -1342,6 +1378,9 @@ matchMS <- function (out_path = "~/mzion/outs", index_mgf_ms2 = index_mgf_ms2, by_modules = by_modules, ms1_offsets = ms1_offsets, + ms1_neulosses = ms1_neulosses, + maxn_neulosses_fnl = maxn_neulosses_fnl, + maxn_neulosses_vnl = maxn_neulosses_vnl, # dummy for argument matching fasta = fasta, @@ -1383,8 +1422,7 @@ matchMS <- function (out_path = "~/mzion/outs", min_ms2mass = min_ms2mass, index_mgf_ms2 = index_mgf_ms2, tally_ms2ints = tally_ms2ints, - ms1_offsets = ms1_offsets, - + # dummies mgf_path = mgf_path, maxn_vmods_per_pep = maxn_vmods_per_pep, @@ -1416,7 +1454,7 @@ matchMS <- function (out_path = "~/mzion/outs", if (!bypass_primatches) hadd_primatches(out_path = out_path, - ms1_offsets = ms1_offsets, + is_notched = is_notched, add_ms2theos = add_ms2theos, add_ms2theos2 = add_ms2theos2, add_ms2moverzs = add_ms2moverzs, @@ -1433,7 +1471,7 @@ matchMS <- function (out_path = "~/mzion/outs", fdr_type = fdr_type, min_len = min_len, max_len = max_len, - ms1_offsets = ms1_offsets, + is_notched = is_notched, max_pepscores_co = max_pepscores_co, min_pepscores_co = min_pepscores_co, enzyme = enzyme, @@ -1473,7 +1511,7 @@ matchMS <- function (out_path = "~/mzion/outs", calc_peploc(out_path = out_path, mod_indexes = mod_indexes, locmods = locmods, - ms1_offsets = ms1_offsets, + is_notched = is_notched, topn_mods_per_seq = topn_mods_per_seq, topn_seqs_per_query = topn_seqs_per_query) } @@ -2057,26 +2095,42 @@ checkMGF <- function (mgf_path = NULL, grp_args = NULL, error = c("stop", "warn" #' Coerced \code{fixedmods} not considered. #' #' @inheritParams matchMS -check_locmods <- function (fixedmods, varmods, locmods) +check_locmods <- function (locmods, fixedmods, varmods, ms1_neulosses = NULL) { if (!length(locmods)) return(NULL) - oks <- locmods %in% c(fixedmods, varmods) - - if (!all(oks)) { - warning("Some \"locmods\" not in \"varmods\" or \"fixedmods\": ", + if (!is.null(ms1_neulosses)) { + if (sum(bads <- !ms1_neulosses %in% locmods)) { + warning("\nPLEASE READ: \n\n", + "\"ms1_neulosses\": ", paste(ms1_neulosses[bads], collapse = ", "), + " not found in \"locmods\": ", paste(locmods, collapse =, ""), + "\n!!! Consider matching some of the \"locmods\" setting to", + " \"ms1_neulosses\". !!!\n") + + if (FALSE) { + nls <- lapply(ms1_neulosses, find_unimod) + nlresids <- lapply(nls, `[[`, "position_site") + nlresids <- unlist(nlresids, use.names = FALSE, recursive = FALSE) + + vs <- lapply(varmods, find_unimod) + vresids <- lapply(vs, `[[`, "position_site") + vresids <- unlist(vresids, use.names = FALSE, recursive = FALSE) + } + } + } + + if (!all(oks <- locmods %in% c(fixedmods, varmods))) { + warning("Ignore \"locmods\" not in \"varmods\" or \"fixedmods\": ", paste(locmods[!oks], collapse = ", ")) locmods <- locmods[oks] } - + if (!length(locmods)) return(NULL) - vids <- which(varmods %in% locmods) - # locmods are only among fixedmods - if (!length(vids)) + if (!length(vids <- which(varmods %in% locmods))) stop("No \"varmods\" matched to \"locmods\": ", paste(locmods, collapse = ", ")) vmods <- find_modps(varmods) @@ -2149,3 +2203,21 @@ check_fdr_group <- function (fdr_group = c("base", "all", "top3"), } +#' Checks the compatibility between ms1_notches and ms1_neulosses. +#' +#' @inheritParams matchMS +check_notches <- function (ms1_notches, ms1_neulosses) +{ + n_notches <- length(ms1_notches) + n_neulosses <- length(ms1_neulosses) + + if (n_neulosses) { + cdn_1 <- n_notches == 1L && ms1_notches != 0 + cdn_2 <- n_notches > 1L + + if (cdn_1 || cdn_2) + stop("Not support simultaneous non-trival `ms1_notches` and `ms1_neulosses`.") + } +} + + diff --git a/R/msmsmatches2.R b/R/msmsmatches2.R index 084590e..759180e 100644 --- a/R/msmsmatches2.R +++ b/R/msmsmatches2.R @@ -20,12 +20,14 @@ #' @import parallel ms2match <- function (mgf_path, aa_masses_all, out_path, .path_bin, mod_indexes, type_ms2ions = "by", maxn_vmods_per_pep = 5L, - maxn_sites_per_vmod = 3L, maxn_fnl_per_seq = 64L, - maxn_vnl_per_seq = 64L, maxn_vmods_sitescombi_per_pep = 64L, + maxn_sites_per_vmod = 3L, maxn_fnl_per_seq = 3L, + maxn_vnl_per_seq = 3L, maxn_vmods_sitescombi_per_pep = 64L, minn_ms2 = 6L, ppm_ms1 = 20L, ppm_ms2 = 20L, min_mass = 200L, max_mass = 4500L, min_ms2mass = 115L, quant = "none", ppm_reporters = 10L, by_modules = TRUE, reframe_mgfs = FALSE, ms1_offsets = 0, + ms1_neulosses = NULL, maxn_neulosses_fnl = 1L, + maxn_neulosses_vnl = 1L, # dummies fasta, acc_type, acc_pattern, @@ -109,7 +111,9 @@ ms2match <- function (mgf_path, aa_masses_all, out_path, .path_bin, ppm_ms2_bin <- calc_threeframe_ppm(ppm_ms2) pair_mgftheos(mgf_path = mgf_path, n_modules = length(aa_masses_all), - ms1_offsets = ms1_offsets, by_modules = by_modules, + ms1_offsets = comb_ms1_offsets(ms1_offsets = ms1_offsets, + ms1_neulosses = ms1_neulosses), + by_modules = by_modules, min_mass = min_mass, max_mass = max_mass, ppm_ms1_bin = ppm_ms1_bin, .path_bin = .path_bin, reframe_mgfs = reframe_mgfs, first_search = first_search) @@ -140,21 +144,28 @@ ms2match <- function (mgf_path, aa_masses_all, out_path, .path_bin, maxn_sites_per_vmod = maxn_sites_per_vmod) ms2vmods_all <- lapply(ms1vmods_all, lapply, make_ms2vmods) - # Searches - df0 <- tibble::tibble(scan_title = integer(), ms1_moverz = numeric(), - ms1_mass = numeric(), ms1_int = numeric(), - ms1_charge = character(), ret_time = numeric(), - scan_num = character(), raw_file = integer(), - ms2_moverz = list(list()), - ms2_int = list(list()), - ms2_n = integer(), frame = numeric(), + df0 <- tibble::tibble(scan_title = integer(), raw_file = integer(), + pep_mod_group = integer(), pep_exp_mz = numeric(), + pep_exp_mr = numeric(), pep_tot_int = numeric(), + pep_exp_z = numeric(), pep_ret_range = numeric(), + pep_scan_num = character(), + pep_ms2_moverzs = list(list()), + pep_ms2_ints = list(list()), + pep_n_ms2 = integer(), + rptr_moverz = list(list()), + rptr_int = list(list()), + pep_ms1_offset = numeric(), matches = list(list()), - pep_isdecoy = logical()) + pep_fmod = character(), + pep_vmod = character()) hms2match(aa_masses_all = aa_masses_all, funs_ms2 = funs_ms2, ms1vmods_all = ms1vmods_all, ms2vmods_all = ms2vmods_all, + ms1_neulosses = ms1_neulosses, + maxn_neulosses_fnl = maxn_neulosses_fnl, + maxn_neulosses_vnl = maxn_neulosses_vnl, mod_indexes = mod_indexes, mgf_path = mgf_path, out_path = out_path, @@ -235,7 +246,7 @@ reverse_seqs <- function (seqs) #' @param reframe_mgfs Logical; if TRUE, recalculates the frame indexes of MGFs #' @param knots The number of knots for spline fits. #' @inheritParams matchMS -calib_mgf <- function (mgf_path = NULL, aa_masses_all = NULL,out_path = NULL, +calib_mgf <- function (mgf_path = NULL, aa_masses_all = NULL, out_path = NULL, .path_bin, mod_indexes = NULL, type_ms2ions = "by", maxn_vmods_per_pep = 5L,maxn_sites_per_vmod = 3L, maxn_fnl_per_seq = 3L, maxn_vnl_per_seq = 3L, @@ -248,7 +259,7 @@ calib_mgf <- function (mgf_path = NULL, aa_masses_all = NULL,out_path = NULL, acc_pattern = NULL, topn_ms2ions = 100L, fixedmods = NULL, varmods = NULL, enzyme = "trypsin_p", maxn_fasta_seqs = 200000L, maxn_vmods_setscombi = 512L, - min_len = 7L, max_len = 40L, max_miss = 2L, knots = 50L) + min_len = 7L, max_len = 40L, max_miss = 2L, knots = 5L) { on.exit( if (exists(".savecall", envir = fun_env)) { @@ -280,12 +291,9 @@ calib_mgf <- function (mgf_path = NULL, aa_masses_all = NULL,out_path = NULL, return(NULL) } - # may need to delete mgf_queries_[...].rds when changing, e.g., from - # ppm_ms1 = 20 to 10; or save a copy of the original mgf_queries - ## the first search tempdir <- file.path(out_path, "temp") - pat_th <- if (by_modules) "^expttheo_\\d+.*\\.rds$" else "^mgftheo_\\d+.*\\.rds$" + pat_th <- if (by_modules) "^(theo|expt)_\\d+.*\\.rds$" else "^mgftheo_\\d+.*\\.rds$" pat_im <- "^ion_matches_\\d+.*\\.rds$" fs_th <- list.files(mgf_path, pattern = pat_th, full.names = TRUE) fs_im <- list.files(tempdir, pattern = pat_im, full.names = TRUE) @@ -307,25 +315,29 @@ calib_mgf <- function (mgf_path = NULL, aa_masses_all = NULL,out_path = NULL, fi_mi2 <- file.path(out_path, "Calls", "mod_indexes.txt") file.rename(fi_aa, fi_aa2) file.rename(fi_mi, fi_mi2) - + ms2match(mgf_path = mgf_path, aa_masses_all = aa_masses_all, out_path = out_path, .path_bin = .path_bin, mod_indexes = mod_indexes, type_ms2ions = type_ms2ions, - maxn_vmods_per_pep = maxn_vmods_per_pep, - maxn_sites_per_vmod = maxn_sites_per_vmod, - maxn_fnl_per_seq = maxn_fnl_per_seq, - maxn_vnl_per_seq = maxn_vnl_per_seq, - maxn_vmods_sitescombi_per_pep = maxn_vmods_sitescombi_per_pep, + maxn_vmods_per_pep = 1L, + maxn_sites_per_vmod = 1L, + maxn_fnl_per_seq = 1L, + maxn_vnl_per_seq = 1L, + ms1_offsets = 0, + ms1_neulosses = NULL, + maxn_neulosses_fnl = 1L, + maxn_neulosses_vnl = 1L, + maxn_vmods_sitescombi_per_pep = 1L, minn_ms2 = minn_ms2, ppm_ms1 = ppm_ms1, ppm_ms2 = ppm_ms2, min_mass = min_mass, max_mass = max_mass, min_ms2mass = min_ms2mass, - quant = quant, + quant = "none", ppm_reporters = ppm_reporters, reframe_mgfs = reframe_mgfs, index_mgf_ms2 = index_mgf_ms2, @@ -345,19 +357,56 @@ calib_mgf <- function (mgf_path = NULL, aa_masses_all = NULL,out_path = NULL, first_search = TRUE, .savecall = FALSE) + calc_pepscores(topn_ms2ions = topn_ms2ions, + type_ms2ions = type_ms2ions, + target_fdr = .01, + min_len = min_len, + max_len = max_len, + ppm_ms2 = ppm_ms2, + soft_secions = FALSE, + out_path = out_path, + min_ms2mass = min_ms2mass, + index_mgf_ms2 = index_mgf_ms2, + tally_ms2ints = TRUE, + + # dummies + mgf_path = mgf_path, + maxn_vmods_per_pep = 1L, + maxn_sites_per_vmod = 1L, + maxn_vmods_sitescombi_per_pep = 1L, + minn_ms2 = minn_ms2, + ppm_ms1 = ppm_ms1, + quant = quant, + ppm_reporters = ppm_reporters, + fasta = fasta, + acc_type = acc_type, + acc_pattern = acc_pattern, + fixedmods = fixedmods, + varmods = varmods, + enzyme = enzyme, + maxn_fasta_seqs = maxn_fasta_seqs, + maxn_vmods_setscombi = maxn_vmods_setscombi, + add_ms2theos = FALSE, + add_ms2theos2 = FALSE, + add_ms2moverzs = FALSE, + add_ms2ints = FALSE, + by_modules = by_modules, + digits = 4L) + file.rename(fi_aa2, fi_aa) file.rename(fi_mi2, fi_mi) ## mass calibration fs_mgf <- list.files(mgf_path, "^mgf_queries.*\\.rds$") - fi_ion <- file.path(out_path, "temp", "ion_matches_1.rds") - + fi_ion <- file.path(out_path, "temp", "prescores_1_1.rds") + if (!length(fs_mgf)) stop("No `mgf_queries` files found for calibrations.") if (!file.exists(fi_ion)) stop("No `ion_matches` files found for calibrations.") - df <- qs::qread(fi_ion) + df <- qs::qread(fi_ion) + df <- df[with(df, pep_prob <= .01), ] if (!"raw_file" %in% names(df)) stop("Column not found in search results: `raw_file`") @@ -414,9 +463,11 @@ calib_mgf <- function (mgf_path = NULL, aa_masses_all = NULL,out_path = NULL, #' @inheritParams calib_mgf calib_ms1 <- function (filename, df = NULL, mgf_path = NULL, out_path = NULL, ppm_ms1 = 20L, min_mass = 200L, max_mass = 4500L, - knots = 50L) + knots = 5L) { - mgfs <- qs::qread(file.path(mgf_path, filename)) + n_row <- nrow(df) + knots <- max(min(floor(n_row/20L), knots), 4L) + mgfs <- qs::qread(file.path(mgf_path, filename)) # subsets by minn_ms2 and ms1_int if (FALSE) { @@ -432,41 +483,79 @@ calib_ms1 <- function (filename, df = NULL, mgf_path = NULL, out_path = NULL, df <- df[minn_ok, ] } - # x[[1]]: no nested structures, as the first search is always - # against the all-fixed modifications - theo_ms1 <- lapply(df[["matches"]], function (x) { - attr(x[[1]], "theo_ms1", exact = TRUE) - }) - theo_ms1 <- .Internal(unlist(theo_ms1, recursive = FALSE, use.names = FALSE)) + diff_ms1 <- (df[["pep_exp_mr"]] - df[["theo_ms1"]])/df[["theo_ms1"]] * 1E6 + mdiff <- median(diff_ms1, na.rm = TRUE)/1E6 + + if (n_row <= 100L || mdiff <= 2e-6) { + mgfs[["ms1_mass"]] <- mgfs[["ms1_mass"]] - mdiff + post_calib(mgfs, min_mass, max_mass, mgf_path, filename) + .savecall <- TRUE + return(NULL) + } - diff_ms1 <- (df[["pep_exp_mr"]] - theo_ms1)/theo_ms1 * 1E6 - ret_time <- df[["pep_ret_range"]] + ret_time <- df[["pep_ret_range"]] ppm_ms1_bin <- calc_threeframe_ppm(ppm_ms1) - fit_ns <- tryCatch( - lm(diff_ms1 ~ splines::ns(ret_time, knots)), - error = function(e) NA) + fit_ns <- find_optns(diff_ms1, ret_time, knots) + fit_bs <- find_optbs(diff_ms1, ret_time, knots) + + if (all(is.na(fit_ns))) { + if (all(is.na(fit_bs))) { + mgfs[["ms1_mass"]] <- mgfs[["ms1_mass"]] - mdiff + post_calib(mgfs, min_mass, max_mass, mgf_path, filename) + .savecall <- TRUE + return(NULL) + } + else + fit_ns <- fit_bs + } + else if (all(is.na(fit_bs))) + fit_bs <- fit_ns - fit_bs <- tryCatch( - lm(diff_ms1 ~ splines::bs(ret_time, knots)), - error = function(e) NA) + bad_ns <- anyNA(coef(fit_ns)) + bad_bs <- anyNA(coef(fit_bs)) - res_ns <- if (class(fit_ns) == "lm") sum(resid(fit_ns)^2, na.rm = TRUE) else Inf - res_bs <- if (class(fit_bs) == "lm") sum(resid(fit_bs)^2, na.rm = TRUE) else Inf - fit <- if (res_ns <= res_bs) fit_ns else fit_bs + if (bad_ns) { + if (bad_bs) { + mgfs[["ms1_mass"]] <- mgfs[["ms1_mass"]] - mdiff + post_calib(mgfs, min_mass, max_mass, mgf_path, filename) + .savecall <- TRUE + return(NULL) + } + else + fit_ns <- fit_bs + } + else if (bad_bs) + fit_bs <- fit_ns + res_ns <- if (class(fit_ns) == "lm") mean(resid(fit_ns)^2, na.rm = TRUE) else Inf + res_bs <- if (class(fit_bs) == "lm") mean(resid(fit_bs)^2, na.rm = TRUE) else Inf + fit <- if (res_ns <= res_bs) fit_ns else fit_bs + # (keeps the original df$ms1_mass -> can later infer mass deltas) # charges <- get_ms1charges(df[["ms1_charge"]]) # df[["ms1_moverz"]] <- (df[["ms1_mass"]] + 1.00727647 * charges)/charges ## Update mgf rt <- mgfs[["ret_time"]] - oks_le <- rt >= min(ret_time, na.rm = TRUE) - oks_gr <- rt <= max(ret_time, na.rm = TRUE) + min_rt <- min(ret_time, na.rm = TRUE) + max_rt <- max(ret_time, na.rm = TRUE) + + if ((min_rt2 <- min_rt * 1.05) < (max_rt2 <- max_rt / 1.05)) { + oks_le <- rt >= min_rt2 + oks_gr <- rt <= max_rt2 + } + else { + oks_le <- rt >= min_rt + oks_gr <- rt <= max_rt + } + oks <- oks_le & oks_gr ms1err <- predict.lm(fit, newdata = data.frame(ret_time = rt[oks])) / 1E6 - ms1oks <- mgfs[["ms1_mass"]][oks] - mgfs[["ms1_mass"]][oks] <- ms1oks - ms1oks * ms1err + mgfs[["ms1_mass"]][oks] <- mgfs[["ms1_mass"]][oks] * (1 - ms1err) + + uls <- rt[!oks] + mgfs[["ms1_mass"]][uls] <- mgfs[["ms1_mass"]][uls] - mdiff # beyond the boundary of RT if (FALSE) { @@ -481,27 +570,118 @@ calib_ms1 <- function (filename, df = NULL, mgf_path = NULL, out_path = NULL, } ## update MGF - mgfs <- mgfs %>% - dplyr::arrange(ms1_mass) %>% - dplyr::filter(ms1_mass >= min_mass, ms1_mass <= max_mass) - # charges <- get_ms1charges(mgfs[["ms1_charge"]]) # mgfs[["ms1_moverz"]] <- (mgfs[["ms1_mass"]] + 1.00727647 * charges)/charges - + post_calib(mgfs, min_mass, max_mass, mgf_path, filename) .savecall <- TRUE + invisible(NULL) +} + + +#' Post MS1 calibrations. +#' +#' @param mgfs MGF data. +#' @param filename An MGF file name +#' @inheritParams matchMS +post_calib <- function (mgfs, min_mass, max_mass, mgf_path, filename) +{ + mgfs <- mgfs |> + dplyr::arrange(ms1_mass) |> + dplyr::filter(ms1_mass >= min_mass, ms1_mass <= max_mass) + qs::qsave(mgfs, file.path(mgf_path, filename), preset = "fast") } +#' Finds the optimal natural spline. +#' +#' @param diff_ms1 The differences between experimental and theoretical +#' precursor masses. +#' @param ret_time Retention timm. +#' @param knots The number of knots. +find_optns <- function (diff_ms1, ret_time, knots = 50L) +{ + if (knots <= 3L) + return(NA) + + ans <- tryCatch( + lm(diff_ms1 ~ splines::ns(ret_time, knots)), + error = function(e) NA) + + if (all(is.na(ans))) + return(NA) + + if (anyNA(coef(ans))) + find_optns(diff_ms1, ret_time, floor(knots/1.5)) + else + ans +} -#' Finds offsets in precursor masses. + +#' Finds the optimal natural spline. #' +#' @inheritParams find_optns +find_optbs <- function (diff_ms1, ret_time, knots = 50L) +{ + if (knots <= 3L) + return(NA) + + ans <- tryCatch( + lm(diff_ms1 ~ splines::bs(ret_time, knots)), + error = function(e) NA) + + if (all(is.na(ans))) + return(NA) + + if (anyNA(coef(ans))) + find_optbs(diff_ms1, ret_time, floor(knots/1.5)) + else + ans +} + + +#' Finds off-sets in precursor masses. +#' #' @inheritParams matchMS +#' @return A vector of mass off-sets. find_ms1_offsets <- function (n_13c = 0L, ms1_notches = 0) { + if (dups <- anyDuplicated(n_13c)) + stop("At least one duplicated values in `n_13c`: ", + n_13c[dups]) + + if (dups <- anyDuplicated(ms1_notches)) + stop("At least one duplicated values in `ms1_offsets`: ", + ms1_notches[dups]) + offsets_13c <- if (length(n_13c)) n_13c * 1.00335483 else NULL ms1_offsets <- unique(c(0, offsets_13c, ms1_notches)) - round(ms1_offsets, digits = 4L) + ms1_offsets <- round(ms1_offsets, digits = 4L) +} + + +#' Combines off-sets in precursor masses (notches and neutral losses). +#' +#' The return contains no information of Unimod titles and positions since it is +#' only used for pairing with experimental MGF data. +#' +#' @param ms1_offsets Precursor mass off-sets (notches, not neutral losses). +#' @inheritParams matchMS +#' @return A vector of mass off-sets. +comb_ms1_offsets <- function (ms1_offsets = 0, ms1_neulosses = NULL) +{ + if (dups <- anyDuplicated(ms1_neulosses)) + stop("At least one duplicated values in `ms1_neulosses`: ", + ms1_neulosses[dups]) + + if ((!(nnl <- length(ms1_neulosses))) || (nnl == 1L && nnl == 0)) + return(ms1_offsets) + + nls <- extract_umods(ms1_neulosses) + nls <- unique(unlist(lapply(nls, `[[`, "nl"))) + nls <- -nls[nls != 0] + + round(unique(c(ms1_offsets, nls)), digits = 4L) } diff --git a/R/quant2.R b/R/quant2.R index 4874c25..0836d12 100644 --- a/R/quant2.R +++ b/R/quant2.R @@ -5,7 +5,7 @@ #' @inheritParams matchMS #' @inheritParams calc_pepscores hcalc_tmtint <- function (df, quant = "tmt10", ppm_reporters = 10L, idx = 1L, - ms1_offsets = 0, out_path = NULL) + out_path = NULL) { df <- df[, c("raw_file", "pep_mod_group", "pep_scan_num", "rptr_moverz", "rptr_int")] diff --git a/R/scores.R b/R/scores.R index 310c3c9..a0fb244 100644 --- a/R/scores.R +++ b/R/scores.R @@ -598,8 +598,6 @@ calc_pepprobs_i <- function (df, topn_ms2ions = 100L, type_ms2ions = "by", #' Calculates the scores of peptides. #' #' @param tally_ms2ints Logical; tally MS2 intensities or not. -#' @param ms1_offsets Off-sets in precursor masses in relative to the original -#' values in MGFs. #' @inheritParams matchMS #' @import parallel calc_pepscores <- function (topn_ms2ions = 100L, type_ms2ions = "by", @@ -608,7 +606,7 @@ calc_pepscores <- function (topn_ms2ions = 100L, type_ms2ions = "by", soft_secions = FALSE, out_path = "~/mzion/outs", min_ms2mass = 115L, index_mgf_ms2 = FALSE, - tally_ms2ints = TRUE, ms1_offsets = 0, + tally_ms2ints = TRUE, mgf_path, maxn_vmods_per_pep = 5L, maxn_sites_per_vmod = 3L, maxn_vmods_sitescombi_per_pep = 64L, minn_ms2 = 6L, ppm_ms1 = 20L, quant = "none", ppm_reporters = 10, @@ -749,7 +747,6 @@ calc_pepscores <- function (topn_ms2ions = 100L, type_ms2ions = "by", d2 = d2, index_mgf_ms2 = index_mgf_ms2, tally_ms2ints = tally_ms2ints, - ms1_offsets = ms1_offsets, add_ms2theos = add_ms2theos, add_ms2theos2 = add_ms2theos2, add_ms2moverzs = add_ms2moverzs, @@ -830,20 +827,49 @@ order_fracs <- function (type = "list_table", tempdir, by_modules = TRUE) } +#' Order fractions +#' +#' Three layers: module_notch_frac +#' +#' @param type The type of files. +#' @param tempdir A temporary directory containing the files. +#' @param by_modules Logical; if TRUE, performs searches by modules. +order_fracs3 <- function (type = "list_table", tempdir, by_modules = TRUE) +{ + files <- if (by_modules) + list.files(tempdir, pattern = paste0("^", type, "_\\d+_\\d+_\\d+.*")) + else + list.files(tempdir, pattern = paste0("^", type, "_\\d+(_){0,1}\\d*.*")) + + # all NA's if by_modules = FALSE + l1 <- as.integer(gsub(paste0("^", type, "_(\\d+).*"), "\\1", files)) + l2 <- as.integer(gsub(paste0("^", type, "_\\d+_(\\d+).*"), "\\1", files)) + l3 <- as.integer(gsub(paste0("^", type, "_\\d+_\\d+_(\\d+).*"), "\\1", files)) + + df <- data.frame(l1 = l1, l2 = l2, l3 = l3) + df <- dplyr::arrange(df, l1, l2, l3) + df$fs <- with(df, paste(l1, l2, l3, sep = "_")) + df$fs <- paste0(type, "_", df$fs, ".rds") + + split(df$fs, df$l1) +} + + + #' Combines results from fractions #' -#' @param files A list of file names -#' @param tempdir A temporary directory containing the files +#' @param files A list of file names. +#' @param tempdir A temporary directory containing the files. +#' @param is_notched Logical; is a search with MS1 notches or not. #' @param sc_path An output path -#' @inheritParams calc_pepscores -combine_fracs <- function (files, tempdir, sc_path, ms1_offsets = 0) +combine_fracs <- function (files, tempdir, sc_path, is_notched = FALSE) { out_nm <- gsub("_\\d+\\.rds$", ".rds", files[[1]]) df <- lapply(file.path(tempdir, files), qs::qread) df <- dplyr::bind_rows(df) # uniq_id does not contain information of pep_ms1_offset - if (length(ms1_offsets) > 1L) + if (is_notched) df <- df[!duplicated.default(df[["uniq_id"]]), ] qs::qsave(df, file.path(sc_path, out_nm)) @@ -914,7 +940,7 @@ calcpepsc <- function (file, im_path, pep_fmod_all, pep_vmod_all, topn_ms2ions = 100L, type_ms2ions = "by", ppm_ms2 = 20L, soft_secions = FALSE, out_path = NULL, min_ms2mass = 115L, d2 = 1E-5, index_mgf_ms2 = FALSE, - tally_ms2ints = TRUE, ms1_offsets = 0, + tally_ms2ints = TRUE, add_ms2theos = FALSE, add_ms2theos2 = FALSE, add_ms2moverzs = FALSE, add_ms2ints = FALSE, quant = "none", ppm_reporters = 10, @@ -1042,7 +1068,6 @@ calcpepsc <- function (file, im_path, pep_fmod_all, pep_vmod_all, quant = quant, ppm_reporters = ppm_reporters, idx = idx, - ms1_offsets = ms1_offsets, out_path = im_path) qs::qsave(df[, cols_lt, drop = FALSE], @@ -1059,9 +1084,10 @@ calcpepsc <- function (file, im_path, pep_fmod_all, pep_vmod_all, #' Helper of \link{add_primatches} #' +#' @param is_notched Logical; is a search with MS1 notches or not. #' @inheritParams matchMS #' @inheritParams calc_pepscores -hadd_primatches <- function (out_path = NULL, ms1_offsets = 0, +hadd_primatches <- function (out_path = NULL, is_notched = FALSE, add_ms2theos = FALSE, add_ms2theos2 = FALSE, add_ms2moverzs = FALSE, add_ms2ints = FALSE, by_modules = TRUE, index_mgf_ms2 = FALSE) @@ -1119,7 +1145,7 @@ hadd_primatches <- function (out_path = NULL, ms1_offsets = 0, }, ms_files, names(ms_files)) lapply(order_fracs("reporters", tempdir, by_modules), - combine_fracs, tempdir, tempdir, ms1_offsets) + combine_fracs, tempdir, tempdir, is_notched) message("Completed theoretical MS2 m/z and intensity values: ", Sys.time()) @@ -1653,12 +1679,12 @@ find_probco_valley <- function (prob_cos, guess = 12L) #' Prepares target-decoy data. #' #' @param td A data frame of targets and decoys (for Percolator). -#' @param only_notch_zero Logical; if TRUE, use only data at \code{ms1_offsets = -#' 0}. +#' @param is_notched Logical; is a search with MS1 notches or not. +#' @param only_notch_zero Logical; if TRUE, use only data at the zero notch. #' @inheritParams matchMS prep_pepfdr_td <- function (td = NULL, out_path, enzyme = "trypsin_p", nes_fdr_group = "base", fdr_group = "base", - ms1_offsets = 0, only_notch_zero = TRUE) + is_notched = FALSE, only_notch_zero = TRUE) { files <- list.files(path = file.path(out_path, "temp"), pattern = "^pepscores_", full.names = TRUE) @@ -1672,8 +1698,8 @@ prep_pepfdr_td <- function (td = NULL, out_path, enzyme = "trypsin_p", td <- dplyr::bind_rows(td) } - if (length(ms1_offsets) > 1L && only_notch_zero) - td <- td[with(td, abs(pep_ms1_offset) <= 1e-4), ] + if (is_notched && only_notch_zero) + td <- td[with(td, pep_ms1_offset == 0), ] cts <- dplyr::count(dplyr::group_by(td, "pep_mod_group"), pep_mod_group) max_i <- cts$pep_mod_group[which.max(cts$n)[[1]]] @@ -1771,8 +1797,7 @@ keep_pepfdr_best <- function (td, cols = c("pep_scan_num", "raw_file")) #' @param target_fdr Numeric; the levels of false-discovery rate (FDR). #' @param fdr_type Character string; the type of FDR for controlling. #' @param fct_score A trivial factor converting p-values to scores. -#' @param ms1_offsets Off-sets in precursor masses in relative to the original -#' values in MGFs. +#' @param is_notched Logical; is a search with MS1 notches or not. #' @inheritParams matchMS #' @examples #' \donttest{ @@ -1787,7 +1812,7 @@ keep_pepfdr_best <- function (td, cols = c("pep_scan_num", "raw_file")) #' } #' } calc_pepfdr <- function (target_fdr = .01, fdr_type = "protein", - min_len = 7L, max_len = 40L, ms1_offsets = 0, + min_len = 7L, max_len = 40L, is_notched = FALSE, max_pepscores_co = 50, min_pepscores_co = 0, enzyme = "trypsin_p", fdr_group = "base", nes_fdr_group = "base", fct_score = 10, out_path) @@ -1798,7 +1823,7 @@ calc_pepfdr <- function (target_fdr = .01, fdr_type = "protein", enzyme = enzyme, nes_fdr_group = nes_fdr_group, fdr_group = fdr_group, - ms1_offsets = ms1_offsets, + is_notched = is_notched, only_notch_zero = TRUE) # back-compatibility to new column keys (e.g. scan_num -> pep_scan_num) @@ -1814,7 +1839,9 @@ calc_pepfdr <- function (target_fdr = .01, fdr_type = "protein", return(data.frame(pep_len = seqs, pep_prob_co = prob_cos)) } - td <- keep_pepfdr_best(td) + # not cols = c("pep_scan_num", "raw_file", "pep_ms1_offset") + # as only use td at pep_ms1_offset == 0 + td <- keep_pepfdr_best(td, cols = c("pep_scan_num", "raw_file")) qs::qsave(td, file.path(out_path, "temp", "td_pepfdr.rds"), preset = "fast") # --- @@ -2192,11 +2219,11 @@ post_pepfdr <- function (prob_cos = NULL, out_path = NULL) dplyr::mutate(pep_issig = ifelse(pep_prob <= pep_prob_co, TRUE, FALSE), pep_adjp = p.adjust(pep_prob, "BH")) - adjp_cos <- purrr::map_dbl(prob_cos$pep_prob_co, function (x) { + adjp_cos <- lapply(prob_cos$pep_prob_co, function (x) { row <- which.min(abs(log10(td$pep_prob/x))) td[row, ]$pep_adjp - }) - + }) + adjp_cos <- unlist(adjp_cos, recursive = FALSE, use.names = FALSE) prob_cos <- dplyr::bind_cols(prob_cos, pep_adjp_co = adjp_cos) td <- td |> @@ -2719,13 +2746,12 @@ match_ex2th2 <- function (expt, theo, min_ms2mass = 115L, d = 1E-5, #' @param out_path An output path. #' @param mod_indexes Integer; the indexes of fixed and/or variable #' modifications. -#' @param ms1_offsets Off-sets in precursor masses in relative to the original -#' values in MGFs. +#' @param is_notched Logical; is a search with MS1 notches or not. #' @inheritParams matchMS #' @rawNamespace import(data.table, except = c(last, first, between, transpose, #' melt, dcast)) calc_peploc <- function (x = NULL, out_path = NULL, mod_indexes = NULL, - ms1_offsets = 0, + is_notched = FALSE, locmods = c("Phospho (S)", "Phospho (T)", "Phospho (Y)"), topn_mods_per_seq = 3L, topn_seqs_per_query = 3L) { @@ -2757,7 +2783,7 @@ calc_peploc <- function (x = NULL, out_path = NULL, mod_indexes = NULL, # finds the rank-1 pep_ms1_offset at each query_id # keeps only the rank-1 pep_ms1_offset group at each query_id - if (length(ms1_offsets) > 1L) { + if (is_notched) { x[, query_id := paste(pep_isdecoy, pep_scan_num, raw_file, sep = ".")] if (para) { @@ -3054,24 +3080,18 @@ calcpeprank_3 <- function (x0) #' @param x A data.table object find_bestnotch <- function (x) { - x[, pep_rank0 := data.table::frank(-pep_score, ties.method = "min"), + # favors pep_ms1_offset == 0 + x <- data.table::rbindlist(list(x[x$pep_ms1_offset == 0], + x[x$pep_ms1_offset != 0]), + use.names = FALSE) + x[, pep_rank0 := data.table::frank(-pep_score, ties.method = "first"), by = list(query_id)] # the best query at each query_id g <- x[, c("pep_rank0", "query_id", "pep_ms1_offset")] g <- g[g$pep_rank0 == 1L, ] g[["pep_rank0"]] <- NULL - g[, id := seq_len(.N), by = query_id] - g <- g[g$id == 1L, ] - g[["id"]] <- NULL - if (FALSE) { - if (length(us <- unique(g$pep_ms1_offset)) > 1L) { - i <- if (length(ok <- which(abs(us) < 1e-4)) == 1L) ok else 1L - g <- g[g$pep_ms1_offset == us[[i]], ] - } - } - g[["keep"]] <- TRUE g[, uid := paste(query_id, pep_ms1_offset, sep = ".")] g[["pep_ms1_offset"]] <- g[["query_id"]] <- NULL diff --git a/R/unimods.R b/R/unimods.R index 60efe02..47a0a8f 100644 --- a/R/unimods.R +++ b/R/unimods.R @@ -76,64 +76,58 @@ parse_unimod <- function (unimod = "Carbamyl (M)") # (assumed) no space in `title` # title <- gsub("(.*)\\s\\([^\\(]*\\)$", "\\1", unimod) - title <- gsub("^([^ ]+?) .*", "\\1", unimod) - - pos_site <- unimod %>% - gsub("^[^ ]+", "", .) %>% - gsub("^[^\\(]+[\\(]*([^\\)]*)[\\)]*$", "\\1", .) + title <- gsub("^([^ ]+?) .*", "\\1", unimod) + pos_site <- gsub("^[^ ]+", "", unimod) + pos_site <- gsub("^[^\\(]+[\\(]*([^\\)]*)[\\)]*$", "\\1", pos_site) if (grepl("=", pos_site)) { - pos <- pos_site %>% - gsub("^([^=]+?)[=].*", "\\1", .) %>% - gsub("^[ ]*", "\\1", .) %>% - gsub(" *$", "", .) - - site <- pos_site %>% - gsub("^[^=]+?[=](.*)", "\\1", .) %>% - gsub("^[ ]*", "\\1", .) %>% - gsub(" *$", "", .) + pos <- gsub("^([^=]+?)[=].*", "\\1", pos_site) + pos <- gsub("^[ ]*", "\\1", pos) + pos <- gsub(" *$", "", pos) + site <- gsub("^[^=]+?[=](.*)", "\\1", pos_site) + site <- gsub("^[ ]*", "\\1", site) + site <- gsub(" *$", "", site) } else { - pos <- "." + pos <- "." site <- pos_site } - if (site == "") site = "." + if (site == "") + site = "." if (site %in% c("Protein N-term", "Protein C-term", "Anywhere N-term", "Anywhere C-term", "N-term", "C-term")) { - pos <- site - site <- site %>% gsub("^(Protein|Anywhere) ", "", .) + pos <- site + site <- gsub("^(Protein|Anywhere) ", "", site) } # standardize `position` pos <- gsub("^([NC]){1}-term", "Any \\1-term", pos) - if (pos %in% c(".", "")) pos <- "Anywhere" + if (pos %in% c(".", "")) + pos <- "Anywhere" pos_allowed <- c("Anywhere", "Protein N-term", "Protein C-term", "Any N-term", "Any C-term") if (! pos %in% pos_allowed) stop("`pos` needs to be one of ", - paste0("\n '", pos_allowed, collapse = "'"), - "'") - + paste0("\n '", pos_allowed, collapse = "'"), "'") + # standardize terminal sites if (site == ".") { - if (pos %in% c("Protein N-term", "Any N-term")) { + if (pos %in% c("Protein N-term", "Any N-term")) site <- "N-term" - } - else if (pos %in% c("Protein C-term", "Any C-term")) { + else if (pos %in% c("Protein C-term", "Any C-term")) site <- "C-term" - } } if (pos == "Anywhere" && site == ".") stop("'position' or 'site' cannot be both 'Anywhere'.") - invisible(list(title = title, position = pos, site = site)) + list(title = title, position = pos, site = site) } @@ -182,36 +176,26 @@ find_unimod <- function (unimod = "Carbamidomethyl (C)", node_delta <- xml2::xml_find_all(this_mod, "umod:delta") monomass <- as.numeric(xml2::xml_attr(node_delta, "mono_mass")) - if (!(length(monomass) == 1L)) + if (length(monomass) != 1L) stop("The length of `mono_mass` is not one.") # sites and positions node_specs <- xml2::xml_find_all(this_mod, "umod:specificity") attrs_specs <- xml2::xml_attrs(node_specs) - sites <- attrs_specs %>% - lapply(`[`, c("site")) %>% - unlist() - - positions <- attrs_specs %>% - lapply(`[`, c("position")) %>% - unlist() - + sites <- unlist(lapply(attrs_specs, `[`, "site")) + positions <- unlist(lapply(attrs_specs, `[`, "position")) names(sites) <- positions - idx_ps <- which(sites == site & positions == position) + idx_ps <- which(sites == site & positions == position) + sites <- sites[idx_ps] + empties <- unlist(lapply(sites, is.null)) + sites <- sites[!empties] - sites <- sites[idx_ps] %>% - .[purrr::map_lgl(., function (x) !is.null(x))] - - local({ - len_sites <- length(sites) - - if (len_sites > 1L) - stop("Multiple matches in site and position for '", unimod, "'.") - else if (len_sites == 0L) - stop("No matches in site and position for '", unimod, "'.") - }) + if ((len_sites <- length(sites)) > 1L) + stop("Multiple matches in site and position for '", unimod, "'.") + else if (len_sites == 0L) + stop("No matches in site and position for '", unimod, "'.") # neutral loss idxes_nl <- grep("NeutralLoss", node_specs) @@ -219,18 +203,16 @@ find_unimod <- function (unimod = "Carbamidomethyl (C)", if (length(idxes_nl)) { nodes_nl <- xml2::xml_children(node_specs[idxes_nl]) - - neulosses <- xml2::xml_attr(nodes_nl, "mono_mass") %>% - as.numeric() %>% - sort() # ensures the first is 0 + # ensures the first is 0 + neulosses <- sort(as.numeric(xml2::xml_attr(nodes_nl, "mono_mass"))) } else neulosses <- 0 - invisible(list(title = title, - monomass = monomass, - position_site = sites, - nl = neulosses)) + list(title = title, + monomass = monomass, + position_site = sites, + nl = neulosses) } @@ -261,51 +243,52 @@ hfind_unimod <- function (xml_files = c("master.xml", "custom.xml"), unimod) stop("Modification not found: '", title, "'.\n", "For example, use 'Acetyl' (title) instead of 'Acetylation' (full_name).") - invisible(this_mod) + this_mod } #' Tabulates \href{https://www.unimod.org/}{Unimod} entries. #' -#' For convenience summary of the \code{title}, \code{site} and -#' \code{position}. +#' For convenience summary of the \code{title}, \code{site} and \code{position}. #' #' @param out_nm A name to outputs. -#' @seealso \link{find_unimod}, \link{parse_unimod}. +#' @seealso \link{find_unimod}, \link{parse_unimod}, \link{add_unimod}, +#' \link{remove_unimod}, \link{remove_unimod_title}, +#' \link{calc_unimod_compmass}. #' @examples #' \donttest{ #' library(mzion) -#' +#' #' ans <- table_unimods() -#' -#' ## TMT-6, -10 and -11 plexes +#' +#' ## TMT-6, -10 and -11 plexes #' # share the same Unimod entry at title "TMT6plex" #' # (the same chemistry at tag mass 229.162932 Da) #' ans[with(ans, title == "TMT6plex"), ] #' this_mod1 <- parse_unimod("TMT6plex (Anywhere = K)") -#' +#' #' # Convenience title, "TMT10plex", alias to "TMT6plex" #' this_mod2 <- parse_unimod("TMT10plex (Anywhere = K)") -#' +#' #' # Title "TMT11plex" alias to "TMT6plex" #' this_mod3 <- parse_unimod("TMT11plex (Anywhere = K)") -#' +#' #' ## TMT-16 #' ans[with(ans, title == "TMTpro"), ] #' this_mod1 <- parse_unimod("TMTpro (Anywhere = K)") -#' +#' #' # Both "TMTpro16" and "TMT16plex" alias to "TMTpro" #' this_mod2 <- parse_unimod("TMTpro16 (Anywhere = K)") #' this_mod3 <- parse_unimod("TMT16plex (Anywhere = K)") -#' +#' #' ## Summary of TMT entries and alias #' ans[with(ans, grepl("^TMT", title)), ] -#' -#' +#' +#' #' ## Special characters in the title (e.g., "->") #' ans[with(ans, grepl("^Gln->pyro", title)), ] #' this_mod <- parse_unimod("Gln->pryo-Glu (N-term = Q)") -#' +#' #' } #' @export table_unimods <- function (out_nm = "~/mzion/unimods.txt") @@ -643,7 +626,7 @@ add_unimod <- function (header = c(title = "Foo", full_name = "Foo bar"), } local({ - if (!(length(site) == 1L)) + if (length(site) != 1L) stop("The length of `site` is not one.") if (!(length(position) == 1L)) @@ -709,9 +692,10 @@ add_unimod <- function (header = c(title = "Foo", full_name = "Foo bar"), local({ this_delta <- xml2::xml_find_all(this_mod, "umod:delta") - - stopifnot(length(this_delta) == 1L) + if (length(this_delta) != 1L) + stop("Multiple or no matches.") + this_mono_mass <- xml2::xml_attr(this_delta, "mono_mass") this_avge_mass <- xml2::xml_attr(this_delta, "avge_mass") this_composition <- xml2::xml_attr(this_delta, "composition") @@ -759,9 +743,7 @@ add_unimod <- function (header = c(title = "Foo", full_name = "Foo bar"), positions <- unlist(lapply(attrs_children, `[`, c("position"))) ok_sitepos <- which((sites == site) & (positions == position)) - len_sitepos <- length(ok_sitepos) - - if (len_sitepos > 1L) + if ((len_sitepos <- length(ok_sitepos)) > 1L) stop("Multiple matches to `site = ", site, "` and ", "`position = ", position, "` at ", "`title = ", title, "`.\n", @@ -1129,15 +1111,11 @@ remove_unimod <- function (header = c(title = "Foo", full_name = "Foo bar"), # title idx_title <- which(xml2::xml_attr(modifications, "title") == title) - local({ - len_title <- length(idx_title) - - if (len_title > 1L) - stop("Multiple matches to `", title, "`.\n", - "Fix the redundancy from ", xml_file, ".") - else if (!len_title) - stop("Entry `", title, "` not found.") - }) + if ((len_title <- length(idx_title)) > 1L) + stop("Multiple matches to `", title, "`.\n", + "Fix the redundancy from ", xml_file, ".") + else if (!len_title) + stop("Entry `", title, "` not found.") # `site` and `position` this_mod <- modifications[[idx_title]] @@ -1159,19 +1137,15 @@ remove_unimod <- function (header = c(title = "Foo", full_name = "Foo bar"), positions <- unlist(lapply(attrs_mod_ch, `[`, c("position"))) ok_sitepos <- which((sites == site) & (positions == position)) - local({ - len_sitepos <- length(ok_sitepos) - - if (!len_sitepos) - stop("No matches to `site = ", site, "` and ", - "`position = ", position, "` at ", - "`title = ", title, "`.\n") - else if (len_sitepos > 1L) - stop("Multiple matches to `site = ", site, "` and ", - "`position = ", position, "` at ", - "`title = ", title, "`.\n", - "Fix the redundancy from ", xml_file, ".") - }) + if (!(len_sitepos <- length(ok_sitepos))) + stop("No matches to `site = ", site, "` and ", + "`position = ", position, "` at ", + "`title = ", title, "`.\n") + else if (len_sitepos > 1L) + stop("Multiple matches to `site = ", site, "` and ", + "`position = ", position, "` at ", + "`title = ", title, "`.\n", + "Fix the redundancy from ", xml_file, ".") this_spec <- nodes_mod_ch[ok_sitepos] @@ -1208,14 +1182,10 @@ remove_unimod <- function (header = c(title = "Foo", full_name = "Foo bar"), neuloss_mono_masses <- unlist(lapply(attrs_neuloss, `[`, c("mono_mass"))) idx_neuloss <- which((neuloss_mono_masses == neuloss_mono_mass)) - local({ - len_neuloss <- length(idx_neuloss) - - if (len_neuloss > 1L) - stop("Multiple matches to the `mono_mass` in `neuloss`.") - else if (!len_neuloss) - stop("No matches to the `mono_mass` in `neuloss`.") - }) + if ((len_neuloss <- length(idx_neuloss)) > 1L) + stop("Multiple matches to the `mono_mass` in `neuloss`.") + else if (!len_neuloss) + stop("No matches to the `mono_mass` in `neuloss`.") this_neuloss <- nodes_neuloss[idx_neuloss] xml2::xml_remove(this_neuloss) @@ -1304,9 +1274,7 @@ remove_unimod_title <- function (title = NULL) idx <- which(xml2::xml_attr(modifications, "title") == title) - len <- length(idx) - - if (len > 1L) + if ((len <- length(idx)) > 1L) stop("Multiple matches to `", title, "`.\n", "Fix the redundancy from ", xml_file, ".") else if (!len) @@ -1378,12 +1346,12 @@ parse_unimod_composition <- function (composition = "H(4) C O S") { options(digits = 9L) - df <- composition %>% - stringr::str_replace_all("([:alnum:]+)$", paste0("\\1", "(1)")) %>% - stringr::str_replace_all("([:alnum:]+) ", paste0("\\1", "(1) ")) %>% - gsub("\\s+", "", .) %>% - stringr::str_match_all("(.*?)\\((-*[0-9]+)\\)") %>% - `[[`(1) + df <- composition |> + stringr::str_replace_all("([:alnum:]+)$", paste0("\\1", "(1)")) |> + stringr::str_replace_all("([:alnum:]+) ", paste0("\\1", "(1) ")) + df <- gsub("\\s+", "", df) + df <- stringr::str_match_all(df, "(.*?)\\((-*[0-9]+)\\)") + df <- df[[1]] df <- df[, -1, drop = FALSE] colnames(df) <- c("symbol", "number") diff --git a/inst/extdata/custom.xml b/inst/extdata/custom.xml index 9946ed3..529bf7b 100644 --- a/inst/extdata/custom.xml +++ b/inst/extdata/custom.xml @@ -259,5 +259,38 @@ + + + + + + + + + diff --git a/man/calc_pepfdr.Rd b/man/calc_pepfdr.Rd index 036e648..072eae0 100644 --- a/man/calc_pepfdr.Rd +++ b/man/calc_pepfdr.Rd @@ -9,7 +9,7 @@ calc_pepfdr( fdr_type = "protein", min_len = 7L, max_len = 40L, - ms1_offsets = 0, + is_notched = FALSE, max_pepscores_co = 50, min_pepscores_co = 0, enzyme = "trypsin_p", @@ -30,8 +30,7 @@ for considerations. Shorter peptides will be excluded. The default is 7.} \item{max_len}{A positive integer; the maximum length of peptide sequences for considerations. Longer peptides will be excluded. The default is 40.} -\item{ms1_offsets}{Off-sets in precursor masses in relative to the original -values in MGFs.} +\item{is_notched}{Logical; is a search with MS1 notches or not.} \item{max_pepscores_co}{A positive numeric; the upper limit in the cut-offs of peptide scores for discriminating significant and insignificant diff --git a/man/calc_peploc.Rd b/man/calc_peploc.Rd index 0ef8fd1..1102908 100644 --- a/man/calc_peploc.Rd +++ b/man/calc_peploc.Rd @@ -8,7 +8,7 @@ calc_peploc( x = NULL, out_path = NULL, mod_indexes = NULL, - ms1_offsets = 0, + is_notched = FALSE, locmods = c("Phospho (S)", "Phospho (T)", "Phospho (Y)"), topn_mods_per_seq = 3L, topn_seqs_per_query = 3L @@ -22,8 +22,7 @@ calc_peploc( \item{mod_indexes}{Integer; the indexes of fixed and/or variable modifications.} -\item{ms1_offsets}{Off-sets in precursor masses in relative to the original -values in MGFs.} +\item{is_notched}{Logical; is a search with MS1 notches or not.} \item{locmods}{Among \code{varmods} for the consideration of localization probabilities; for instance, \code{locmods = NULL} for nothing, diff --git a/man/calc_pepscores.Rd b/man/calc_pepscores.Rd index cb98b2b..2d0970c 100644 --- a/man/calc_pepscores.Rd +++ b/man/calc_pepscores.Rd @@ -16,7 +16,6 @@ calc_pepscores( min_ms2mass = 115L, index_mgf_ms2 = FALSE, tally_ms2ints = TRUE, - ms1_offsets = 0, mgf_path, maxn_vmods_per_pep = 5L, maxn_sites_per_vmod = 3L, @@ -90,9 +89,6 @@ interrogation. The default is 110.} \item{tally_ms2ints}{Logical; tally MS2 intensities or not.} -\item{ms1_offsets}{Off-sets in precursor masses in relative to the original -values in MGFs.} - \item{mgf_path}{A file path to a list of MGF files. The experimenter needs to supply the files. diff --git a/man/calcpepsc.Rd b/man/calcpepsc.Rd index b9ddc52..ef832cd 100644 --- a/man/calcpepsc.Rd +++ b/man/calcpepsc.Rd @@ -18,7 +18,6 @@ calcpepsc( d2 = 1e-05, index_mgf_ms2 = FALSE, tally_ms2ints = TRUE, - ms1_offsets = 0, add_ms2theos = FALSE, add_ms2theos2 = FALSE, add_ms2moverzs = FALSE, @@ -79,9 +78,6 @@ interrogation. The default is 110.} \item{tally_ms2ints}{Logical; tally MS2 intensities or not.} -\item{ms1_offsets}{Off-sets in precursor masses in relative to the original -values in MGFs.} - \item{add_ms2theos}{Logical. If true, adds the sequence of primary theoretical MS2 m/z values (\code{pep_ms2_theos}). The sequence order at a given \code{type_ms2ions} is: diff --git a/man/calib_mgf.Rd b/man/calib_mgf.Rd index c7b5e90..0d9fb52 100644 --- a/man/calib_mgf.Rd +++ b/man/calib_mgf.Rd @@ -39,7 +39,7 @@ calib_mgf( min_len = 7L, max_len = 40L, max_miss = 2L, - knots = 50L + knots = 5L ) } \arguments{ diff --git a/man/calib_ms1.Rd b/man/calib_ms1.Rd index c8ddea7..da9fbd6 100644 --- a/man/calib_ms1.Rd +++ b/man/calib_ms1.Rd @@ -12,7 +12,7 @@ calib_ms1( ppm_ms1 = 20L, min_mass = 200L, max_mass = 4500L, - knots = 50L + knots = 5L ) } \arguments{ diff --git a/man/check_locmods.Rd b/man/check_locmods.Rd index 75688a0..297e366 100644 --- a/man/check_locmods.Rd +++ b/man/check_locmods.Rd @@ -4,9 +4,20 @@ \alias{check_locmods} \title{Checks \code{locmods}} \usage{ -check_locmods(fixedmods, varmods, locmods) +check_locmods(locmods, fixedmods, varmods, ms1_neulosses = NULL) } \arguments{ +\item{locmods}{Among \code{varmods} for the consideration of localization + probabilities; for instance, \code{locmods = NULL} for nothing, + \code{locmods = c("Phospho (S)", "Phospho (T)", "Phospho (Y)")} for + phosphopeptides, \code{locmods = "Acetyl (K)"} for lysine acetylation. + \code{fixedmods} that were coerced to \code{varmods} will be added + automatically to \code{locmods}. + + For convenience, the default is set to look for applicable peptide + phosphorylation (and may encounter warning messages if the data type is + different to the default).} + \item{fixedmods}{Character string(s) of fixed modifications.} \item{varmods}{Character string(s) of variable modifications. Multiple @@ -20,16 +31,13 @@ check_locmods(fixedmods, varmods, locmods) \link{parse_unimod} for grammars of modification \code{title}, \code{position} and \code{site}.} -\item{locmods}{Among \code{varmods} for the consideration of localization - probabilities; for instance, \code{locmods = NULL} for nothing, - \code{locmods = c("Phospho (S)", "Phospho (T)", "Phospho (Y)")} for - phosphopeptides, \code{locmods = "Acetyl (K)"} for lysine acetylation. - \code{fixedmods} that were coerced to \code{varmods} will be added - automatically to \code{locmods}. +\item{ms1_neulosses}{Character string(s) specifying the neutral losses of + precursors. The nomenclature is the same as those in argument + \code{varmods}, e.g., \code{c("Phospho (S)", "Phospho (T)", "Phospho + (Y)")}. - For convenience, the default is set to look for applicable peptide - phosphorylation (and may encounter warning messages if the data type is - different to the default).} + The argument is a simplified (narrower) usage of argument + \code{ms1_notches} with additional specificity in modifications and sites.} } \description{ Coerced \code{fixedmods} not considered. diff --git a/man/check_notches.Rd b/man/check_notches.Rd new file mode 100644 index 0000000..fee4952 --- /dev/null +++ b/man/check_notches.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/msmsmatches.R +\name{check_notches} +\alias{check_notches} +\title{Checks the compatibility between ms1_notches and ms1_neulosses.} +\usage{ +check_notches(ms1_notches, ms1_neulosses) +} +\arguments{ +\item{ms1_notches}{A numeric vector; notches (off-sets) in precursor masses, +e.g., \code{c(-97.976896)} to account fo the loss of a phosphoric acid in +precursor masses.} + +\item{ms1_neulosses}{Character string(s) specifying the neutral losses of + precursors. The nomenclature is the same as those in argument + \code{varmods}, e.g., \code{c("Phospho (S)", "Phospho (T)", "Phospho + (Y)")}. + + The argument is a simplified (narrower) usage of argument + \code{ms1_notches} with additional specificity in modifications and sites.} +} +\description{ +Checks the compatibility between ms1_notches and ms1_neulosses. +} diff --git a/man/comb_ms1_offsets.Rd b/man/comb_ms1_offsets.Rd new file mode 100644 index 0000000..297b5d9 --- /dev/null +++ b/man/comb_ms1_offsets.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/msmsmatches2.R +\name{comb_ms1_offsets} +\alias{comb_ms1_offsets} +\title{Combines off-sets in precursor masses (notches and neutral losses).} +\usage{ +comb_ms1_offsets(ms1_offsets = 0, ms1_neulosses = NULL) +} +\arguments{ +\item{ms1_offsets}{Precursor mass off-sets (notches, not neutral losses).} + +\item{ms1_neulosses}{Character string(s) specifying the neutral losses of + precursors. The nomenclature is the same as those in argument + \code{varmods}, e.g., \code{c("Phospho (S)", "Phospho (T)", "Phospho + (Y)")}. + + The argument is a simplified (narrower) usage of argument + \code{ms1_notches} with additional specificity in modifications and sites.} +} +\value{ +A vector of mass off-sets. +} +\description{ +The return contains no information of Unimod titles and positions since it is +only used for pairing with experimental MGF data. +} diff --git a/man/combine_fracs.Rd b/man/combine_fracs.Rd index 8ac891a..f858fd8 100644 --- a/man/combine_fracs.Rd +++ b/man/combine_fracs.Rd @@ -4,17 +4,16 @@ \alias{combine_fracs} \title{Combines results from fractions} \usage{ -combine_fracs(files, tempdir, sc_path, ms1_offsets = 0) +combine_fracs(files, tempdir, sc_path, is_notched = FALSE) } \arguments{ -\item{files}{A list of file names} +\item{files}{A list of file names.} -\item{tempdir}{A temporary directory containing the files} +\item{tempdir}{A temporary directory containing the files.} \item{sc_path}{An output path} -\item{ms1_offsets}{Off-sets in precursor masses in relative to the original -values in MGFs.} +\item{is_notched}{Logical; is a search with MS1 notches or not.} } \description{ Combines results from fractions diff --git a/man/find_ms1_offsets.Rd b/man/find_ms1_offsets.Rd index 295967b..6222cd4 100644 --- a/man/find_ms1_offsets.Rd +++ b/man/find_ms1_offsets.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/msmsmatches2.R \name{find_ms1_offsets} \alias{find_ms1_offsets} -\title{Finds offsets in precursor masses.} +\title{Finds off-sets in precursor masses.} \usage{ find_ms1_offsets(n_13c = 0L, ms1_notches = 0) } @@ -11,9 +11,12 @@ find_ms1_offsets(n_13c = 0L, ms1_notches = 0) the range of \code{-1:2}. The default is 0.} \item{ms1_notches}{A numeric vector; notches (off-sets) in precursor masses, -e.g., \code{c(-79.966331, -97.976896)} to account fo the loss of a phospho -group and phosphoric acid in precursor masses.} +e.g., \code{c(-97.976896)} to account fo the loss of a phosphoric acid in +precursor masses.} +} +\value{ +A vector of mass off-sets. } \description{ -Finds offsets in precursor masses. +Finds off-sets in precursor masses. } diff --git a/man/find_optbs.Rd b/man/find_optbs.Rd new file mode 100644 index 0000000..e309624 --- /dev/null +++ b/man/find_optbs.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/msmsmatches2.R +\name{find_optbs} +\alias{find_optbs} +\title{Finds the optimal natural spline.} +\usage{ +find_optbs(diff_ms1, ret_time, knots = 50L) +} +\arguments{ +\item{diff_ms1}{The differences between experimental and theoretical +precursor masses.} + +\item{ret_time}{Retention timm.} + +\item{knots}{The number of knots.} +} +\description{ +Finds the optimal natural spline. +} diff --git a/man/find_optns.Rd b/man/find_optns.Rd new file mode 100644 index 0000000..b356e8f --- /dev/null +++ b/man/find_optns.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/msmsmatches2.R +\name{find_optns} +\alias{find_optns} +\title{Finds the optimal natural spline.} +\usage{ +find_optns(diff_ms1, ret_time, knots = 50L) +} +\arguments{ +\item{diff_ms1}{The differences between experimental and theoretical +precursor masses.} + +\item{ret_time}{Retention timm.} + +\item{knots}{The number of knots.} +} +\description{ +Finds the optimal natural spline. +} diff --git a/man/hadd_primatches.Rd b/man/hadd_primatches.Rd index 5ba3f4b..7df0f78 100644 --- a/man/hadd_primatches.Rd +++ b/man/hadd_primatches.Rd @@ -6,7 +6,7 @@ \usage{ hadd_primatches( out_path = NULL, - ms1_offsets = 0, + is_notched = FALSE, add_ms2theos = FALSE, add_ms2theos2 = FALSE, add_ms2moverzs = FALSE, @@ -18,8 +18,7 @@ hadd_primatches( \arguments{ \item{out_path}{A file path of outputs.} -\item{ms1_offsets}{Off-sets in precursor masses in relative to the original -values in MGFs.} +\item{is_notched}{Logical; is a search with MS1 notches or not.} \item{add_ms2theos}{Logical. If true, adds the sequence of primary theoretical MS2 m/z values (\code{pep_ms2_theos}). The sequence order at a diff --git a/man/hcalc_tmtint.Rd b/man/hcalc_tmtint.Rd index 10914d0..2bda578 100644 --- a/man/hcalc_tmtint.Rd +++ b/man/hcalc_tmtint.Rd @@ -9,7 +9,6 @@ hcalc_tmtint( quant = "tmt10", ppm_reporters = 10L, idx = 1L, - ms1_offsets = 0, out_path = NULL ) } @@ -30,9 +29,6 @@ ions. The default is 10.} \item{idx}{The i-th chunk} -\item{ms1_offsets}{Off-sets in precursor masses in relative to the original -values in MGFs.} - \item{out_path}{A file path of outputs.} } \description{ diff --git a/man/hms2match.Rd b/man/hms2match.Rd index 7b5ca1f..9fa6721 100644 --- a/man/hms2match.Rd +++ b/man/hms2match.Rd @@ -9,6 +9,9 @@ hms2match( funs_ms2, ms1vmods_all, ms2vmods_all, + ms1_neulosses = NULL, + maxn_neulosses_fnl = 1L, + maxn_neulosses_vnl = 1L, mod_indexes, mgf_path, out_path, @@ -39,6 +42,26 @@ correspondence to \code{aa_masses_all}} \item{ms2vmods_all}{All possible labels of MS2 variable modifications in correspondence to \code{aa_masses_all}} +\item{ms1_neulosses}{Character string(s) specifying the neutral losses of + precursors. The nomenclature is the same as those in argument + \code{varmods}, e.g., \code{c("Phospho (S)", "Phospho (T)", "Phospho + (Y)")}. + + The argument is a simplified (narrower) usage of argument + \code{ms1_notches} with additional specificity in modifications and sites.} + +\item{maxn_neulosses_fnl}{A positive integer used in conjunction with +arguments \code{ms1_notches} and \code{ms1_neulosses}. The maximum number +of fixed MS2 neutral losses to be considered when searching against peak +lists with MS1 mass off-sets. The default is two (one MS2 neutral loss in +addition to 0).} + +\item{maxn_neulosses_vnl}{A positive integer used in conjunction with +arguments \code{ms1_notches} and \code{ms1_neulosses}. The maximum number +of variable MS2 neutral losses to be considered when searching against peak +lists with MS1 mass off-sets. The default is two (one MS2 neutral loss in +addition to 0).} + \item{mod_indexes}{Integer; the indexes of fixed and/or variable modifications} diff --git a/man/hms2match_one.Rd b/man/hms2match_one.Rd index 0a30a3a..e39e3de 100644 --- a/man/hms2match_one.Rd +++ b/man/hms2match_one.Rd @@ -6,12 +6,16 @@ \usage{ hms2match_one( pep_mod_group, - mgths, + nms_theo, + nms_expt, aa_masses, FUN, ms1vmods, ms2vmods, cl, + ms1_neulosses = NULL, + maxn_neulosses_fnl = 1L, + maxn_neulosses_vnl = 1L, mod_indexes, mgf_path, out_path, @@ -32,7 +36,9 @@ hms2match_one( \arguments{ \item{pep_mod_group}{The index of peptide modification groups.} -\item{mgths}{Pairs of experimental and theoretical data.} +\item{nms_theo}{Names of theoretical files.} + +\item{nms_expt}{Names of experimental files.} \item{aa_masses}{An amino-acid look-up.} @@ -47,6 +53,26 @@ an i-t \code{aa_masses}.} \item{cl}{The value of clusters for parallel processes.} +\item{ms1_neulosses}{Character string(s) specifying the neutral losses of + precursors. The nomenclature is the same as those in argument + \code{varmods}, e.g., \code{c("Phospho (S)", "Phospho (T)", "Phospho + (Y)")}. + + The argument is a simplified (narrower) usage of argument + \code{ms1_notches} with additional specificity in modifications and sites.} + +\item{maxn_neulosses_fnl}{A positive integer used in conjunction with +arguments \code{ms1_notches} and \code{ms1_neulosses}. The maximum number +of fixed MS2 neutral losses to be considered when searching against peak +lists with MS1 mass off-sets. The default is two (one MS2 neutral loss in +addition to 0).} + +\item{maxn_neulosses_vnl}{A positive integer used in conjunction with +arguments \code{ms1_notches} and \code{ms1_neulosses}. The maximum number +of variable MS2 neutral losses to be considered when searching against peak +lists with MS1 mass off-sets. The default is two (one MS2 neutral loss in +addition to 0).} + \item{mod_indexes}{Integer; the indexes of fixed and/or variable modifications} diff --git a/man/matchMS.Rd b/man/matchMS.Rd index 72de881..956ef59 100644 --- a/man/matchMS.Rd +++ b/man/matchMS.Rd @@ -15,6 +15,9 @@ matchMS( varmods = c("Acetyl (Protein N-term)", "Oxidation (M)", "Deamidated (N)", "Gln->pyro-Glu (N-term = Q)"), rm_dup_term_anywhere = TRUE, + ms1_neulosses = NULL, + maxn_neulosses_fnl = 2L, + maxn_neulosses_vnl = 2L, fixedlabs = NULL, varlabs = NULL, locmods = c("Phospho (S)", "Phospho (T)", "Phospho (Y)"), @@ -144,6 +147,26 @@ c("uniprot_acc", "uniprot_id", "refseq_acc", "other"). See also variable modifications with site(s) in positions of both terminal and anywhere, e.g., "Gln->pyro-Glu (N-term = Q)" and "Deamidated (Q).} +\item{ms1_neulosses}{Character string(s) specifying the neutral losses of + precursors. The nomenclature is the same as those in argument + \code{varmods}, e.g., \code{c("Phospho (S)", "Phospho (T)", "Phospho + (Y)")}. + + The argument is a simplified (narrower) usage of argument + \code{ms1_notches} with additional specificity in modifications and sites.} + +\item{maxn_neulosses_fnl}{A positive integer used in conjunction with +arguments \code{ms1_notches} and \code{ms1_neulosses}. The maximum number +of fixed MS2 neutral losses to be considered when searching against peak +lists with MS1 mass off-sets. The default is two (one MS2 neutral loss in +addition to 0).} + +\item{maxn_neulosses_vnl}{A positive integer used in conjunction with +arguments \code{ms1_notches} and \code{ms1_neulosses}. The maximum number +of variable MS2 neutral losses to be considered when searching against peak +lists with MS1 mass off-sets. The default is two (one MS2 neutral loss in +addition to 0).} + \item{fixedlabs}{Character string(s) of fixed isotopic labels. See examples of SILAC for details. Can be but not typically used in standard alone searches of labeled residues.} @@ -271,8 +294,8 @@ default is 20.} the range of \code{-1:2}. The default is 0.} \item{ms1_notches}{A numeric vector; notches (off-sets) in precursor masses, -e.g., \code{c(-79.966331, -97.976896)} to account fo the loss of a phospho -group and phosphoric acid in precursor masses.} +e.g., \code{c(-97.976896)} to account fo the loss of a phosphoric acid in +precursor masses.} \item{par_groups}{A low-priority feature. Parameter(s) of \code{matchMS} multiplied by sets of values in groups. Multiple searches will be performed @@ -671,9 +694,9 @@ matchMS( matchMS( fixedmods = c("TMTpro (N-term)", "TMTpro (K)", "Carbamidomethyl (C)"), varmods = c("Acetyl (Protein N-term)", "Oxidation (M)", - "Deamidated (N)", "Gln->pyro-Glu (N-term = Q)"), + "Deamidated (N)", "Deamidated (Q)", + "Gln->pyro-Glu (N-term = Q)"), quant = "tmt18", - fdr_type = "psm", out_path = "~/mzion/examples", ) diff --git a/man/mprepBrukerMGF.Rd b/man/mprepBrukerMGF.Rd index 0c62269..56ee95c 100644 --- a/man/mprepBrukerMGF.Rd +++ b/man/mprepBrukerMGF.Rd @@ -4,7 +4,7 @@ \alias{mprepBrukerMGF} \title{Parallel \link{prepBrukerMGF}} \usage{ -mprepBrukerMGF(filepath, n_cores = 48L) +mprepBrukerMGF(filepath, n_cores = 32L) } \arguments{ \item{filepath}{A file path to MGF.} diff --git a/man/ms2match.Rd b/man/ms2match.Rd index edf591f..af5436e 100644 --- a/man/ms2match.Rd +++ b/man/ms2match.Rd @@ -13,8 +13,8 @@ ms2match( type_ms2ions = "by", maxn_vmods_per_pep = 5L, maxn_sites_per_vmod = 3L, - maxn_fnl_per_seq = 64L, - maxn_vnl_per_seq = 64L, + maxn_fnl_per_seq = 3L, + maxn_vnl_per_seq = 3L, maxn_vmods_sitescombi_per_pep = 64L, minn_ms2 = 6L, ppm_ms1 = 20L, @@ -27,6 +27,9 @@ ms2match( by_modules = TRUE, reframe_mgfs = FALSE, ms1_offsets = 0, + ms1_neulosses = NULL, + maxn_neulosses_fnl = 1L, + maxn_neulosses_vnl = 1L, fasta, acc_type, acc_pattern, @@ -140,6 +143,26 @@ FALSE, search all modules together. The later would probably need more than \item{ms1_offsets}{The MS1 off-sets.} +\item{ms1_neulosses}{Character string(s) specifying the neutral losses of + precursors. The nomenclature is the same as those in argument + \code{varmods}, e.g., \code{c("Phospho (S)", "Phospho (T)", "Phospho + (Y)")}. + + The argument is a simplified (narrower) usage of argument + \code{ms1_notches} with additional specificity in modifications and sites.} + +\item{maxn_neulosses_fnl}{A positive integer used in conjunction with +arguments \code{ms1_notches} and \code{ms1_neulosses}. The maximum number +of fixed MS2 neutral losses to be considered when searching against peak +lists with MS1 mass off-sets. The default is two (one MS2 neutral loss in +addition to 0).} + +\item{maxn_neulosses_vnl}{A positive integer used in conjunction with +arguments \code{ms1_notches} and \code{ms1_neulosses}. The maximum number +of variable MS2 neutral losses to be considered when searching against peak +lists with MS1 mass off-sets. The default is two (one MS2 neutral loss in +addition to 0).} + \item{fasta}{Character string(s) to the name(s) of fasta file(s) with prepended directory path. The experimenter needs to supply the files.} diff --git a/man/ms2match_one.Rd b/man/ms2match_one.Rd index 2e49a4e..680508e 100644 --- a/man/ms2match_one.Rd +++ b/man/ms2match_one.Rd @@ -2,10 +2,11 @@ % Please edit documentation in R/ms2frames.R \name{ms2match_one} \alias{ms2match_one} -\title{Matches experimentals and theoreticals} +\title{Matches experimentals and theoreticals.} \usage{ ms2match_one( - mgth, + nms_theo, + nms_expt, pep_mod_group, aa_masses, FUN, @@ -30,7 +31,9 @@ ms2match_one( ) } \arguments{ -\item{mgth}{MGF and theoretical pairs} +\item{nms_theo}{Names of theoretical files.} + +\item{nms_expt}{Names of experimental files.} \item{pep_mod_group}{The index of peptide modification groups} @@ -126,5 +129,5 @@ interrogation. The default is 110.} \item{df0}{An output template} } \description{ -For a single module +For a single module and single notch. } diff --git a/man/order_fracs3.Rd b/man/order_fracs3.Rd new file mode 100644 index 0000000..23d228e --- /dev/null +++ b/man/order_fracs3.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/scores.R +\name{order_fracs3} +\alias{order_fracs3} +\title{Order fractions} +\usage{ +order_fracs3(type = "list_table", tempdir, by_modules = TRUE) +} +\arguments{ +\item{type}{The type of files.} + +\item{tempdir}{A temporary directory containing the files.} + +\item{by_modules}{Logical; if TRUE, performs searches by modules.} +} +\description{ +Three layers: module_notch_frac +} diff --git a/man/post_calib.Rd b/man/post_calib.Rd new file mode 100644 index 0000000..5bc9097 --- /dev/null +++ b/man/post_calib.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/msmsmatches2.R +\name{post_calib} +\alias{post_calib} +\title{Post MS1 calibrations.} +\usage{ +post_calib(mgfs, min_mass, max_mass, mgf_path, filename) +} +\arguments{ +\item{mgfs}{MGF data.} + +\item{min_mass}{A positive integer; the minimum precursor mass for +interrogation. The default is an arbitrarily low value (the primary guard +against low molecular-weight precursors is \code{min_len}).} + +\item{max_mass}{A positive integer; the maximum precursor mass for +interrogation.} + +\item{mgf_path}{A file path to a list of MGF files. The experimenter needs to + supply the files. + + The supported MGFs are in the formats of (1) MSConvert against \code{.raw} + from Thermo's Orbitrap or \code{.d} from Bruker's timsTOF Pro, (2) Thermo's + Proteome Discoverer or (3) Bruker's DataAnalysis. + + With MSConvert, the default \code{titleMaker} is required for correct + parsing (don't think it can be altered by users, but just in case).} + +\item{filename}{An MGF file name} +} +\description{ +Post MS1 calibrations. +} diff --git a/man/prep_pepfdr_td.Rd b/man/prep_pepfdr_td.Rd index 71faa51..ad37b02 100644 --- a/man/prep_pepfdr_td.Rd +++ b/man/prep_pepfdr_td.Rd @@ -10,7 +10,7 @@ prep_pepfdr_td( enzyme = "trypsin_p", nes_fdr_group = "base", fdr_group = "base", - ms1_offsets = 0, + is_notched = FALSE, only_notch_zero = TRUE ) } @@ -39,8 +39,9 @@ peptide FDR controls. The value is in one of \code{c("all", "base")}. The \code{base} corresponds to the modification group with the largest number of matches.} -\item{only_notch_zero}{Logical; if TRUE, use only data at \code{ms1_offsets = -0}.} +\item{is_notched}{Logical; is a search with MS1 notches or not.} + +\item{only_notch_zero}{Logical; if TRUE, use only data at the zero notch.} } \description{ Prepares target-decoy data. diff --git a/man/table_unimods.Rd b/man/table_unimods.Rd index 2a90ee3..bf5a492 100644 --- a/man/table_unimods.Rd +++ b/man/table_unimods.Rd @@ -10,8 +10,7 @@ table_unimods(out_nm = "~/mzion/unimods.txt") \item{out_nm}{A name to outputs.} } \description{ -For convenience summary of the \code{title}, \code{site} and -\code{position}. +For convenience summary of the \code{title}, \code{site} and \code{position}. } \examples{ \donttest{ @@ -19,7 +18,7 @@ library(mzion) ans <- table_unimods() -## TMT-6, -10 and -11 plexes +## TMT-6, -10 and -11 plexes # share the same Unimod entry at title "TMT6plex" # (the same chemistry at tag mass 229.162932 Da) ans[with(ans, title == "TMT6plex"), ] @@ -50,5 +49,7 @@ this_mod <- parse_unimod("Gln->pryo-Glu (N-term = Q)") } } \seealso{ -\link{find_unimod}, \link{parse_unimod}. +\link{find_unimod}, \link{parse_unimod}, \link{add_unimod}, + \link{remove_unimod}, \link{remove_unimod_title}, + \link{calc_unimod_compmass}. }