diff --git a/DESCRIPTION b/DESCRIPTION index 45eed38..10c8cec 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: mzion Type: Package Title: Proteomics Database Searches of Mass-spectrometrirc Data. -Version: 1.3.5 +Version: 1.3.5.1 Authors@R: person(given = "Qiang", family = "Zhang", diff --git a/R/mgfs.R b/R/mgfs.R index f6684a4..fe43ca5 100644 --- a/R/mgfs.R +++ b/R/mgfs.R @@ -312,48 +312,49 @@ readMGF <- function (filepath = NULL, filelist = NULL, out_path = NULL, message("Loading '", file, "'.") - out[[i]] <- read_mgf_chunks(filepath = filepath, - temp_dir = temp_dir, - raw_id = i, - topn_ms2ions = topn_ms2ions, - ms1_charge_range = ms1_charge_range, - ms1_scan_range = ms1_scan_range, - ret_range = ret_range, - min_mass = min_mass, - max_mass = max_mass, - ppm_ms1 = ppm_ms1, - ppm_ms2 = ppm_ms2, - min_ms2mass = min_ms2mass, - max_ms2mass = max_ms2mass, - mgf_cutmzs = mgf_cutmzs, - mgf_cutpercs = mgf_cutpercs, - type_mgf = type_mgf, - n_bf_begin = n_bf_begin, - n_spacer = n_spacer, - n_hdr = n_hdr, - n_to_pepmass = n_to_pepmass, - n_to_title = n_to_title, - n_to_scan = n_to_scan, - n_to_rt = n_to_rt, - n_to_charge = n_to_charge, - sep_ms2s = sep_ms2s, - nfields_ms2s = nfields_ms2s, - sep_pepmass = sep_pepmass, - nfields_pepmass = nfields_pepmass, - raw_file = raw_files[[i]], - tmt_reporter_lower = tmt_reporter_lower, - tmt_reporter_upper = tmt_reporter_upper, - exclude_reporter_region = exclude_reporter_region, - use_defpeaks = use_defpeaks, - deisotope_ms2 = deisotope_ms2, - max_ms2_charge = max_ms2_charge, - maxn_dia_precurs = maxn_dia_precurs, - maxn_mdda_precurs = maxn_mdda_precurs, - n_mdda_flanks = n_mdda_flanks, - ppm_ms1_deisotope = ppm_ms1_deisotope, - ppm_ms2_deisotope = ppm_ms2_deisotope, - quant = quant, - digits = digits) + out[[i]] <- read_mgf_chunks( + filepath = filepath, + temp_dir = temp_dir, + raw_id = i, + topn_ms2ions = topn_ms2ions, + ms1_charge_range = ms1_charge_range, + ms1_scan_range = ms1_scan_range, + ret_range = ret_range, + min_mass = min_mass, + max_mass = max_mass, + ppm_ms1 = ppm_ms1, + ppm_ms2 = ppm_ms2, + min_ms2mass = min_ms2mass, + max_ms2mass = max_ms2mass, + mgf_cutmzs = mgf_cutmzs, + mgf_cutpercs = mgf_cutpercs, + type_mgf = type_mgf, + n_bf_begin = n_bf_begin, + n_spacer = n_spacer, + n_hdr = n_hdr, + n_to_pepmass = n_to_pepmass, + n_to_title = n_to_title, + n_to_scan = n_to_scan, + n_to_rt = n_to_rt, + n_to_charge = n_to_charge, + sep_ms2s = sep_ms2s, + nfields_ms2s = nfields_ms2s, + sep_pepmass = sep_pepmass, + nfields_pepmass = nfields_pepmass, + raw_file = raw_files[[i]], + tmt_reporter_lower = tmt_reporter_lower, + tmt_reporter_upper = tmt_reporter_upper, + exclude_reporter_region = exclude_reporter_region, + use_defpeaks = use_defpeaks, + deisotope_ms2 = deisotope_ms2, + max_ms2_charge = max_ms2_charge, + maxn_dia_precurs = maxn_dia_precurs, + maxn_mdda_precurs = maxn_mdda_precurs, + n_mdda_flanks = n_mdda_flanks, + ppm_ms1_deisotope = ppm_ms1_deisotope, + ppm_ms2_deisotope = ppm_ms2_deisotope, + quant = quant, + digits = digits) local({ dir2 <- file.path(filepath, gsub("\\.[^.]*$", "", file)) @@ -550,8 +551,7 @@ read_mgf_chunks <- function (filepath, temp_dir, raw_id = 1L, ppm_ms1_deisotope = ppm_ms1_deisotope, ppm_ms2_deisotope = ppm_ms2_deisotope, quant = quant, - digits = digits - ) + digits = digits) } else { n_cores <- min(detect_cores(32L), len) @@ -576,47 +576,48 @@ read_mgf_chunks <- function (filepath, temp_dir, raw_id = 1L, envir = environment(mzion::matchMS) ) - out <- parallel::clusterApply(cl, file.path(temp_dir, filelist), - proc_mgf_chunks, - topn_ms2ions = topn_ms2ions, - ms1_charge_range = ms1_charge_range, - ms1_scan_range = ms1_scan_range, - ret_range = ret_range, - min_mass = min_mass, - max_mass = max_mass, - ppm_ms1 = ppm_ms1, - ppm_ms2 = ppm_ms2, - min_ms2mass = min_ms2mass, - max_ms2mass = max_ms2mass, - mgf_cutmzs = mgf_cutmzs, - mgf_cutpercs = mgf_cutpercs, - type_mgf = type_mgf, - n_bf_begin = n_bf_begin, - n_spacer = n_spacer, - n_hdr = n_hdr, - n_to_pepmass = n_to_pepmass, - n_to_title = n_to_title, - n_to_scan = n_to_scan, - n_to_rt = n_to_rt, - n_to_charge = n_to_charge, - sep_ms2s = sep_ms2s, - nfields_ms2s = nfields_ms2s, - sep_pepmass = sep_pepmass, - nfields_pepmass = nfields_pepmass, - raw_file = raw_file, - tmt_reporter_lower = tmt_reporter_lower, - tmt_reporter_upper = tmt_reporter_upper, - exclude_reporter_region = exclude_reporter_region, - deisotope_ms2 = deisotope_ms2, - max_ms2_charge = max_ms2_charge, - use_defpeaks = use_defpeaks, - maxn_dia_precurs = maxn_dia_precurs, - maxn_mdda_precurs = maxn_mdda_precurs, - n_mdda_flanks = n_mdda_flanks, - ppm_ms1_deisotope = ppm_ms1_deisotope, - ppm_ms2_deisotope = ppm_ms2_deisotope, - quant = quant, - digits = digits) + out <- parallel::clusterApply( + cl, file.path(temp_dir, filelist), + proc_mgf_chunks, + topn_ms2ions = topn_ms2ions, + ms1_charge_range = ms1_charge_range, + ms1_scan_range = ms1_scan_range, + ret_range = ret_range, + min_mass = min_mass, + max_mass = max_mass, + ppm_ms1 = ppm_ms1, + ppm_ms2 = ppm_ms2, + min_ms2mass = min_ms2mass, + max_ms2mass = max_ms2mass, + mgf_cutmzs = mgf_cutmzs, + mgf_cutpercs = mgf_cutpercs, + type_mgf = type_mgf, + n_bf_begin = n_bf_begin, + n_spacer = n_spacer, + n_hdr = n_hdr, + n_to_pepmass = n_to_pepmass, + n_to_title = n_to_title, + n_to_scan = n_to_scan, + n_to_rt = n_to_rt, + n_to_charge = n_to_charge, + sep_ms2s = sep_ms2s, + nfields_ms2s = nfields_ms2s, + sep_pepmass = sep_pepmass, + nfields_pepmass = nfields_pepmass, + raw_file = raw_file, + tmt_reporter_lower = tmt_reporter_lower, + tmt_reporter_upper = tmt_reporter_upper, + exclude_reporter_region = exclude_reporter_region, + deisotope_ms2 = deisotope_ms2, + max_ms2_charge = max_ms2_charge, + use_defpeaks = use_defpeaks, + maxn_dia_precurs = maxn_dia_precurs, + maxn_mdda_precurs = maxn_mdda_precurs, + n_mdda_flanks = n_mdda_flanks, + ppm_ms1_deisotope = ppm_ms1_deisotope, + ppm_ms2_deisotope = ppm_ms2_deisotope, + quant = quant, + digits = digits) parallel::stopCluster(cl) diff --git a/R/msmsmatches.R b/R/msmsmatches.R index 7a52b5c..3e9249e 100644 --- a/R/msmsmatches.R +++ b/R/msmsmatches.R @@ -1509,7 +1509,8 @@ matchMS <- function (out_path = "~/mzion/outs", nes_fdr_group = nes_fdr_group, out_path = out_path) - ans <- post_pepfdr(prob_cos, out_path) + ans <- post_pepfdr(prob_cos = prob_cos, maxn_mdda_precurs = maxn_mdda_precurs, + n_13c = n_13c, out_path = out_path) if (svm_reproc) { message("SVM reprocessing of peptide probabilities.") @@ -1543,7 +1544,8 @@ matchMS <- function (out_path = "~/mzion/outs", locmods = locmods, is_notched = is_notched, topn_mods_per_seq = topn_mods_per_seq, - topn_seqs_per_query = topn_seqs_per_query) + topn_seqs_per_query = topn_seqs_per_query, + maxn_mdda_precurs = maxn_mdda_precurs) } ## Protein accessions @@ -1608,6 +1610,10 @@ matchMS <- function (out_path = "~/mzion/outs", qs::qsave(df, file_protfdr, preset = "fast") } + # second removals after combining pep_mod_group's + df <- rm_dup13c(df, maxn_mdda_precurs = maxn_mdda_precurs, n_13c = n_13c) + + # add optional reporters df <- add_rptrs(df, quant, out_path) ## Clean-ups diff --git a/R/scores.R b/R/scores.R index dbaffbf..da1ea45 100644 --- a/R/scores.R +++ b/R/scores.R @@ -2214,7 +2214,9 @@ fill_probs <- function (nas, prob_cos, target_fdr = .01) #' #' @param prob_cos Probability cut-offs (in data frame). #' @param out_path An output path. -post_pepfdr <- function (prob_cos = NULL, out_path = NULL) +#' @inheritParams matchMS +post_pepfdr <- function (prob_cos = NULL, maxn_mdda_precurs = 1L, n_13c = 0L, + out_path = NULL) { if (is.null(prob_cos)) { file_prob <- file.path(out_path, "temp", "pep_probco.rds") @@ -2239,22 +2241,31 @@ post_pepfdr <- function (prob_cos = NULL, out_path = NULL) df <- qs::qread(file.path(out_path, "temp", x)) u <- df[, c("raw_file", "pep_scan_num", "pep_seq", "pep_ivmod", "pep_ms1_offset")] - df[!duplicated.data.frame(u), ] + df <- df[!duplicated.data.frame(u), ] }) names(td) <- ok_targets$idxes - ok <- lapply(td, nrow) > 0L - td <- dplyr::bind_rows(td[ok]) - rm(list = c("ok_targets", "files", "ok")) - - if (!nrow(td)) + + if (!any(ok <- unlist(lapply(td, nrow) > 0L))) stop("No PSM matches for scoring. Consider different search parameters.") + td <- td[ok] + rm(list = c("ok_targets", "files", "ok")) + + td <- lapply(td, rm_dup13c, maxn_mdda_precurs = maxn_mdda_precurs, n_13c = n_13c) + + td <- lapply(td, function (df) { + df <- df |> + dplyr::left_join(prob_cos, by = "pep_len") |> + dplyr::mutate(pep_issig = ifelse(pep_prob <= pep_prob_co, TRUE, FALSE)) + }) + + # may delay the combine... + td <- dplyr::bind_rows(td) + # Adjusted p-values (just to moderate pep_score) td <- td |> - dplyr::left_join(prob_cos, by = "pep_len") |> - dplyr::mutate(pep_issig = ifelse(pep_prob <= pep_prob_co, TRUE, FALSE), - pep_adjp = p.adjust(pep_prob, "BH")) + dplyr::mutate(pep_adjp = p.adjust(pep_prob, "BH")) adjp_cos <- lapply(prob_cos$pep_prob_co, function (x) { row <- which.min(abs(log10(td$pep_prob/x))) @@ -2787,7 +2798,8 @@ match_ex2th2 <- function (expt, theo, min_ms2mass = 115L, d = 1E-5) calc_peploc <- function (x = NULL, out_path = NULL, mod_indexes = NULL, is_notched = FALSE, locmods = c("Phospho (S)", "Phospho (T)", "Phospho (Y)"), - topn_mods_per_seq = 3L, topn_seqs_per_query = 3L) + topn_mods_per_seq = 3L, topn_seqs_per_query = 3L, + maxn_mdda_precurs = 1L) { message("Calculating peptide localization scores and deltas.") @@ -3112,17 +3124,27 @@ calcpeprank_3 <- function (x0) find_bestnotch <- function (x) { # favors pep_ms1_offset == 0 + cols <- names(x) x <- data.table::rbindlist(list(x[x$pep_ms1_offset == 0], x[x$pep_ms1_offset != 0]), use.names = FALSE) - x[, pep_rank0 := data.table::frank(-pep_score, ties.method = "first"), - by = list(query_id)] + + if ("pep_score" %in% cols) { + x[, pep_rank0 := data.table::frank(-pep_score, ties.method = "first"), + by = list(query_id)] + } + else if ("pep_prob" %in% cols) { + x[, pep_rank0 := data.table::frank(pep_prob, ties.method = "first"), + by = list(query_id)] + } + else { + return(x) + } # the best query at each query_id g <- x[, c("pep_rank0", "query_id", "pep_ms1_offset")] g <- g[g$pep_rank0 == 1L, ] g[["pep_rank0"]] <- NULL - g[["keep"]] <- TRUE g[, uid := paste(query_id, pep_ms1_offset, sep = ".")] g[["pep_ms1_offset"]] <- g[["query_id"]] <- NULL @@ -3521,3 +3543,93 @@ tsoutliers <- function (x, iterate = 2, lambda = NULL) } +#' Removes duplicated entries in 13C results. +#' +#' Removes "n_13c != 0" entries with better matches at "n_13c == 0". By each +#' module as the same combination of raw_file + pep_scan_num can also be +#' duplicated across modification modules. +#' +#' @param df A data frame. +#' @inheritParams matchMS +rm_dup13c <- function (df, maxn_mdda_precurs = 1L, n_13c = 0L) +{ + if (length(n_13c) == 1L && n_13c == 0L) # maxn_mdda_precurs < 1L || + return(df) + + esscols <- c("pep_ms1_offset", "pep_scan_num", "raw_file") + cols <- names(df) + + if (!all(esscols %in% cols)) + return(df) + + key <- if ("pep_score" %in% cols) + 1L + else if ("pep_prob" %in% cols) + 2L + else + 0L + + if (!key) + return(df) + + rows <- df$pep_ms1_offset == 0 + df0 <- df[rows, ] + df1 <- df[!rows, ] + rm(list = c("rows", "df")) + + # no need of `pep_isdecoy` + ids0 <- with(df0, paste(raw_file, gsub("\\.\\d+$", "", pep_scan_num), sep = ".")) + ids1 <- with(df1, paste(raw_file, gsub("\\.\\d+$", "", pep_scan_num), sep = ".")) + + oks <- ids1 %in% unique(ids0) + df1a <- df1[!oks, ] + df1b <- df1[oks, ] + rm(list = c("oks", "df1")) + + oks <- ids0 %in% unique(ids1) + df0a <- df0[!oks, ] + df0b <- df0[oks, ] + rm(list = c("ids0", "ids1", "oks", "df0")) + + dfb <- dplyr::bind_rows(df0b, df1b) + rm(list = c("df0b", "df1b")) + + if (nrow(dfb)) { + dfb$scan_nums. <- gsub("\\.\\d+$", "", dfb$pep_scan_num) + dfb$query_id <- with(dfb, paste(raw_file, scan_nums., sep = ".")) + dfb$scan_nums. <- NULL + + dfb <- split(dfb, dfb$query_id) + + # if best(x0) better than best(x1) and x0 is decoy -> still keep x0 + if (key == 1L) { + dfb <- lapply(dfb, function (x) { + rows <- x$pep_ms1_offset == 0 + x0 <- x[rows, ] + x1 <- x[!rows, ] + s0 <- max(x0$pep_score, na.rm = TRUE) + s1 <- max(x1$pep_score, na.rm = TRUE) + if (s1 - s0 > .23) x1 else x0 # see below; can be just 0 + }) + } + else { + dfb <- lapply(dfb, function (x) { + rows <- x$pep_ms1_offset == 0 + x0 <- x[rows, ] + x1 <- x[!rows, ] + s0 <- min(x0$pep_prob, na.rm = TRUE) + s1 <- min(x1$pep_prob, na.rm = TRUE) + if (s1 / s0 < .95) x1 else x0 # 10^-(.23/10); can be just 1 + }) + } + + dfb <- dplyr::bind_rows(dfb) + dfb$query_id <- NULL + df <- dplyr::bind_rows(df0a, df1a, dfb) + } + else { + df <- dplyr::bind_rows(df0a, df1a) + } +} + + diff --git a/man/calc_peploc.Rd b/man/calc_peploc.Rd index 1102908..3229156 100644 --- a/man/calc_peploc.Rd +++ b/man/calc_peploc.Rd @@ -11,7 +11,8 @@ calc_peploc( is_notched = FALSE, locmods = c("Phospho (S)", "Phospho (T)", "Phospho (Y)"), topn_mods_per_seq = 3L, - topn_seqs_per_query = 3L + topn_seqs_per_query = 3L, + maxn_mdda_precurs = 1L ) } \arguments{ @@ -51,6 +52,12 @@ modifications.} The same \code{MS query} refers to the identity in \code{MS scan number} and \code{MS raw file name}. Target and decoys matches are treated separately.} + +\item{maxn_mdda_precurs}{Maximum number of precursors for consideration in a +multi-precursor DDA scan. Note that at \code{maxn_mdda_precurs = 1}, it is +equivalent to DDA with precursor re-deisotoping. At \code{maxn_mdda_precurs += 0}, it is equivalent to DDA using the original monoisotopic masses and +charge states provided by a peak-picking algorithm (e.g., MSConvert).} } \description{ A score delta between the best and the second best. diff --git a/man/post_pepfdr.Rd b/man/post_pepfdr.Rd index 26ca528..c342df5 100644 --- a/man/post_pepfdr.Rd +++ b/man/post_pepfdr.Rd @@ -4,11 +4,25 @@ \alias{post_pepfdr} \title{Post processing after peptide FDR.} \usage{ -post_pepfdr(prob_cos = NULL, out_path = NULL) +post_pepfdr( + prob_cos = NULL, + maxn_mdda_precurs = 1L, + n_13c = 0L, + out_path = NULL +) } \arguments{ \item{prob_cos}{Probability cut-offs (in data frame).} +\item{maxn_mdda_precurs}{Maximum number of precursors for consideration in a +multi-precursor DDA scan. Note that at \code{maxn_mdda_precurs = 1}, it is +equivalent to DDA with precursor re-deisotoping. At \code{maxn_mdda_precurs += 0}, it is equivalent to DDA using the original monoisotopic masses and +charge states provided by a peak-picking algorithm (e.g., MSConvert).} + +\item{n_13c}{Number(s) of 13C off-sets in precursor masses, for example, over +the range of \code{-1:2}. The default is 0.} + \item{out_path}{An output path.} } \description{ diff --git a/man/rm_dup13c.Rd b/man/rm_dup13c.Rd new file mode 100644 index 0000000..38778d6 --- /dev/null +++ b/man/rm_dup13c.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/scores.R +\name{rm_dup13c} +\alias{rm_dup13c} +\title{Removes duplicated entries in 13C results.} +\usage{ +rm_dup13c(df, maxn_mdda_precurs = 1L, n_13c = 0L) +} +\arguments{ +\item{df}{A data frame.} + +\item{maxn_mdda_precurs}{Maximum number of precursors for consideration in a +multi-precursor DDA scan. Note that at \code{maxn_mdda_precurs = 1}, it is +equivalent to DDA with precursor re-deisotoping. At \code{maxn_mdda_precurs += 0}, it is equivalent to DDA using the original monoisotopic masses and +charge states provided by a peak-picking algorithm (e.g., MSConvert).} + +\item{n_13c}{Number(s) of 13C off-sets in precursor masses, for example, over +the range of \code{-1:2}. The default is 0.} +} +\description{ +Removes "n_13c != 0" entries with better matches at "n_13c == 0". By each +module as the same combination of raw_file + pep_scan_num can also be +duplicated across modification modules. +}