From b487e01352f503eab73fa48444a007948abeb00f Mon Sep 17 00:00:00 2001 From: Qiang Zhang <45829450+qzhang503@users.noreply.github.com> Date: Sun, 18 Jun 2023 20:40:29 -0500 Subject: [PATCH] 2023-06-18 A quick fix in searches with precursor mass off-sets (including the number of 13C off-sets). --- DESCRIPTION | 2 +- R/funs.R | 45 ++++++++++++------------ R/scores.R | 80 +++++++++++++++++++++++++------------------ man/prep_pepfdr_td.Rd | 7 +++- 4 files changed, 77 insertions(+), 57 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 458d8a1..4935ee0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: mzion Type: Package Title: Database Searches of Proteomic Mass-spectrometrirc Data -Version: 1.2.8 +Version: 1.2.8.1 Authors@R: person(given = "Qiang", family = "Zhang", diff --git a/R/funs.R b/R/funs.R index 6025f40..df672b4 100644 --- a/R/funs.R +++ b/R/funs.R @@ -1,6 +1,6 @@ # $bin_masses.R # [1] "bin_ms1masses" "binTheoSeqs_i" "binTheoSeqs2" "bin_theoseqs" "binTheoSeqs" -# [6] "find_ms1_cutpoints" "s_readRDS" +# [6] "find_ms1_cutpoints" "s_readRDS" "set_bin_ncores" # # $dispatch.R # [1] "find_pos_site" "contain_pos_site" "contain_termpos_any" "subset_by_prps" "subset_protntsite" @@ -42,11 +42,11 @@ # [29] "mmake_noenzpeps" "make_noenzpeps" "hmake_noenzpeps" "ms1masses_bare_noenz" # [33] "keep_n_misses" "exclude_n_misses" "str_exclude_count" "rm_char_in_nfirst" # [37] "rm_char_in_nlast" "adj_base_masses" "adj_anywhere_masses" "add_term_mass" -# [41] "ms1masses_bare" "add_ms1_13c" "ms1masses_noterm" "calcms1mass_noterm" -# [45] "calcms1mass_noterm_byprot" "calcms1mass_noterm_bypep" "distri_peps" "ct_counts" -# [49] "distri_fpeps" "roll_sum" "hsemipeps_byprots" "semipeps_byprots" -# [53] "calc_semipepmasses" "delta_ms1_a0_fnl1" "hms1_a0_vnl0_fnl1" "ms1_a0_vnl0_fnl1" -# [57] "hms1_a1_vnl0_fnl0" "ms1_a1_vnl0_fnl0" +# [41] "ms1masses_bare" "add_ms1_13c" "add_ms1_notches" "ms1masses_noterm" +# [45] "calcms1mass_noterm" "calcms1mass_noterm_byprot" "calcms1mass_noterm_bypep" "distri_peps" +# [49] "ct_counts" "distri_fpeps" "roll_sum" "hsemipeps_byprots" +# [53] "semipeps_byprots" "calc_semipepmasses" "delta_ms1_a0_fnl1" "hms1_a0_vnl0_fnl1" +# [57] "ms1_a0_vnl0_fnl1" "hms1_a1_vnl0_fnl0" "ms1_a1_vnl0_fnl0" # # $ms2_gen.R # [1] "gen_ms2ions_base" "gen_ms2ions_a0_vnl0_fnl1" "gen_ms2ions_a1_vnl0_fnl0" "calc_ms2ions_a1_vnl0_fnl0" @@ -54,8 +54,8 @@ # [9] "calc_ms2ions_a1_vnl1_fnl0" # # $ms2frames.R -# [1] "pair_mgftheo" "hms2match" "ms2match_all" "mframes_adv" "fuzzy_match_one" "fuzzy_match_one2" -# [7] "find_ms2_bypep" "search_mgf" "ms2match_one" "frames_adv" +# [1] "pair_mgftheos" "hpair_mgths" "hms2match" "ms2match_all" "mframes_adv" "find_ms2_bypep" "search_mgf" +# [8] "hms2match_one" "ms2match_one" "frames_adv" # # $msmsmatches.R # [1] "matchMS" "try_psmC2Q" "reproc_psmC" "psmC2Q" "post_psmC2Q" "check_tmt_pars" @@ -63,6 +63,7 @@ # # $msmsmatches2.R # [1] "ms2match" "reverse_peps_in_frame" "reverse_seqs" "calib_mgf" "calib_ms1" +# [6] "find_ms1_offsets" # # $mzion.R # character(0) @@ -74,10 +75,11 @@ # [1] "creat_folds" "cv_svm" "perco_svm" # # $quant2.R -# [1] "hcalc_tmtint" "calc_tmtint" "add_rptrs" "find_reporter_ints" "find_reporters_ppm" -# [6] "msub_protpep" "sub_protpep" "add_protacc2" "add_protacc" "hannot_decoys" -# [11] "groupProts" "map_pepprot" "collapse_sortpeps" "pcollapse_sortpeps" "chunksplit_spmat" -# [16] "find_group_breaks" "cut_proteinGroups" "sparseD_fourquad" "as_dist" "greedysetcover3" +# [1] "hcalc_tmtint" "calc_tmtint" "add_rptrs" "find_int_cols" "find_reporter_ints" +# [6] "find_reporters_ppm" "msub_protpep" "sub_protpep" "add_protacc2" "add_protacc" +# [11] "hannot_decoys" "groupProts" "map_pepprot" "collapse_sortpeps" "pcollapse_sortpeps" +# [16] "chunksplit_spmat" "find_group_breaks" "cut_proteinGroups" "sparseD_fourquad" "as_dist" +# [21] "greedysetcover3" # # $roadmaps.R # character(0) @@ -93,8 +95,9 @@ # [29] "fill_probco_nas" "fill_probs" "post_pepfdr" "calc_protfdr" # [33] "aggr_prot_es" "calc_protfdr_i" "fit_protfdr" " f" # [37] "find_ppm_outer_bycombi" "match_ex2th2" "calc_peploc" "calcpeprank_1" -# [41] "calcpeprank_2" "calcpeprank_3" "find_chunkbreaks" "findLocFracsDF" -# [45] "concatFracs" "na.interp" "is.constant" "tsoutliers" +# [41] "calcpeprank_2" "calcpeprank_3" "find_bestnotch" "find_chunkbreaks" +# [45] "findLocFracsDF" "concatFracs" "na.interp" "is.constant" +# [49] "tsoutliers" # # $silac.R # [1] "matchMS_silac_mix" "matchMS_par_groups" "add_fixedlab_masses" "matchMS_noenzyme" "combine_ion_matches" @@ -118,13 +121,13 @@ # [36] "finds_uniq_vec" "my_dataframe" "flatten_list" "calc_rev_ms2" "bind_dfs" # # $utils_os.R -# [1] "`names_pos<-`" "find_int_cols" "ins_cols_after" "add_cols_at" -# [5] "replace_cols_at" "reloc_col_after" "reloc_col_after_last" "reloc_col_after_first" -# [9] "reloc_col_before" "reloc_col_before_last" "reloc_col_before_first" "find_preceding_colnm" -# [13] "recur_flatten" "chunksplit" "chunksplitLB" "find_dir" -# [17] "create_dir" "save_call2" "find_callarg_vals" "match_calltime" -# [21] "delete_files" "find_ms1_times" "get_globalvar" "load_cache_info" -# [25] "is_nulllist" "add_nulllist" +# [1] "`names_pos<-`" "ins_cols_after" "add_cols_at" "replace_cols_at" +# [5] "reloc_col_after" "reloc_col_after_last" "reloc_col_after_first" "reloc_col_before" +# [9] "reloc_col_before_last" "reloc_col_before_first" "find_preceding_colnm" "recur_flatten" +# [13] "chunksplit" "chunksplitLB" "find_dir" "create_dir" +# [17] "save_call2" "find_callarg_vals" "match_calltime" "delete_files" +# [21] "find_ms1_times" "get_globalvar" "load_cache_info" "is_nulllist" +# [25] "add_nulllist" # # $utils_ui.R # [1] "calc_monopeptide" "calc_monopep" "check_aaseq" "calc_ms2ionseries" "calc_ms2ions" "unique_mvmods" diff --git a/R/scores.R b/R/scores.R index dd5ddeb..310c3c9 100644 --- a/R/scores.R +++ b/R/scores.R @@ -1479,7 +1479,7 @@ probco_bypeplen <- function (len, td, fdr_type = "protein", target_fdr = 0.01, .01 newx <- seq(min_x, max_x, by = step) - newy <- predict(fit, data.frame(x = newx)) %>% `names<-`(newx) + newy <- predict(fit, data.frame(x = newx)) |> `names<-`(newx) # NA if not existed score_co2 <- as.numeric(names(which(newy <= target_fdr)[1])) @@ -1651,11 +1651,14 @@ find_probco_valley <- function (prob_cos, guess = 12L) #' Prepares target-decoy data. -#' +#' #' @param td A data frame of targets and decoys (for Percolator). +#' @param only_notch_zero Logical; if TRUE, use only data at \code{ms1_offsets = +#' 0}. #' @inheritParams matchMS prep_pepfdr_td <- function (td = NULL, out_path, enzyme = "trypsin_p", - nes_fdr_group = "base", fdr_group = "base") + nes_fdr_group = "base", fdr_group = "base", + ms1_offsets = 0, only_notch_zero = TRUE) { files <- list.files(path = file.path(out_path, "temp"), pattern = "^pepscores_", full.names = TRUE) @@ -1669,6 +1672,9 @@ prep_pepfdr_td <- function (td = NULL, out_path, enzyme = "trypsin_p", td <- dplyr::bind_rows(td) } + if (length(ms1_offsets) > 1L && only_notch_zero) + td <- td[with(td, abs(pep_ms1_offset) <= 1e-4), ] + cts <- dplyr::count(dplyr::group_by(td, "pep_mod_group"), pep_mod_group) max_i <- cts$pep_mod_group[which.max(cts$n)[[1]]] top3s <- cts$pep_mod_group[which_topx2(cts$n, 3)[1:3]] @@ -1788,12 +1794,12 @@ calc_pepfdr <- function (target_fdr = .01, fdr_type = "protein", { message("Calculating peptide FDR.") - # may exclude non-zero ms1_offsets... - td <- prep_pepfdr_td(out_path = out_path, enzyme = enzyme, nes_fdr_group = nes_fdr_group, - fdr_group = fdr_group) + fdr_group = fdr_group, + ms1_offsets = ms1_offsets, + only_notch_zero = TRUE) # back-compatibility to new column keys (e.g. scan_num -> pep_scan_num) if (!"pep_scan_num" %in% names(td)) @@ -2219,7 +2225,7 @@ calc_protfdr <- function (df = NULL, target_fdr = .01, max_protscores_co = Inf, message("Calculating peptide-protein FDR.") # score cut-offs as a function of prot_n_pep - max_n_pep <- max(df$prot_n_pep, na.rm = TRUE) + max_n_pep <- max(df$prot_n_pep, na.rm = TRUE) all_n_peps <- unique(df$prot_n_pep) # protein enrichment score cut-offs at each `prot_n_pep` @@ -2296,10 +2302,10 @@ calc_protfdr_i <- function (td, target_fdr = .01, max_protnpep_co = 10L, if (td[["prot_n_pep"]][[1]] > max_protnpep_co) return(0L) - td <- td %>% - dplyr::group_by(prot_acc, pep_seq) %>% - dplyr::arrange(-pep_score) %>% - dplyr::filter(row_number() == 1L) %>% + td <- td |> + dplyr::group_by(prot_acc, pep_seq) |> + dplyr::arrange(-pep_score) |> + dplyr::filter(row_number() == 1L) |> dplyr::ungroup() ## no decoys @@ -2316,8 +2322,8 @@ calc_protfdr_i <- function (td, target_fdr = .01, max_protnpep_co = 10L, return(1L) # (NOT to use `local` for hard `return (0L)`) - prot_scores <- td %>% - dplyr::mutate(prot_es = pep_score - pep_score_co) %>% + prot_scores <- td |> + dplyr::mutate(prot_es = pep_score - pep_score_co) |> dplyr::group_by(prot_acc) prot_scores <- switch(method_prot_es_co, @@ -2345,8 +2351,8 @@ calc_protfdr_i <- function (td, target_fdr = .01, max_protnpep_co = 10L, # keep one unique `prot_es` (by roughly assuming `prot_es` is an indicator for # the redundancy). - prot_scores <- prot_scores %>% - dplyr::mutate(prot_es = round(prot_es, digits = 5L)) %>% + prot_scores <- prot_scores |> + dplyr::mutate(prot_es = round(prot_es, digits = 5L)) |> dplyr::filter(!duplicated(prot_es)) # Removes the last three in case only one or two decoy spikes near the end @@ -2358,8 +2364,8 @@ calc_protfdr_i <- function (td, target_fdr = .01, max_protnpep_co = 10L, # -NP_001157012 54.4 # NP_077772 62.7 - prot_scores <- prot_scores %>% - dplyr::arrange(prot_es) %>% + prot_scores <- prot_scores |> + dplyr::arrange(prot_es) |> dplyr::filter(row_number() > n_burnin) # no decoys @@ -2367,11 +2373,11 @@ calc_protfdr_i <- function (td, target_fdr = .01, max_protnpep_co = 10L, return (0L) # both targets and decoys - td <- td %>% - dplyr::right_join(prot_scores, by = "prot_acc") %>% - dplyr::arrange(-prot_es) %>% - dplyr::mutate(total = row_number()) %>% - dplyr::mutate(decoy = cumsum(pep_isdecoy)) %>% + td <- td |> + dplyr::right_join(prot_scores, by = "prot_acc") |> + dplyr::arrange(-prot_es) |> + dplyr::mutate(total = row_number()) |> + dplyr::mutate(decoy = cumsum(pep_isdecoy)) |> dplyr::mutate(fdr = decoy/total) rm(list = "prot_scores") @@ -2404,11 +2410,11 @@ calc_protfdr_i <- function (td, target_fdr = .01, max_protnpep_co = 10L, min_score <- min(data$x, na.rm = TRUE) max_score <- max(data$x, na.rm = TRUE) newx <- min_score:max_score - newy <- predict(fit, data.frame(x = newx)) %>% `names<-`(newx) + newy <- predict(fit, data.frame(x = newx)) |> `names<-`(newx) # NA if not existed - score_co2 <- which(newy <= target_fdr)[1] %>% - names() %>% + score_co2 <- which(newy <= target_fdr)[1] |> + names() |> as.numeric() local({ @@ -2493,9 +2499,9 @@ fit_protfdr <- function (vec, max_n_pep = 1000L, out_path) NA } else { - fits %>% - .[!is.na(.)] %>% - .[[length(.)]] + # fits %>% .[!is.na(.)] %>% .[[length(.)]] + fits <- fits[!is.na(fits)] + fits <- fits[[length(fits)]] } } @@ -2524,7 +2530,7 @@ fit_protfdr <- function (vec, max_n_pep = 1000L, out_path) out <- data.frame( prot_n_pep = newx, - prot_score_co = newy) %>% + prot_score_co = newy) |> dplyr::mutate(prot_score_co = ifelse(prot_n_pep >= elbow, 0, prot_score_co)) } else { @@ -2538,7 +2544,7 @@ fit_protfdr <- function (vec, max_n_pep = 1000L, out_path) out <- data.frame( prot_n_pep = newx, - prot_score_co = newy) %>% + prot_score_co = newy) |> dplyr::mutate(prot_score_co = ifelse(prot_n_pep >= elbow, 0, prot_score_co)) } @@ -3051,15 +3057,21 @@ find_bestnotch <- function (x) x[, pep_rank0 := data.table::frank(-pep_score, ties.method = "min"), by = list(query_id)] + # the best query at each query_id g <- x[, c("pep_rank0", "query_id", "pep_ms1_offset")] g <- g[g$pep_rank0 == 1L, ] g[["pep_rank0"]] <- NULL + g[, id := seq_len(.N), by = query_id] + g <- g[g$id == 1L, ] + g[["id"]] <- NULL - # pep_ms1_offset == 0 goes first? - if (length(us <- unique(g$pep_ms1_offset)) > 1L) { - g <- g[g$pep_ms1_offset == us[[1]], ] + if (FALSE) { + if (length(us <- unique(g$pep_ms1_offset)) > 1L) { + i <- if (length(ok <- which(abs(us) < 1e-4)) == 1L) ok else 1L + g <- g[g$pep_ms1_offset == us[[i]], ] + } } - + g[["keep"]] <- TRUE g[, uid := paste(query_id, pep_ms1_offset, sep = ".")] g[["pep_ms1_offset"]] <- g[["query_id"]] <- NULL diff --git a/man/prep_pepfdr_td.Rd b/man/prep_pepfdr_td.Rd index 59ab9af..71faa51 100644 --- a/man/prep_pepfdr_td.Rd +++ b/man/prep_pepfdr_td.Rd @@ -9,7 +9,9 @@ prep_pepfdr_td( out_path, enzyme = "trypsin_p", nes_fdr_group = "base", - fdr_group = "base" + fdr_group = "base", + ms1_offsets = 0, + only_notch_zero = TRUE ) } \arguments{ @@ -36,6 +38,9 @@ parameter \code{fdr_group}.} peptide FDR controls. The value is in one of \code{c("all", "base")}. The \code{base} corresponds to the modification group with the largest number of matches.} + +\item{only_notch_zero}{Logical; if TRUE, use only data at \code{ms1_offsets = +0}.} } \description{ Prepares target-decoy data.