2023-06-18

A quick fix in searches with precursor mass off-sets (including the number of 13C off-sets).
qzhang503 · Jun 19, 2023 · b487e01 · b487e01
1 parent 2be455b
commit b487e01
Show file tree

Hide file tree

Showing 4 changed files with 77 additions and 57 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: mzion
 Type: Package
 Title: Database Searches of Proteomic Mass-spectrometrirc Data
-Version: 1.2.8
+Version: 1.2.8.1
 Authors@R: 
     person(given = "Qiang",
            family = "Zhang",

diff --git a/R/funs.R b/R/funs.R
@@ -1,6 +1,6 @@
 # $bin_masses.R
 # [1] "bin_ms1masses"      "binTheoSeqs_i"      "binTheoSeqs2"       "bin_theoseqs"       "binTheoSeqs"       
-# [6] "find_ms1_cutpoints" "s_readRDS"         
+# [6] "find_ms1_cutpoints" "s_readRDS"          "set_bin_ncores"    
 # 
 # $dispatch.R
 #  [1] "find_pos_site"       "contain_pos_site"    "contain_termpos_any" "subset_by_prps"      "subset_protntsite"  
@@ -42,27 +42,28 @@
 # [29] "mmake_noenzpeps"           "make_noenzpeps"            "hmake_noenzpeps"           "ms1masses_bare_noenz"     
 # [33] "keep_n_misses"             "exclude_n_misses"          "str_exclude_count"         "rm_char_in_nfirst"        
 # [37] "rm_char_in_nlast"          "adj_base_masses"           "adj_anywhere_masses"       "add_term_mass"            
-# [41] "ms1masses_bare"            "add_ms1_13c"               "ms1masses_noterm"          "calcms1mass_noterm"       
-# [45] "calcms1mass_noterm_byprot" "calcms1mass_noterm_bypep"  "distri_peps"               "ct_counts"                
-# [49] "distri_fpeps"              "roll_sum"                  "hsemipeps_byprots"         "semipeps_byprots"         
-# [53] "calc_semipepmasses"        "delta_ms1_a0_fnl1"         "hms1_a0_vnl0_fnl1"         "ms1_a0_vnl0_fnl1"         
-# [57] "hms1_a1_vnl0_fnl0"         "ms1_a1_vnl0_fnl0"         
+# [41] "ms1masses_bare"            "add_ms1_13c"               "add_ms1_notches"           "ms1masses_noterm"         
+# [45] "calcms1mass_noterm"        "calcms1mass_noterm_byprot" "calcms1mass_noterm_bypep"  "distri_peps"              
+# [49] "ct_counts"                 "distri_fpeps"              "roll_sum"                  "hsemipeps_byprots"        
+# [53] "semipeps_byprots"          "calc_semipepmasses"        "delta_ms1_a0_fnl1"         "hms1_a0_vnl0_fnl1"        
+# [57] "ms1_a0_vnl0_fnl1"          "hms1_a1_vnl0_fnl0"         "ms1_a1_vnl0_fnl0"         
 # 
 # $ms2_gen.R
 # [1] "gen_ms2ions_base"          "gen_ms2ions_a0_vnl0_fnl1"  "gen_ms2ions_a1_vnl0_fnl0"  "calc_ms2ions_a1_vnl0_fnl0"
 # [5] "check_ms1_mass_vmods"      "gen_ms2ions_a1_vnl0_fnl1"  "calc_ms2ions_a1_vnl0_fnl1" "gen_ms2ions_a1_vnl1_fnl0" 
 # [9] "calc_ms2ions_a1_vnl1_fnl0"
 # 
 # $ms2frames.R
-#  [1] "pair_mgftheo"     "hms2match"        "ms2match_all"     "mframes_adv"      "fuzzy_match_one"  "fuzzy_match_one2"
-#  [7] "find_ms2_bypep"   "search_mgf"       "ms2match_one"     "frames_adv"      
+#  [1] "pair_mgftheos"  "hpair_mgths"    "hms2match"      "ms2match_all"   "mframes_adv"    "find_ms2_bypep" "search_mgf"    
+#  [8] "hms2match_one"  "ms2match_one"   "frames_adv"    
 # 
 # $msmsmatches.R
 #  [1] "matchMS"         "try_psmC2Q"      "reproc_psmC"     "psmC2Q"          "post_psmC2Q"     "check_tmt_pars" 
 #  [7] "checkMGF"        "check_locmods"   "map_raw_n_scan"  "check_fdr_group"
 # 
 # $msmsmatches2.R
 # [1] "ms2match"              "reverse_peps_in_frame" "reverse_seqs"          "calib_mgf"             "calib_ms1"            
+# [6] "find_ms1_offsets"     
 # 
 # $mzion.R
 # character(0)
@@ -74,10 +75,11 @@
 # [1] "creat_folds" "cv_svm"      "perco_svm"  
 # 
 # $quant2.R
-#  [1] "hcalc_tmtint"       "calc_tmtint"        "add_rptrs"          "find_reporter_ints" "find_reporters_ppm"
-#  [6] "msub_protpep"       "sub_protpep"        "add_protacc2"       "add_protacc"        "hannot_decoys"     
-# [11] "groupProts"         "map_pepprot"        "collapse_sortpeps"  "pcollapse_sortpeps" "chunksplit_spmat"  
-# [16] "find_group_breaks"  "cut_proteinGroups"  "sparseD_fourquad"   "as_dist"            "greedysetcover3"   
+#  [1] "hcalc_tmtint"       "calc_tmtint"        "add_rptrs"          "find_int_cols"      "find_reporter_ints"
+#  [6] "find_reporters_ppm" "msub_protpep"       "sub_protpep"        "add_protacc2"       "add_protacc"       
+# [11] "hannot_decoys"      "groupProts"         "map_pepprot"        "collapse_sortpeps"  "pcollapse_sortpeps"
+# [16] "chunksplit_spmat"   "find_group_breaks"  "cut_proteinGroups"  "sparseD_fourquad"   "as_dist"           
+# [21] "greedysetcover3"   
 # 
 # $roadmaps.R
 # character(0)
@@ -93,8 +95,9 @@
 # [29] "fill_probco_nas"        "fill_probs"             "post_pepfdr"            "calc_protfdr"          
 # [33] "aggr_prot_es"           "calc_protfdr_i"         "fit_protfdr"            "  f"                   
 # [37] "find_ppm_outer_bycombi" "match_ex2th2"           "calc_peploc"            "calcpeprank_1"         
-# [41] "calcpeprank_2"          "calcpeprank_3"          "find_chunkbreaks"       "findLocFracsDF"        
-# [45] "concatFracs"            "na.interp"              "is.constant"            "tsoutliers"            
+# [41] "calcpeprank_2"          "calcpeprank_3"          "find_bestnotch"         "find_chunkbreaks"      
+# [45] "findLocFracsDF"         "concatFracs"            "na.interp"              "is.constant"           
+# [49] "tsoutliers"            
 # 
 # $silac.R
 # [1] "matchMS_silac_mix"   "matchMS_par_groups"  "add_fixedlab_masses" "matchMS_noenzyme"    "combine_ion_matches"
@@ -118,13 +121,13 @@
 # [36] "finds_uniq_vec"        "my_dataframe"          "flatten_list"          "calc_rev_ms2"          "bind_dfs"             
 # 
 # $utils_os.R
-#  [1] "`names_pos<-`"          "find_int_cols"          "ins_cols_after"         "add_cols_at"           
-#  [5] "replace_cols_at"        "reloc_col_after"        "reloc_col_after_last"   "reloc_col_after_first" 
-#  [9] "reloc_col_before"       "reloc_col_before_last"  "reloc_col_before_first" "find_preceding_colnm"  
-# [13] "recur_flatten"          "chunksplit"             "chunksplitLB"           "find_dir"              
-# [17] "create_dir"             "save_call2"             "find_callarg_vals"      "match_calltime"        
-# [21] "delete_files"           "find_ms1_times"         "get_globalvar"          "load_cache_info"       
-# [25] "is_nulllist"            "add_nulllist"          
+#  [1] "`names_pos<-`"          "ins_cols_after"         "add_cols_at"            "replace_cols_at"       
+#  [5] "reloc_col_after"        "reloc_col_after_last"   "reloc_col_after_first"  "reloc_col_before"      
+#  [9] "reloc_col_before_last"  "reloc_col_before_first" "find_preceding_colnm"   "recur_flatten"         
+# [13] "chunksplit"             "chunksplitLB"           "find_dir"               "create_dir"            
+# [17] "save_call2"             "find_callarg_vals"      "match_calltime"         "delete_files"          
+# [21] "find_ms1_times"         "get_globalvar"          "load_cache_info"        "is_nulllist"           
+# [25] "add_nulllist"          
 # 
 # $utils_ui.R
 # [1] "calc_monopeptide"  "calc_monopep"      "check_aaseq"       "calc_ms2ionseries" "calc_ms2ions"      "unique_mvmods"    

diff --git a/R/scores.R b/R/scores.R
@@ -1479,7 +1479,7 @@ probco_bypeplen <- function (len, td, fdr_type = "protein", target_fdr = 0.01,
         .01
 
       newx <- seq(min_x, max_x, by = step)
-      newy <- predict(fit, data.frame(x = newx)) %>% `names<-`(newx)
+      newy <- predict(fit, data.frame(x = newx)) |> `names<-`(newx)
 
       # NA if not existed
       score_co2 <- as.numeric(names(which(newy <= target_fdr)[1]))
@@ -1651,11 +1651,14 @@ find_probco_valley <- function (prob_cos, guess = 12L)
 
 
 #' Prepares target-decoy data.
-#' 
+#'
 #' @param td A data frame of targets and decoys (for Percolator).
+#' @param only_notch_zero Logical; if TRUE, use only data at \code{ms1_offsets =
+#'   0}.
 #' @inheritParams matchMS
 prep_pepfdr_td <- function (td = NULL, out_path, enzyme = "trypsin_p", 
-                            nes_fdr_group = "base", fdr_group = "base")
+                            nes_fdr_group = "base", fdr_group = "base", 
+                            ms1_offsets = 0, only_notch_zero = TRUE)
 {
   files <- list.files(path = file.path(out_path, "temp"), 
                       pattern = "^pepscores_", full.names = TRUE)
@@ -1669,6 +1672,9 @@ prep_pepfdr_td <- function (td = NULL, out_path, enzyme = "trypsin_p",
     td <- dplyr::bind_rows(td)
   }
 
+  if (length(ms1_offsets) > 1L && only_notch_zero)
+    td <- td[with(td, abs(pep_ms1_offset) <= 1e-4), ]
+
   cts   <- dplyr::count(dplyr::group_by(td, "pep_mod_group"), pep_mod_group)
   max_i <- cts$pep_mod_group[which.max(cts$n)[[1]]]
   top3s <- cts$pep_mod_group[which_topx2(cts$n, 3)[1:3]]
@@ -1788,12 +1794,12 @@ calc_pepfdr <- function (target_fdr = .01, fdr_type = "protein",
 {
   message("Calculating peptide FDR.")
 
-  # may exclude non-zero ms1_offsets...
-
   td <- prep_pepfdr_td(out_path = out_path, 
                        enzyme = enzyme, 
                        nes_fdr_group = nes_fdr_group, 
-                       fdr_group = fdr_group)
+                       fdr_group = fdr_group, 
+                       ms1_offsets = ms1_offsets, 
+                       only_notch_zero = TRUE)
 
   # back-compatibility to new column keys (e.g. scan_num -> pep_scan_num)
   if (!"pep_scan_num" %in% names(td)) 
@@ -2219,7 +2225,7 @@ calc_protfdr <- function (df = NULL, target_fdr = .01, max_protscores_co = Inf,
   message("Calculating peptide-protein FDR.")
 
   # score cut-offs as a function of prot_n_pep
-  max_n_pep <- max(df$prot_n_pep, na.rm = TRUE)
+  max_n_pep  <- max(df$prot_n_pep, na.rm = TRUE)
   all_n_peps <- unique(df$prot_n_pep)
 
   # protein enrichment score cut-offs at each `prot_n_pep`
@@ -2296,10 +2302,10 @@ calc_protfdr_i <- function (td, target_fdr = .01, max_protnpep_co = 10L,
   if (td[["prot_n_pep"]][[1]] > max_protnpep_co)
     return(0L)
 
-  td <- td %>% 
-    dplyr::group_by(prot_acc, pep_seq) %>% 
-    dplyr::arrange(-pep_score) %>% 
-    dplyr::filter(row_number() == 1L) %>% 
+  td <- td |> 
+    dplyr::group_by(prot_acc, pep_seq) |> 
+    dplyr::arrange(-pep_score) |> 
+    dplyr::filter(row_number() == 1L) |> 
     dplyr::ungroup()
 
   ## no decoys
@@ -2316,8 +2322,8 @@ calc_protfdr_i <- function (td, target_fdr = .01, max_protnpep_co = 10L,
     return(1L)
 
   # (NOT to use `local` for hard `return (0L)`)
-  prot_scores <- td %>% 
-    dplyr::mutate(prot_es = pep_score - pep_score_co) %>% 
+  prot_scores <- td |> 
+    dplyr::mutate(prot_es = pep_score - pep_score_co) |> 
     dplyr::group_by(prot_acc) 
 
   prot_scores <- switch(method_prot_es_co, 
@@ -2345,8 +2351,8 @@ calc_protfdr_i <- function (td, target_fdr = .01, max_protnpep_co = 10L,
   # keep one unique `prot_es` (by roughly assuming `prot_es` is an indicator for 
   # the redundancy). 
 
-  prot_scores <- prot_scores %>% 
-    dplyr::mutate(prot_es = round(prot_es, digits = 5L)) %>% 
+  prot_scores <- prot_scores |> 
+    dplyr::mutate(prot_es = round(prot_es, digits = 5L)) |> 
     dplyr::filter(!duplicated(prot_es))
 
   # Removes the last three in case only one or two decoy spikes near the end 
@@ -2358,20 +2364,20 @@ calc_protfdr_i <- function (td, target_fdr = .01, max_protnpep_co = 10L,
   # -NP_001157012    54.4
   # NP_077772        62.7
 
-  prot_scores <- prot_scores %>% 
-    dplyr::arrange(prot_es) %>% 
+  prot_scores <- prot_scores |> 
+    dplyr::arrange(prot_es) |> 
     dplyr::filter(row_number() > n_burnin)
 
   # no decoys
   if (!any(grepl("^-", prot_scores$prot_acc))) 
     return (0L)
 
   # both targets and decoys
-  td <- td %>% 
-    dplyr::right_join(prot_scores, by = "prot_acc") %>% 
-    dplyr::arrange(-prot_es) %>% 
-    dplyr::mutate(total = row_number()) %>% 
-    dplyr::mutate(decoy = cumsum(pep_isdecoy)) %>% 
+  td <- td |> 
+    dplyr::right_join(prot_scores, by = "prot_acc") |> 
+    dplyr::arrange(-prot_es) |> 
+    dplyr::mutate(total = row_number()) |> 
+    dplyr::mutate(decoy = cumsum(pep_isdecoy)) |> 
     dplyr::mutate(fdr = decoy/total)
 
   rm(list = "prot_scores")
@@ -2404,11 +2410,11 @@ calc_protfdr_i <- function (td, target_fdr = .01, max_protnpep_co = 10L,
         min_score <- min(data$x, na.rm = TRUE)
         max_score <- max(data$x, na.rm = TRUE)
         newx <- min_score:max_score
-        newy <- predict(fit, data.frame(x = newx)) %>% `names<-`(newx)
+        newy <- predict(fit, data.frame(x = newx)) |> `names<-`(newx)
 
         # NA if not existed
-        score_co2 <- which(newy <= target_fdr)[1] %>% 
-          names() %>% 
+        score_co2 <- which(newy <= target_fdr)[1] |> 
+          names() |> 
           as.numeric()
 
         local({
@@ -2493,9 +2499,9 @@ fit_protfdr <- function (vec, max_n_pep = 1000L, out_path)
       NA
     } 
     else {
-      fits %>% 
-        .[!is.na(.)] %>% 
-        .[[length(.)]]
+      # fits %>% .[!is.na(.)] %>% .[[length(.)]]
+      fits <- fits[!is.na(fits)]
+      fits <- fits[[length(fits)]]
     }
   }
 
@@ -2524,7 +2530,7 @@ fit_protfdr <- function (vec, max_n_pep = 1000L, out_path)
 
     out <- data.frame(
       prot_n_pep = newx, 
-      prot_score_co = newy) %>% 
+      prot_score_co = newy) |> 
       dplyr::mutate(prot_score_co = ifelse(prot_n_pep >= elbow, 0, prot_score_co))
   } 
   else {
@@ -2538,7 +2544,7 @@ fit_protfdr <- function (vec, max_n_pep = 1000L, out_path)
 
     out <- data.frame(
       prot_n_pep = newx, 
-      prot_score_co = newy) %>% 
+      prot_score_co = newy) |> 
       dplyr::mutate(prot_score_co = ifelse(prot_n_pep >= elbow, 0, prot_score_co))
 
   }
@@ -3051,15 +3057,21 @@ find_bestnotch <- function (x)
   x[, pep_rank0 := data.table::frank(-pep_score, ties.method = "min"), 
     by = list(query_id)]
 
+  # the best query at each query_id
   g <- x[, c("pep_rank0", "query_id", "pep_ms1_offset")]
   g <- g[g$pep_rank0 == 1L, ]
   g[["pep_rank0"]] <- NULL
+  g[, id := seq_len(.N), by = query_id]
+  g <- g[g$id == 1L, ]
+  g[["id"]] <- NULL
 
-  # pep_ms1_offset == 0 goes first?
-  if (length(us <- unique(g$pep_ms1_offset)) > 1L) {
-    g <- g[g$pep_ms1_offset == us[[1]], ]
+  if (FALSE) {
+    if (length(us <- unique(g$pep_ms1_offset)) > 1L) {
+      i <- if (length(ok <- which(abs(us) < 1e-4)) == 1L) ok else 1L
+      g <- g[g$pep_ms1_offset == us[[i]], ]
+    }
   }
-  
+
   g[["keep"]] <- TRUE
   g[, uid := paste(query_id, pep_ms1_offset, sep = ".")]
   g[["pep_ms1_offset"]] <- g[["query_id"]] <- NULL

diff --git a/man/prep_pepfdr_td.Rd b/man/prep_pepfdr_td.Rd