Skip to content

Commit

Permalink
2023-06-18
Browse files Browse the repository at this point in the history
A quick fix in searches with precursor mass off-sets (including the number of 13C off-sets).
  • Loading branch information
qzhang503 committed Jun 19, 2023
1 parent 2be455b commit b487e01
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 57 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: mzion
Type: Package
Title: Database Searches of Proteomic Mass-spectrometrirc Data
Version: 1.2.8
Version: 1.2.8.1
Authors@R:
person(given = "Qiang",
family = "Zhang",
Expand Down
45 changes: 24 additions & 21 deletions R/funs.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# $bin_masses.R
# [1] "bin_ms1masses" "binTheoSeqs_i" "binTheoSeqs2" "bin_theoseqs" "binTheoSeqs"
# [6] "find_ms1_cutpoints" "s_readRDS"
# [6] "find_ms1_cutpoints" "s_readRDS" "set_bin_ncores"
#
# $dispatch.R
# [1] "find_pos_site" "contain_pos_site" "contain_termpos_any" "subset_by_prps" "subset_protntsite"
Expand Down Expand Up @@ -42,27 +42,28 @@
# [29] "mmake_noenzpeps" "make_noenzpeps" "hmake_noenzpeps" "ms1masses_bare_noenz"
# [33] "keep_n_misses" "exclude_n_misses" "str_exclude_count" "rm_char_in_nfirst"
# [37] "rm_char_in_nlast" "adj_base_masses" "adj_anywhere_masses" "add_term_mass"
# [41] "ms1masses_bare" "add_ms1_13c" "ms1masses_noterm" "calcms1mass_noterm"
# [45] "calcms1mass_noterm_byprot" "calcms1mass_noterm_bypep" "distri_peps" "ct_counts"
# [49] "distri_fpeps" "roll_sum" "hsemipeps_byprots" "semipeps_byprots"
# [53] "calc_semipepmasses" "delta_ms1_a0_fnl1" "hms1_a0_vnl0_fnl1" "ms1_a0_vnl0_fnl1"
# [57] "hms1_a1_vnl0_fnl0" "ms1_a1_vnl0_fnl0"
# [41] "ms1masses_bare" "add_ms1_13c" "add_ms1_notches" "ms1masses_noterm"
# [45] "calcms1mass_noterm" "calcms1mass_noterm_byprot" "calcms1mass_noterm_bypep" "distri_peps"
# [49] "ct_counts" "distri_fpeps" "roll_sum" "hsemipeps_byprots"
# [53] "semipeps_byprots" "calc_semipepmasses" "delta_ms1_a0_fnl1" "hms1_a0_vnl0_fnl1"
# [57] "ms1_a0_vnl0_fnl1" "hms1_a1_vnl0_fnl0" "ms1_a1_vnl0_fnl0"
#
# $ms2_gen.R
# [1] "gen_ms2ions_base" "gen_ms2ions_a0_vnl0_fnl1" "gen_ms2ions_a1_vnl0_fnl0" "calc_ms2ions_a1_vnl0_fnl0"
# [5] "check_ms1_mass_vmods" "gen_ms2ions_a1_vnl0_fnl1" "calc_ms2ions_a1_vnl0_fnl1" "gen_ms2ions_a1_vnl1_fnl0"
# [9] "calc_ms2ions_a1_vnl1_fnl0"
#
# $ms2frames.R
# [1] "pair_mgftheo" "hms2match" "ms2match_all" "mframes_adv" "fuzzy_match_one" "fuzzy_match_one2"
# [7] "find_ms2_bypep" "search_mgf" "ms2match_one" "frames_adv"
# [1] "pair_mgftheos" "hpair_mgths" "hms2match" "ms2match_all" "mframes_adv" "find_ms2_bypep" "search_mgf"
# [8] "hms2match_one" "ms2match_one" "frames_adv"
#
# $msmsmatches.R
# [1] "matchMS" "try_psmC2Q" "reproc_psmC" "psmC2Q" "post_psmC2Q" "check_tmt_pars"
# [7] "checkMGF" "check_locmods" "map_raw_n_scan" "check_fdr_group"
#
# $msmsmatches2.R
# [1] "ms2match" "reverse_peps_in_frame" "reverse_seqs" "calib_mgf" "calib_ms1"
# [6] "find_ms1_offsets"
#
# $mzion.R
# character(0)
Expand All @@ -74,10 +75,11 @@
# [1] "creat_folds" "cv_svm" "perco_svm"
#
# $quant2.R
# [1] "hcalc_tmtint" "calc_tmtint" "add_rptrs" "find_reporter_ints" "find_reporters_ppm"
# [6] "msub_protpep" "sub_protpep" "add_protacc2" "add_protacc" "hannot_decoys"
# [11] "groupProts" "map_pepprot" "collapse_sortpeps" "pcollapse_sortpeps" "chunksplit_spmat"
# [16] "find_group_breaks" "cut_proteinGroups" "sparseD_fourquad" "as_dist" "greedysetcover3"
# [1] "hcalc_tmtint" "calc_tmtint" "add_rptrs" "find_int_cols" "find_reporter_ints"
# [6] "find_reporters_ppm" "msub_protpep" "sub_protpep" "add_protacc2" "add_protacc"
# [11] "hannot_decoys" "groupProts" "map_pepprot" "collapse_sortpeps" "pcollapse_sortpeps"
# [16] "chunksplit_spmat" "find_group_breaks" "cut_proteinGroups" "sparseD_fourquad" "as_dist"
# [21] "greedysetcover3"
#
# $roadmaps.R
# character(0)
Expand All @@ -93,8 +95,9 @@
# [29] "fill_probco_nas" "fill_probs" "post_pepfdr" "calc_protfdr"
# [33] "aggr_prot_es" "calc_protfdr_i" "fit_protfdr" " f"
# [37] "find_ppm_outer_bycombi" "match_ex2th2" "calc_peploc" "calcpeprank_1"
# [41] "calcpeprank_2" "calcpeprank_3" "find_chunkbreaks" "findLocFracsDF"
# [45] "concatFracs" "na.interp" "is.constant" "tsoutliers"
# [41] "calcpeprank_2" "calcpeprank_3" "find_bestnotch" "find_chunkbreaks"
# [45] "findLocFracsDF" "concatFracs" "na.interp" "is.constant"
# [49] "tsoutliers"
#
# $silac.R
# [1] "matchMS_silac_mix" "matchMS_par_groups" "add_fixedlab_masses" "matchMS_noenzyme" "combine_ion_matches"
Expand All @@ -118,13 +121,13 @@
# [36] "finds_uniq_vec" "my_dataframe" "flatten_list" "calc_rev_ms2" "bind_dfs"
#
# $utils_os.R
# [1] "`names_pos<-`" "find_int_cols" "ins_cols_after" "add_cols_at"
# [5] "replace_cols_at" "reloc_col_after" "reloc_col_after_last" "reloc_col_after_first"
# [9] "reloc_col_before" "reloc_col_before_last" "reloc_col_before_first" "find_preceding_colnm"
# [13] "recur_flatten" "chunksplit" "chunksplitLB" "find_dir"
# [17] "create_dir" "save_call2" "find_callarg_vals" "match_calltime"
# [21] "delete_files" "find_ms1_times" "get_globalvar" "load_cache_info"
# [25] "is_nulllist" "add_nulllist"
# [1] "`names_pos<-`" "ins_cols_after" "add_cols_at" "replace_cols_at"
# [5] "reloc_col_after" "reloc_col_after_last" "reloc_col_after_first" "reloc_col_before"
# [9] "reloc_col_before_last" "reloc_col_before_first" "find_preceding_colnm" "recur_flatten"
# [13] "chunksplit" "chunksplitLB" "find_dir" "create_dir"
# [17] "save_call2" "find_callarg_vals" "match_calltime" "delete_files"
# [21] "find_ms1_times" "get_globalvar" "load_cache_info" "is_nulllist"
# [25] "add_nulllist"
#
# $utils_ui.R
# [1] "calc_monopeptide" "calc_monopep" "check_aaseq" "calc_ms2ionseries" "calc_ms2ions" "unique_mvmods"
Expand Down
80 changes: 46 additions & 34 deletions R/scores.R
Original file line number Diff line number Diff line change
Expand Up @@ -1479,7 +1479,7 @@ probco_bypeplen <- function (len, td, fdr_type = "protein", target_fdr = 0.01,
.01

newx <- seq(min_x, max_x, by = step)
newy <- predict(fit, data.frame(x = newx)) %>% `names<-`(newx)
newy <- predict(fit, data.frame(x = newx)) |> `names<-`(newx)

# NA if not existed
score_co2 <- as.numeric(names(which(newy <= target_fdr)[1]))
Expand Down Expand Up @@ -1651,11 +1651,14 @@ find_probco_valley <- function (prob_cos, guess = 12L)


#' Prepares target-decoy data.
#'
#'
#' @param td A data frame of targets and decoys (for Percolator).
#' @param only_notch_zero Logical; if TRUE, use only data at \code{ms1_offsets =
#' 0}.
#' @inheritParams matchMS
prep_pepfdr_td <- function (td = NULL, out_path, enzyme = "trypsin_p",
nes_fdr_group = "base", fdr_group = "base")
nes_fdr_group = "base", fdr_group = "base",
ms1_offsets = 0, only_notch_zero = TRUE)
{
files <- list.files(path = file.path(out_path, "temp"),
pattern = "^pepscores_", full.names = TRUE)
Expand All @@ -1669,6 +1672,9 @@ prep_pepfdr_td <- function (td = NULL, out_path, enzyme = "trypsin_p",
td <- dplyr::bind_rows(td)
}

if (length(ms1_offsets) > 1L && only_notch_zero)
td <- td[with(td, abs(pep_ms1_offset) <= 1e-4), ]

cts <- dplyr::count(dplyr::group_by(td, "pep_mod_group"), pep_mod_group)
max_i <- cts$pep_mod_group[which.max(cts$n)[[1]]]
top3s <- cts$pep_mod_group[which_topx2(cts$n, 3)[1:3]]
Expand Down Expand Up @@ -1788,12 +1794,12 @@ calc_pepfdr <- function (target_fdr = .01, fdr_type = "protein",
{
message("Calculating peptide FDR.")

# may exclude non-zero ms1_offsets...

td <- prep_pepfdr_td(out_path = out_path,
enzyme = enzyme,
nes_fdr_group = nes_fdr_group,
fdr_group = fdr_group)
fdr_group = fdr_group,
ms1_offsets = ms1_offsets,
only_notch_zero = TRUE)

# back-compatibility to new column keys (e.g. scan_num -> pep_scan_num)
if (!"pep_scan_num" %in% names(td))
Expand Down Expand Up @@ -2219,7 +2225,7 @@ calc_protfdr <- function (df = NULL, target_fdr = .01, max_protscores_co = Inf,
message("Calculating peptide-protein FDR.")

# score cut-offs as a function of prot_n_pep
max_n_pep <- max(df$prot_n_pep, na.rm = TRUE)
max_n_pep <- max(df$prot_n_pep, na.rm = TRUE)
all_n_peps <- unique(df$prot_n_pep)

# protein enrichment score cut-offs at each `prot_n_pep`
Expand Down Expand Up @@ -2296,10 +2302,10 @@ calc_protfdr_i <- function (td, target_fdr = .01, max_protnpep_co = 10L,
if (td[["prot_n_pep"]][[1]] > max_protnpep_co)
return(0L)

td <- td %>%
dplyr::group_by(prot_acc, pep_seq) %>%
dplyr::arrange(-pep_score) %>%
dplyr::filter(row_number() == 1L) %>%
td <- td |>
dplyr::group_by(prot_acc, pep_seq) |>
dplyr::arrange(-pep_score) |>
dplyr::filter(row_number() == 1L) |>
dplyr::ungroup()

## no decoys
Expand All @@ -2316,8 +2322,8 @@ calc_protfdr_i <- function (td, target_fdr = .01, max_protnpep_co = 10L,
return(1L)

# (NOT to use `local` for hard `return (0L)`)
prot_scores <- td %>%
dplyr::mutate(prot_es = pep_score - pep_score_co) %>%
prot_scores <- td |>
dplyr::mutate(prot_es = pep_score - pep_score_co) |>
dplyr::group_by(prot_acc)

prot_scores <- switch(method_prot_es_co,
Expand Down Expand Up @@ -2345,8 +2351,8 @@ calc_protfdr_i <- function (td, target_fdr = .01, max_protnpep_co = 10L,
# keep one unique `prot_es` (by roughly assuming `prot_es` is an indicator for
# the redundancy).

prot_scores <- prot_scores %>%
dplyr::mutate(prot_es = round(prot_es, digits = 5L)) %>%
prot_scores <- prot_scores |>
dplyr::mutate(prot_es = round(prot_es, digits = 5L)) |>
dplyr::filter(!duplicated(prot_es))

# Removes the last three in case only one or two decoy spikes near the end
Expand All @@ -2358,20 +2364,20 @@ calc_protfdr_i <- function (td, target_fdr = .01, max_protnpep_co = 10L,
# -NP_001157012 54.4
# NP_077772 62.7

prot_scores <- prot_scores %>%
dplyr::arrange(prot_es) %>%
prot_scores <- prot_scores |>
dplyr::arrange(prot_es) |>
dplyr::filter(row_number() > n_burnin)

# no decoys
if (!any(grepl("^-", prot_scores$prot_acc)))
return (0L)

# both targets and decoys
td <- td %>%
dplyr::right_join(prot_scores, by = "prot_acc") %>%
dplyr::arrange(-prot_es) %>%
dplyr::mutate(total = row_number()) %>%
dplyr::mutate(decoy = cumsum(pep_isdecoy)) %>%
td <- td |>
dplyr::right_join(prot_scores, by = "prot_acc") |>
dplyr::arrange(-prot_es) |>
dplyr::mutate(total = row_number()) |>
dplyr::mutate(decoy = cumsum(pep_isdecoy)) |>
dplyr::mutate(fdr = decoy/total)

rm(list = "prot_scores")
Expand Down Expand Up @@ -2404,11 +2410,11 @@ calc_protfdr_i <- function (td, target_fdr = .01, max_protnpep_co = 10L,
min_score <- min(data$x, na.rm = TRUE)
max_score <- max(data$x, na.rm = TRUE)
newx <- min_score:max_score
newy <- predict(fit, data.frame(x = newx)) %>% `names<-`(newx)
newy <- predict(fit, data.frame(x = newx)) |> `names<-`(newx)

# NA if not existed
score_co2 <- which(newy <= target_fdr)[1] %>%
names() %>%
score_co2 <- which(newy <= target_fdr)[1] |>
names() |>
as.numeric()

local({
Expand Down Expand Up @@ -2493,9 +2499,9 @@ fit_protfdr <- function (vec, max_n_pep = 1000L, out_path)
NA
}
else {
fits %>%
.[!is.na(.)] %>%
.[[length(.)]]
# fits %>% .[!is.na(.)] %>% .[[length(.)]]
fits <- fits[!is.na(fits)]
fits <- fits[[length(fits)]]
}
}

Expand Down Expand Up @@ -2524,7 +2530,7 @@ fit_protfdr <- function (vec, max_n_pep = 1000L, out_path)

out <- data.frame(
prot_n_pep = newx,
prot_score_co = newy) %>%
prot_score_co = newy) |>
dplyr::mutate(prot_score_co = ifelse(prot_n_pep >= elbow, 0, prot_score_co))
}
else {
Expand All @@ -2538,7 +2544,7 @@ fit_protfdr <- function (vec, max_n_pep = 1000L, out_path)

out <- data.frame(
prot_n_pep = newx,
prot_score_co = newy) %>%
prot_score_co = newy) |>
dplyr::mutate(prot_score_co = ifelse(prot_n_pep >= elbow, 0, prot_score_co))

}
Expand Down Expand Up @@ -3051,15 +3057,21 @@ find_bestnotch <- function (x)
x[, pep_rank0 := data.table::frank(-pep_score, ties.method = "min"),
by = list(query_id)]

# the best query at each query_id
g <- x[, c("pep_rank0", "query_id", "pep_ms1_offset")]
g <- g[g$pep_rank0 == 1L, ]
g[["pep_rank0"]] <- NULL
g[, id := seq_len(.N), by = query_id]
g <- g[g$id == 1L, ]
g[["id"]] <- NULL

# pep_ms1_offset == 0 goes first?
if (length(us <- unique(g$pep_ms1_offset)) > 1L) {
g <- g[g$pep_ms1_offset == us[[1]], ]
if (FALSE) {
if (length(us <- unique(g$pep_ms1_offset)) > 1L) {
i <- if (length(ok <- which(abs(us) < 1e-4)) == 1L) ok else 1L
g <- g[g$pep_ms1_offset == us[[i]], ]
}
}

g[["keep"]] <- TRUE
g[, uid := paste(query_id, pep_ms1_offset, sep = ".")]
g[["pep_ms1_offset"]] <- g[["query_id"]] <- NULL
Expand Down
7 changes: 6 additions & 1 deletion man/prep_pepfdr_td.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit b487e01

Please sign in to comment.