diff --git a/DESCRIPTION b/DESCRIPTION index 3b66e0e..dfd237c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,6 +24,7 @@ Imports: data.table, digest, dplyr, + e1071, fastmatch, ggplot2, purrr, diff --git a/R/batchMS2.R b/R/batchMS2.R index a116023..2ebd146 100644 --- a/R/batchMS2.R +++ b/R/batchMS2.R @@ -162,7 +162,7 @@ hbatch_ms2ions <- function (ms1_time = NULL, type_ms2ions = "by", c("gen_ms2ions_base", "ms2ions_by_type", "byions", "czions", "axions"), - envir = environment(mzion:::gen_ms2ions_base) + envir = environment(mzion::matchMS) ) ms2s <- parallel::clusterApply( @@ -237,7 +237,7 @@ hbatch_ms2ions <- function (ms1_time = NULL, type_ms2ions = "by", "gen_ms2ions_base", "ms2ions_by_type", "byions", "czions", "axions"), - envir = environment(mzion:::gen_ms2ions_a0_vnl0_fnl1) + envir = environment(mzion::matchMS) ) ms2s <- parallel::clusterApply( @@ -316,7 +316,7 @@ hbatch_ms2ions <- function (ms1_time = NULL, type_ms2ions = "by", "ms2ions_by_type", "byions", "czions", "axions", "add_hexcodes"), - envir = environment(mzion:::gen_ms2ions_a1_vnl0_fnl0) + envir = environment(mzion::matchMS) ) ms2s <- parallel::clusterApply( @@ -395,7 +395,7 @@ hbatch_ms2ions <- function (ms1_time = NULL, type_ms2ions = "by", "ms2ions_by_type", "byions", "czions", "axions", "add_hexcodes_vnl2"), - envir = environment(mzion:::gen_ms2ions_a1_vnl1_fnl0) + envir = environment(mzion::matchMS) ) ms2s <- parallel::clusterApply( @@ -475,7 +475,7 @@ hbatch_ms2ions <- function (ms1_time = NULL, type_ms2ions = "by", "ms2ions_by_type", "byions", "czions", "axions", "add_hexcodes_fnl2"), - envir = environment(mzion:::gen_ms2ions_a1_vnl0_fnl1) + envir = environment(mzion::matchMS) ) ms2s <- parallel::clusterApply( diff --git a/R/bin_masses.R b/R/bin_masses.R index 96a1b59..2831c38 100644 --- a/R/bin_masses.R +++ b/R/bin_masses.R @@ -130,7 +130,7 @@ bin_ms1masses <- function (res = NULL, min_mass = 200L, max_mass = 4500L, "binTheoSeqs2", "bin_theoseqs", "find_ms1_cutpoints"), - envir = 
environment(mzion:::binTheoSeqs_i)) + envir = environment(mzion::matchMS)) # No need of flatten() as saveRDS by INDIVIDUAL idx (and return NULL) parallel::clusterApplyLB( @@ -320,7 +320,7 @@ binTheoSeqs <- function (idxes = NULL, res = NULL, min_mass = 200L, parallel::clusterExport(cl, list("qread", "qsave"), envir = environment(qs::qsave)) parallel::clusterExport(cl, c("bin_theoseqs", "find_ms1_cutpoints"), - envir = environment(mzion:::bin_theoseqs)) + envir = environment(mzion::matchMS)) out <- parallel::clusterMap(cl, bin_theoseqs, res, file.path(out_dir, out_nms), diff --git a/R/funs.R b/R/funs.R index 116f664..9e5bb2e 100644 --- a/R/funs.R +++ b/R/funs.R @@ -18,9 +18,6 @@ # $funs.R # character(0) # -# $hello.R -# [1] "hello" -# # $ion_ladder.R # [1] "ms2ions_by_type" "byions" "czions" "axions" "bions_base" "yions_base" # [7] "b2ions_base" "bstarions" "bstar2ions" "b0ions" "b02ions" "y2ions" @@ -32,7 +29,7 @@ # [1] "mapMS2ions" "match_mgf_path" "match_raw_id" "add_raw_ids" # [5] "find_secion_types" "find_psm_rows" "find_psm_rows1" "find_psm_rows2" # [9] "find_theoexpt_pair" "find_mgf_query" "combine_prisec_matches" "check_existed_psms" -# [13] "get_mzion_coltypes" +# [13] "get_mzion_coltypes" # # $mgfs.R # [1] "load_mgfs" "readMGF" "post_readmgf" "readlineMGFs" " f" @@ -81,11 +78,14 @@ # $msmsmatches2.R # [1] "ms2match" "hcalc_tmtint" "reverse_peps_in_frame" "reverse_seqs" "calib_ms1masses" # +# $mzion.R +# character(0) +# # $mztab.R # [1] "make_mztab" # -# $mzion.R -# character(0) +# $percolator.R +# [1] "creat_folds" "cv_svm" "perco_svm" "probco_bypepcharge" "calc_z_pepfdr" # # $quant2.R # [1] "calc_tmtint" "add_rptrs" "find_reporter_ints" "find_reporters_ppm" "msub_protpep" @@ -101,12 +101,13 @@ # [5] "calc_probi" "scalc_pepprobs" "calc_pepprobs_i" "calc_pepscores" # [9] "find_decoy" "find_targets" "calcpepsc" "add_primatches" # [13] "collapse_vecs" "post_pepscores" "find_pepscore_co1" "find_pepscore_co2" -# [17] "probco_bypeplen" "find_optlens" 
"find_probco_valley" "calc_pepfdr" -# [21] "fill_probco_nas" "fill_probs" "post_pepfdr" "calc_protfdr" -# [25] "aggr_prot_es" "calc_protfdr_i" "fit_protfdr" " f" -# [29] "find_ppm_outer_bycombi" "match_ex2th2" "calc_peploc" "calcpeprank_1" -# [33] "calcpeprank_2" "calcpeprank_3" "find_chunkbreaks" "findLocFracsDF" -# [37] "concatFracs" "na.interp" "is.constant" "tsoutliers" +# [17] "probco_bypeplen" "sub_td_byfdrtype" "find_optlens" "find_probco_valley" +# [21] "prep_pepfdr_td" "keep_pepfdr_best" "calc_pepfdr" "fill_probco_nas" +# [25] "fill_probs" "post_pepfdr" "calc_protfdr" "aggr_prot_es" +# [29] "calc_protfdr_i" "fit_protfdr" " f" "find_ppm_outer_bycombi" +# [33] "match_ex2th2" "calc_peploc" "calcpeprank_1" "calcpeprank_2" +# [37] "calcpeprank_3" "find_chunkbreaks" "findLocFracsDF" "concatFracs" +# [41] "na.interp" "is.constant" "tsoutliers" # # $silac.R # [1] "matchMS_silac_mix" "matchMS_par_groups" "add_fixedlab_masses" "matchMS_noenzyme" "combine_ion_matches" diff --git a/R/mgfs.R b/R/mgfs.R index d17902f..51be7b4 100644 --- a/R/mgfs.R +++ b/R/mgfs.R @@ -469,7 +469,7 @@ read_mgf_chunks <- function (filepath = "~/mzion/mgf/temp_1", "index_mz", "integerize_ms2ints", "find_ms1_interval"), - envir = environment(mzion:::proc_mgf_chunks) + envir = environment(mzion::matchMS) ) out <- parallel::clusterApply(cl, file.path(filepath, filelist), diff --git a/R/ms1_precursors.R b/R/ms1_precursors.R index 6c25e27..811f07f 100644 --- a/R/ms1_precursors.R +++ b/R/ms1_precursors.R @@ -351,7 +351,7 @@ calc_pepmasses2 <- function (aa_masses = NULL, c("hsemipeps_byprots", "semipeps_byprots", "calc_semipepmasses"), - envir = environment(mzion:::calc_semipepmasses) + envir = environment(mzion::matchMS) ) fwd_peps <- parallel::clusterApply( @@ -383,7 +383,7 @@ calc_pepmasses2 <- function (aa_masses = NULL, "ct_counts", "rm_char_in_nfirst", "rm_char_in_nlast"), - envir = environment(mzion:::distri_peps) + envir = environment(mzion::matchMS) ) # aa_masses_all[[1]] is for the 
original all-fixed mode not for the coerced, @@ -423,7 +423,7 @@ calc_pepmasses2 <- function (aa_masses = NULL, cl <- parallel::makeCluster(getOption("cl.cores", n_cores)) parallel::clusterExport(cl, c("simple_prots_peps"), - envir = environment(mzion:::simple_prots_peps)) + envir = environment(mzion::matchMS)) prps <- parallel::clusterApply( cl, @@ -538,7 +538,7 @@ calc_pepmasses2 <- function (aa_masses = NULL, "ms1_a0_vnl0_fnl1", "expand_grid_rows", "delta_ms1_a0_fnl1"), - envir = environment(mzion:::ms1_a0_vnl0_fnl1)) + envir = environment(mzion::matchMS)) fwd_peps[[i]] <- parallel::clusterApply( cl, @@ -602,7 +602,7 @@ calc_pepmasses2 <- function (aa_masses = NULL, "expand_grid_rows", "recur_flatten", "delta_ms1_a0_fnl1"), - envir = environment(mzion:::ms1_a1_vnl0_fnl0)) + envir = environment(mzion::matchMS)) for (i in inds) { amods_i <- amods[[i]] @@ -1924,7 +1924,7 @@ split_fastaseqs <- function (fasta = NULL, enzyme = "trypsin_p", cl, c("make_fastapeps0", "keep_n_misses"), - envir = environment(mzion:::make_fastapeps0)) + envir = environment(mzion::matchMS)) # --- message("Splitting fasta sequences.") @@ -2157,7 +2157,7 @@ split_fastaseqs_noenz <- function (fasta = NULL, acc_type = "uniprot_acc", "mmake_noenzpeps", "hmake_noenzpeps", "ms1masses_bare_noenz"), - envir = environment(mzion:::make_noenzpeps)) + envir = environment(mzion::matchMS)) peps <- parallel::clusterApply(cl, chunksplit(fasta_db, n_cores), mmake_noenzpeps, @@ -2657,7 +2657,7 @@ ms1masses_bare <- function (seqs = NULL, aa_masses = NULL, ftmass = NULL, cl <- parallel::makeCluster(getOption("cl.cores", n_cores)) parallel::clusterExport(cl, c("roll_sum", "accumulate_char"), - envir = environment(mzion:::roll_sum)) + envir = environment(mzion::matchMS)) ms_1 <- parallel::clusterApply( cl = cl, @@ -2775,7 +2775,7 @@ ms1masses_noterm <- function (aa_seqs, aa_masses, maxn_vmods_per_pep = 5L, c("calcms1mass_noterm", "calcms1mass_noterm_byprot", "calcms1mass_noterm_bypep"), - envir = 
environment(mzion:::calcms1mass_noterm)) + envir = environment(mzion::matchMS)) out <- parallel::clusterApply(cl, aa_seqs, calcms1mass_noterm, aa_masses = aa_masses, diff --git a/R/ms2_a0_vnl0_fnl1.R b/R/ms2_a0_vnl0_fnl1.R index 3135e41..bb54bee 100644 --- a/R/ms2_a0_vnl0_fnl1.R +++ b/R/ms2_a0_vnl0_fnl1.R @@ -41,6 +41,7 @@ ms2match_a0_vnl0_fnl1 <- function (i, aa_masses, ms1vmods, ms2vmods, "gen_ms2ions_a0_vnl0_fnl1", "expand_grid_rows", "gen_ms2ions_base", + "calc_rev_ms2", "ms2ions_by_type", "byions", "czions", "axions", "bions_base", "yions_base", @@ -51,7 +52,7 @@ ms2match_a0_vnl0_fnl1 <- function (i, aa_masses, ms1vmods, ms2vmods, "fuzzy_match_one", "fuzzy_match_one2", "post_frame_adv"), - envir = environment(mzion:::frames_adv) + envir = environment(mzion::matchMS) ) out <- parallel::clusterMap( diff --git a/R/ms2_a1_vnl0_fnl0.R b/R/ms2_a1_vnl0_fnl0.R index df05dd3..146435f 100644 --- a/R/ms2_a1_vnl0_fnl0.R +++ b/R/ms2_a1_vnl0_fnl0.R @@ -40,6 +40,7 @@ ms2match_a1_vnl0_fnl0 <- function (i, aa_masses, ms1vmods, ms2vmods, cl, c("frames_adv", "gen_ms2ions_a1_vnl0_fnl0", + "calc_rev_ms2", "match_mvmods", "expand_grid_rows", "find_vmodscombi", @@ -63,7 +64,7 @@ ms2match_a1_vnl0_fnl0 <- function (i, aa_masses, ms1vmods, ms2vmods, "fuzzy_match_one", "fuzzy_match_one2", "post_frame_adv"), - envir = environment(mzion:::frames_adv) + envir = environment(mzion::matchMS) ) out <- parallel::clusterMap( diff --git a/R/ms2_a1_vnl0_fnl1.R b/R/ms2_a1_vnl0_fnl1.R index ac7f206..31d0e7f 100644 --- a/R/ms2_a1_vnl0_fnl1.R +++ b/R/ms2_a1_vnl0_fnl1.R @@ -46,6 +46,7 @@ ms2match_a1_vnl0_fnl1 <- function (i, aa_masses, ms1vmods, ms2vmods, c("frames_adv", "gen_ms2ions_a1_vnl0_fnl1", "gen_ms2ions_a1_vnl0_fnl0", + "calc_rev_ms2", "match_mvmods", "expand_grid_rows", "find_vmodscombi", @@ -69,7 +70,7 @@ ms2match_a1_vnl0_fnl1 <- function (i, aa_masses, ms1vmods, ms2vmods, "fuzzy_match_one", "fuzzy_match_one2", "post_frame_adv"), - envir = environment(mzion:::frames_adv) + envir = 
environment(mzion::matchMS) ) out <- parallel::clusterMap( diff --git a/R/ms2_a1_vnl1_fnl0.R b/R/ms2_a1_vnl1_fnl0.R index 4a8ae41..24ea415 100644 --- a/R/ms2_a1_vnl1_fnl0.R +++ b/R/ms2_a1_vnl1_fnl0.R @@ -43,6 +43,7 @@ ms2match_a1_vnl1_fnl0 <- function (i, aa_masses, ms1vmods, ms2vmods, cl, c("frames_adv", "gen_ms2ions_a1_vnl1_fnl0", + "calc_rev_ms2", "match_mvmods", "expand_grid_rows", "find_vmodscombi", @@ -66,7 +67,7 @@ ms2match_a1_vnl1_fnl0 <- function (i, aa_masses, ms1vmods, ms2vmods, "fuzzy_match_one", "fuzzy_match_one2", "post_frame_adv"), - envir = environment(mzion:::frames_adv) + envir = environment(mzion::matchMS) ) out <- parallel::clusterMap( diff --git a/R/ms2_base.R b/R/ms2_base.R index 388c380..57695df 100644 --- a/R/ms2_base.R +++ b/R/ms2_base.R @@ -51,6 +51,7 @@ ms2match_base <- function (i, aa_masses, ms1vmods, ms2vmods, ntmass, ctmass, cl, c("frames_adv", "gen_ms2ions_base", + "calc_rev_ms2", "ms2ions_by_type", "byions", "czions", "axions", "bions_base", "yions_base", @@ -61,7 +62,7 @@ ms2match_base <- function (i, aa_masses, ms1vmods, ms2vmods, ntmass, ctmass, "fuzzy_match_one", "fuzzy_match_one2", "post_frame_adv"), - envir = environment(mzion:::frames_adv) + envir = environment(mzion::matchMS) ) out <- parallel::clusterMap( diff --git a/R/msmsmatches.R b/R/msmsmatches.R index 95f829f..b54e778 100644 --- a/R/msmsmatches.R +++ b/R/msmsmatches.R @@ -1,6 +1,6 @@ -#' Searches for MS ions. +#' An integrated facility for searches of mass spectrometry data. #' -#' Database searches of MSMS data. +#' Database searches of MS/MS data (DDA). #' #' @section \code{Output columns}: \code{system.file("extdata", #' "column_keys.txt", package = "mzion")} \cr @@ -36,9 +36,6 @@ #' #' With MSConvert, the default \code{titleMaker} is required for correct #' parsing (don't think it can be altered by users, but just in case). -#' -#' Individuality in MGF files are slightly preferred to take advantage of -#' parallel reading of the files. 
#' @param fasta Character string(s) to the name(s) of fasta file(s) with #' prepended directory path. The experimenter needs to supply the files. #' @param acc_type Character string(s); the types of protein accessions in one @@ -73,15 +70,16 @@ #' \code{locmods = c("Phospho (S)", "Phospho (T)", "Phospho (Y)")} for #' phosphopeptides, \code{locmods = "Acetyl (K)"} for lysine acetylation. #' \code{fixedmods} that were coerced to \code{varmods} will be added -#' automatically to \code{locmods}. For convenience, the default is set to -#' look for applicable peptide phosphorylation. +#' automatically to \code{locmods}. +#' +#' For convenience, the default is set to look for applicable peptide +#' phosphorylation (and may encounter warning messages if the data type is +#' different to the default). #' @param mod_motifs The motifs to restrict \code{Anywhere} variable #' modification. For example, provided the \code{Anywhere} variable #' modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and -#' #' \code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = -#' c("NM", "MP"))} -#' +#' c("NM", "MP"))}, #' variable modifications will only be considered at sites that satisfy the #' motifs. #' @param enzyme A character string; the proteolytic specificity of the assumed @@ -150,12 +148,12 @@ #' @param min_len A positive integer; the minimum length of peptide sequences #' for considerations. Shorter peptides will be excluded. The default is 7. #' @param max_len A positive integer; the maximum length of peptide sequences -#' for considerations. Longer peptides will be excluded. +#' for considerations. Longer peptides will be excluded. The default is 40. #' @param max_miss A non-negative integer; the maximum number of mis-cleavages #' per peptide sequence for considerations. The default is 2. #' @param min_mass A positive integer; the minimum precursor mass for -#' interrogation. The default is an arbitrarily low value. 
The primary guard -#' against low molecular-weight precursors is \code{min_len}. +#' interrogation. The default is an arbitrarily low value (the primary guard +#' against low molecular-weight precursors is \code{min_len}). #' @param max_mass A positive integer; the maximum precursor mass for #' interrogation. #' @param min_ms2mass A positive integer; the minimum MS2 mass for @@ -169,16 +167,17 @@ #' envelopes. Nevertheless, by setting \code{n_13c = 1}, some increases in the #' number of PSMs may be readily achieved at a relatively small cost of search #' time. -#' @param par_groups Parameter(s) of \code{matchMS} multiplied by sets of values -#' in groups. Multiple searches will be performed separately against the -#' parameter groups. For instance with one set of samples in SILAC light and -#' the other in SILAC heavy, the experimenters may specify two arguments for -#' parameter \code{mgf_path} and two arguments for parameter \code{fixedmods} -#' that link to the respective samples. In this way, there is no need to -#' search against, e.g. heavy-isotope-labeled K8R10 with the light samples and -#' vice versa. Note that results will be combined at the end, with the group -#' names indicated under column \code{pep_group}. The default is NULL without -#' grouped searches. See the examples under SILAC and Group searches. +#' @param par_groups A low -priority feature. Parameter(s) of \code{matchMS} +#' multiplied by sets of values in groups. Multiple searches will be performed +#' separately against the parameter groups. For instance with one set of +#' samples in SILAC light and the other in SILAC heavy, the experimenters may +#' specify two arguments for parameter \code{mgf_path} and two arguments for +#' parameter \code{fixedmods} that link to the respective samples. In this +#' way, there is no need to search against, e.g. heavy-isotope-labeled K8R10 +#' with the light samples and vice versa. 
Note that results will be combined +#' at the end, with the group names indicated under column \code{pep_group}. +#' The default is NULL without grouped searches. See the examples under SILAC +#' and Group searches. #' @param silac_mix A list of labels indicating SILAC groups in samples. The #' parameter is most relevant for SILAC experiments where peptides of heavy, #' light etc. were \emph{mixed} into one sample. The default is NULL @@ -210,17 +209,18 @@ #' for consideration as a hit. Counts of secondary ions, e.g. b0, b* etc., are #' not part of the threshold. #' @param exclude_reporter_region Logical; if TRUE, excludes MS2 ions in the -#' region of TMT reporter ions. The default is FALSE. The argument affects -#' only TMT data. The range of TMT reporter ions is given by -#' \code{tmt_reporter_lower} and \code{tmt_reporter_upper}. +#' region of TMT reporter ions. The default is FALSE. The corresponding range +#' of TMT reporter ions is informed by \code{tmt_reporter_lower} and +#' \code{tmt_reporter_upper}. The argument affects only TMT data. #' @param tmt_reporter_lower The lower bound of the region of TMT reporter ions. #' The default is \eqn{126.1}. #' @param tmt_reporter_upper The upper bound of the region of TMT reporter ions. #' The default is \eqn{135.2}. -#' @param index_mgf_ms2 Logical; if TRUE, converts upfrontly MS2 m-over-z values -#' from numeric to integers as opposed to in-situ conversion during ion -#' matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be -#' useful for very large MS files by reducing RAM footprints. +#' @param index_mgf_ms2 A low-priority feature. Logical; if TRUE, converts +#' upfrontly MS2 m-over-z values from numeric to integers as opposed to +#' \emph{in-situ} conversion during ion matches. The default is FALSE. The +#' \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by +#' reducing RAM footprints. 
#' #' At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between #' theoretical and experimental MS2 m-over-z values is limited by the @@ -272,7 +272,7 @@ #' Note that \code{fdr_type = protein} is comparable to \code{fdr_type = #' peptide} with the additional filtration of data at \code{prot_tier == 1}. #' @param fdr_group A character string; the modification group(s) for uses in -#' peptide FDR controls. The value is in one of c("all", "base"). The +#' peptide FDR controls. The value is in one of \code{c("all", "base")}. The #' \code{base} corresponds to the modification group with the largest number #' of matches. #' @param max_pepscores_co A positive numeric; the upper limit in the cut-offs @@ -299,10 +299,11 @@ #' pep_score_cutoff} under a protein will be used to represent the threshold #' of a protein enrichment score. For more conserved thresholds, the #' statistics of \code{"max"} may be considered. -#' @param soft_secions Depreciated. Logical; if TRUE, collapses the intensities -#' of secondary ions to primary ions at the absence of the primaries. The -#' default is FALSE. For instance, the signal of \code{b5^*} will be ignored -#' if its primary ion \code{b5} is not matched. +#' @param soft_secions Impacts on search performance not yet assessed. Logical; +#' if TRUE, collapses the intensities of secondary ions to primary ions even +#' when the primaries are absent. The default is FALSE. For instance, the +#' signal of \code{b5^*} will be ignored if its primary ion \code{b5} is not +#' matched. #' @param topn_seqs_per_query Positive integer; a threshold to discard peptide #' matches under the same MS query with scores beyond the top-n. #' @@ -342,11 +343,11 @@ #' Tier-3: one significant peptide per protein and protein scores below #' significance thresholds. #' -#' @param max_n_prots A positive integer to threshold the maximum number of -#' protein entries before coercing \code{fdr_type} from \code{psm} or -#' \code{peptide} to \code{protein}. 
The argument has no effect if -#' \code{fdr_type} is already \code{protein}. In general, there is no need to -#' change the default. +#' @param max_n_prots Softly depreciated. A positive integer to threshold the +#' maximum number of protein entries before coercing \code{fdr_type} from +#' \code{psm} or \code{peptide} to \code{protein}. The argument has no effect +#' if \code{fdr_type} is already \code{protein}. In general, there is no need +#' to change the default. #' #' Note that for memory efficiency proteins at tiers 1, 2 and 3 are grouped #' separately. Further note that there is no tier-2 proteins at @@ -378,6 +379,22 @@ #' \eqn{m/z} values (\code{pep_ms2_moverzs}). #' @param add_ms2ints Logical; if TRUE, adds the sequence of experimental MS2 #' intensity values (\code{pep_ms2_ints}). +#' +#' @param svm_reproc Logical; if TRUE, reprocesses peptide data for significance +#' thresholds with a support vector machine (SVM) approach analogous to +#' \href{https://www.nature.com/articles/nmeth1113}{Percolator}. +#' @param svm_kernel The SVM kernel. See also \link[e1071]{svm}. +#' @param svm_feats Features used for SVM classifications. +#' @param svm_iters A positive integer; the number of iterations in +#' \link[e1071]{svm}. +#' @param svm_cv Logical; if TRUE, performs cross validation for the +#' regularization cost. +#' @param svm_k A positive integer; specifies the k-number of folds in cross +#' validation. +#' @param svm_costs The cost constraints for k-fold cross validation. +#' @param svm_def_cost The default cost for SVM. +#' @param svm_iters The number of iteration in SVM learning. +#' #' @param .path_cache The file path of cached search parameters. The parameter #' is for the users' awareness of the underlying structure of file folders and #' the use of default is suggested. 
Occasionally experimenters may remove the @@ -430,9 +447,6 @@ #' out_path = "~/mzion/examples", #' ) #' -#' # (from protein to PSM FDR) -#' reproc_psmC(out_path = "~/mzion/examples", fdr_type = "psm", -#' combine_tier_three = TRUE) #' #' # TMT-16plex, phospho #' matchMS( @@ -443,7 +457,6 @@ #' locmods = c("Phospho (S)", "Phospho (T)", "Phospho (Y)"), #' quant = "tmt16", #' fdr_type = "psm", -#' combine_tier_three = TRUE, #' out_path = "~/mzion/examples", #' ) #' @@ -456,11 +469,11 @@ #' ppm_ms2 = 40, #' quant = "none", #' fdr_type = "protein", -#' out_path = "~/mzion/examples_pasef", +#' out_path = "~/mzion/examples", #' ) #' #' # Wrapper of matchMS(enzyme = noenzyme, ...) without sectional searches -#' # by ranges of peptide lengths +#' # by ranges of peptide lengths #' matchMS_NES( #' fasta = c("~/mzion/dbs/fasta/refseq/refseq_hs_2013_07.fasta", #' "~/mzion/dbs/fasta/refseq/refseq_mm_2013_07.fasta", @@ -697,6 +710,17 @@ matchMS <- function (out_path = "~/mzion/outs", add_ms2theos = FALSE, add_ms2theos2 = FALSE, add_ms2moverzs = FALSE, add_ms2ints = FALSE, + svm_reproc = FALSE, + svm_kernel = "radial", + svm_feats = c("pep_score", "pep_ret_range", + "pep_delta", "pep_n_ms2", + "pep_expect", "pep_exp_mz", # "pep_exp_z", + "pep_exp_mr", "pep_tot_int", + "pep_n_matches2", "pep_ms2_deltas_mean"), + svm_cv = TRUE, svm_k = 3L, + svm_costs = c(.1, .3, 1, 3, 10), svm_def_cost = 1, + svm_iters = 10L, + digits = 4L, ...) 
{ options(digits = 9L) @@ -756,7 +780,8 @@ matchMS <- function (out_path = "~/mzion/outs", # logical types stopifnot(vapply(c(soft_secions, combine_tier_three, calib_ms1mass, use_ms1_cache, add_ms2theos, add_ms2theos2, add_ms2moverzs, - add_ms2ints, exclude_reporter_region, index_mgf_ms2), + add_ms2ints, exclude_reporter_region, index_mgf_ms2, + svm_cv), is.logical, logical(1L))) # numeric types @@ -1285,6 +1310,9 @@ matchMS <- function (out_path = "~/mzion/outs", if (is.null(bypass_pepscores)) bypass_pepscores <- FALSE if (!bypass_pepscores) { + tally_ms2ints <- dots$tally_ms2ints + if (is.null(tally_ms2ints)) tally_ms2ints <- TRUE + calc_pepscores(topn_ms2ions = topn_ms2ions, type_ms2ions = type_ms2ions, target_fdr = target_fdr, @@ -1296,6 +1324,7 @@ matchMS <- function (out_path = "~/mzion/outs", out_path = out_path, min_ms2mass = min_ms2mass, index_mgf_ms2 = index_mgf_ms2, + tally_ms2ints = tally_ms2ints, # dummies mgf_path = mgf_path, @@ -1337,8 +1366,28 @@ matchMS <- function (out_path = "~/mzion/outs", nes_fdr_group = nes_fdr_group, out_path = out_path) - post_pepfdr(prob_cos, out_path) - rm(list = "prob_cos") + ans <- post_pepfdr(prob_cos, out_path) + + if (svm_reproc) { + message("SVM reprocessing of peptide probabilities.") + + prob_cos <- perco_svm(out_path = out_path, df = ans, prob_cos = prob_cos, + target_fdr = target_fdr, fdr_type = fdr_type, + min_len = min_len, max_len = max_len, + max_pepscores_co = max_pepscores_co, + min_pepscores_co = min_pepscores_co, enzyme = enzyme, + fdr_group = fdr_group, nes_fdr_group = nes_fdr_group, + svm_kernel = svm_kernel, svm_feats = svm_feats, + cross_valid = svm_cv, k = svm_k, + costs = svm_costs, + def_cost = svm_def_cost, + svm_iters = svm_iters) + + # post_pepfdr(prob_cos, out_path) + message("Completed SVM reprocessing.") + } + + rm(list = c("ans", "prob_cos")) } ## Peptide ranks and score deltas between `pep_ivmod` @@ -1354,43 +1403,64 @@ matchMS <- function (out_path = "~/mzion/outs", gc() } - ## Protein 
accessions, score cut-offs and optional reporter ions + ## Protein accessions bypass_from_protacc <- dots$bypass_from_protacc if (is.null(bypass_from_protacc)) bypass_from_protacc <- FALSE if (bypass_from_protacc) return(NULL) - if (enzyme != "noenzyme" || isTRUE(dots[["direct_prot_acc"]])) { - df <- add_protacc(out_path = out_path, - .path_cache = .path_cache, - .path_fasta = .path_fasta) - } + bypass_protacc <- dots$bypass_protacc + if (is.null(bypass_protacc)) bypass_protacc <- FALSE + temp_dir <- file.path(out_path, "temp") + file_protacc <- file.path(temp_dir, "df_protacc.rds") + + if (bypass_protacc && file.exists(file_protacc)) + df <- qs::qread(file_protacc) else { - silac_noenzyme <- if (isTRUE(dots$silac_noenzyme)) TRUE else FALSE - - if (silac_noenzyme) { - # see matchMS_noenzyme for nested silac under noenzyme + if (enzyme != "noenzyme" || isTRUE(dots[["direct_prot_acc"]])) df <- add_protacc(out_path = out_path, .path_cache = .path_cache, .path_fasta = .path_fasta) - } else { - # with multiple subdirs (length ranges) - df <- add_protacc2(out_path = out_path, - .path_cache = .path_cache, - .path_fasta = .path_fasta) + silac_noenzyme <- if (isTRUE(dots$silac_noenzyme)) TRUE else FALSE + + # see matchMS_noenzyme for nested silac under noenzyme + df <- if (silac_noenzyme) + add_protacc(out_path = out_path, + .path_cache = .path_cache, + .path_fasta = .path_fasta) + else + add_protacc2(out_path = out_path, + .path_cache = .path_cache, + .path_fasta = .path_fasta) + + rm(list = c("silac_noenzyme")) } - rm(list = "silac_noenzyme") + qs::qsave(df, file_protacc, preset = "fast") + } + + rm(list = "file_protacc") + + ## Protein FDR + bypass_protfdr <- dots$bypass_protfdr + if (is.null(bypass_protfdr)) bypass_protfdr <- FALSE + + file_protfdr <- file.path(temp_dir, "df_protfdr.rds") + + if (bypass_protfdr && file.exists(file_protfdr)) { + df <- qs::qread(file_protfdr) + } + else { + df <- calc_protfdr(df = df, + target_fdr = target_fdr, + max_protscores_co = 
max_protscores_co, + max_protnpep_co = max_protnpep_co, + method_prot_es_co = method_prot_es_co, + out_path = out_path) + qs::qsave(df, file_protfdr, preset = "fast") } - - df <- calc_protfdr(df = df, - target_fdr = target_fdr, - max_protscores_co = max_protscores_co, - max_protnpep_co = max_protnpep_co, - method_prot_es_co = method_prot_es_co, - out_path = out_path) df <- add_rptrs(df, quant, out_path) gc() @@ -1402,29 +1472,14 @@ matchMS <- function (out_path = "~/mzion/outs", df <- dplyr::mutate(df, pep_expect = 10^((pep_score_co - pep_score)/10) * target_fdr) df[["pep_score_co"]] <- NULL - df$pep_ms1_delta <- df$ms1_mass - df$theo_ms1 - - df <- dplyr::rename(df, - pep_scan_title = scan_title, - pep_exp_mz = ms1_moverz, - pep_exp_mr = ms1_mass, - pep_exp_z = ms1_charge, - pep_calc_mr = theo_ms1, - pep_delta = pep_ms1_delta, - pep_tot_int = ms1_int, - pep_ret_range = ret_time, - pep_scan_num = scan_num, - pep_n_ms2 = ms2_n, - pep_frame = frame) - - nms <- names(df) + df$pep_delta <- df$pep_exp_mr - df$pep_calc_mr - df <- dplyr::bind_cols( + nms <- names(df) + df <- dplyr::bind_cols( df[grepl("^prot_", nms)], df[grepl("^pep_", nms)], df[grepl("^psm_", nms)], df[!grepl("^prot_|^pep_|^psm_", nms)], ) - rm(list = "nms") df <- reloc_col_after(df, "pep_exp_z", "pep_exp_mr") @@ -1443,7 +1498,7 @@ matchMS <- function (out_path = "~/mzion/outs", session_info <- sessionInfo() save(session_info, file = file.path(out_path, "Calls", "mzion.rda")) }) - + ## psmC to psmQ df <- df[, c("prot_acc", "pep_seq", "pep_issig", "pep_isdecoy", "prot_issig", "prot_n_pep")] @@ -1547,7 +1602,11 @@ try_psmC2Q <- function (df = NULL, out_path = NULL, fdr_type = "protein", #' #' May solve some memory shortage issues for large data sets by restarting An #' Rstudio session. -#' +#' +#' The score cut-offs are different among the \code{fdr_type} of "psm", +#' "peptide" and "protein". An experimenter need to match the value of +#' \code{fdr_type}. 
+#' #' @param fct A factor for data splitting into chunks. May consider a greater #' value for a larger data set. #' @inheritParams matchMS @@ -1978,7 +2037,7 @@ map_raw_n_scan <- function (df, mgf_path) scans <- qs::qread(file_scan) scans2 <- names(scans) names(scans2) <- scans - df$scan_title <- unname(scans2[df$scan_title]) + df$pep_scan_title <- unname(scans2[df$pep_scan_title]) } else { stop("File not found: ", file_scan) @@ -1992,6 +2051,7 @@ map_raw_n_scan <- function (df, mgf_path) #' #' Not yet used. Takes values of integers or character strings. #' +#' @param oks A vector of allowed modification groups. #' @inheritParams matchMS check_fdr_group <- function (fdr_group = c("base", "all", "top3"), oks = c("base", "all")) diff --git a/R/percolator.R b/R/percolator.R new file mode 100644 index 0000000..7890d97 --- /dev/null +++ b/R/percolator.R @@ -0,0 +1,431 @@ +#' Creates folds for cross validation. +#' +#' From package caret. +#' +#' @param y The labels. +#' @param k The number of folds. +#' @param list Logical; should the result be a list or not. +#' @param returnTrain Logical; return training sets or not (test sets). 
creat_folds <- function (y, k = 10L, list = TRUE, returnTrain = FALSE)
{
  # A non-positive or missing `k` would otherwise surface as an obscure error
  # from rep() or if(); fail early with a clear condition instead.
  stopifnot(is.numeric(k), length(k) == 1L, !is.na(k), k >= 1)

  n_obs <- length(y)

  if (k < n_obs) {
    # Stratified assignment: fold numbers are drawn class by class so that
    # each label is spread as evenly as possible over the k folds.
    y <- factor(as.character(y))
    numInClass <- table(y)
    foldVector <- vector(mode = "integer", n_obs)

    # seq_along() rather than 1:length(): an empty table (e.g., all-NA labels)
    # skips the loop instead of iterating over c(1, 0).
    for (i in seq_along(numInClass)) {
      n_i <- numInClass[[i]]
      rows_i <- which(y == names(numInClass)[i])
      min_reps <- n_i %/% k

      if (min_reps > 0L) {
        # At least k members: every fold receives `min_reps` of them and the
        # remaining `spares` go to randomly chosen, distinct folds.
        spares <- n_i %% k
        seqVector <- rep(seq_len(k), min_reps)

        if (spares > 0L)
          seqVector <- c(seqVector, sample(seq_len(k), spares))

        foldVector[rows_i] <- sample(seqVector)
      }
      else {
        # Fewer members than folds: each member goes to a different fold.
        foldVector[rows_i] <- sample(seq_len(k), size = n_i)
      }
    }
  }
  else {
    # No more observations than folds: one observation per fold.
    foldVector <- seq_along(y)
  }

  if (list) {
    out <- split(seq_along(y), foldVector)
    # Zero-padded names (Fold01, Fold02, ...) keep the folds in sorted order.
    names(out) <- paste0("Fold", gsub(" ", "0", format(seq_along(out))))

    # The held-out indexes are complemented to obtain the training indexes.
    if (returnTrain)
      out <- lapply(out, function(data, y) y[-data], y = seq_along(y))
  }
  else
    out <- foldVector

  out
}


#' Calculates cross-validation errors.
#'
#' Fits one \link[e1071]{svm} model per candidate cost on \code{train} and
#' evaluates it on \code{test}, for optimizing the regularization cost.
#'
#' @param train Training set; a data frame with the response in column
#'   \code{y.} and the remaining columns as predictors.
#' @param test Test set; needs to contain column \code{y.} as well.
#' @param costs A vector of costs.
#' @param ... Additional arguments for svm.
#' @return A list with elements \code{tab} (the confusion tables of predicted
#'   versus true labels, one per cost) and \code{err} (the corresponding
#'   misclassification rates).
cv_svm <- function (train, test, costs = c(10E-2, 10E-1, 1, 5, 50), ...)
{
  # NOTE(review): 10E-2 and 10E-1 evaluate to 0.1 and 1, so the default grid
  # holds the cost of 1 twice; possibly 1E-2 and 1E-1 were intended -- confirm.
  tabs <- errs <- vector("list", length(costs))

  # seq_along() so that an empty `costs` yields empty results, not a fit at NA.
  for (i in seq_along(costs)) {
    m <- e1071::svm(y. ~ ., data = train, cost = costs[i], ...)
    pred <- predict(m, test)
    truth <- test[["y."]]
    tabs[[i]] <- table(pred = pred, true = truth)

    # Misclassification rate, i.e. the off-diagonal share of the confusion
    # table. The earlier (tab[1, 1] + tab[1, 2]) / total was the share of
    # observations predicted as the first class, not an error rate, and the
    # hard-coded 2 x 2 indexing failed when a class was absent.
    errs[[i]] <- mean(as.character(pred) != as.character(truth), na.rm = TRUE)
  }

  list(tab = tabs, err = errs)
}


#' Percolator
#'
#' @param df A data frame of \code{psmC.txt}.
#' @param prob_cos Probability cut-offs (as a function of pep_len).
#' @param fct_score The factor in converting probability p-values to scores. The
#'   value is always 10.
#' @param k The k-folds for cross validation.
#' @param cross_valid Logical; to perform cross validations or not.
+#' @param costs The costs for cross validations. +#' @param def_cost The default cost. +#' @param svm_tol Tolerance in FDR. +#' @param svm_iters The number of iterations. +#' @inheritParams matchMS +perco_svm <- function (prob_cos = NULL, out_path = NULL, df = NULL, + target_fdr = .01, fdr_type = "protein", + min_len = 7L, max_len = 40L, max_pepscores_co = 50, + min_pepscores_co = 0, enzyme = "trypsin_p", + fdr_group = "base", nes_fdr_group = "base", + fct_score = 10, k = 10, cross_valid = FALSE, + costs = c(.1, .3, 1, 3, 10), def_cost = 1L, + svm_kernel = "radial", + svm_feats = c("pep_score", "pep_ret_range", + "pep_delta", "pep_n_ms2", + "pep_expect", # "pep_len", + "pep_exp_mz", "pep_exp_mr", + "pep_tot_int", # "pep_mod_group", + "pep_n_matches2", + "pep_ms2_deltas_mean"), + svm_iters = 10L, svm_tol = 1E-4, ...) +{ + if (!all(costs > 0)) + costs <- c(.1, .3, 1, 3, 10) + + if (def_cost <= 0) + def_cost <- 1L + + if (svm_iters <= 0) + svm_iters <- 10L + + if (svm_tol <= 0) + svm_tol <- 1E-4 + + fileC <- file.path(out_path, "psmC.txt") + fileP <- file.path(out_path, "temp", "prob_cos.rds") + + # --- preparation + if (is.null(df)) { + if (is.null(out_path)) + stop("Argument \"out_path\" cannot be NULL.") + + if (!file.exists(fileC)) + stop("File not found: ", fileC) + + df <- readr::read_tsv(fileC) + } + + if (is.null(prob_cos)) { + if (!file.exists(fileP)) + stop("File not found: ", fileP) + + prob_cos <- qs::qread(fileP) + } + prob_cos0 <- prob_cos + + if (FALSE) { + if (nrow(df) <= 500L) { + message("No SVM post-processing with fewer than 500 observations.") + return(df) + } + } + + if (!"pep_delta" %in% names(df)) + df$pep_delta <- df$pep_exp_mr - df$pep_calc_mr + + cnms <- names(df) + + if (!"pep_issig" %in% cnms) { + warning("No SVM post-processing without data column \"pep_issig\".") + return(df) + } + + if (!all(c("raw_file", "pep_scan_num") %in% cnms)) { + warning("Require columns \"raw_file\" and \"pep_scan_num\".") + return(df) + } + + if (!"pep_score" 
%in% cnms) { + warning("Column \"pep_score\" not found.") + return(df) + } + + # information already used in getting initial `prob_cos` + if ("pep_len" %in% svm_feats) { + warning("Excluded feature \"pep_len\" (information already used).") + svm_feats <- svm_feats[svm_feats != "pep_len"] + } + + if (FALSE) { + if ("pep_z_expect" %in% svm_feats && !"pep_z_expect" %in% cnms) { + df <- df %>% + dplyr::left_join(calc_z_pepfdr(out_path = out_path), by = "pep_exp_z") %>% + dplyr::mutate(pep_z_expect = 10^((pep_z_prob_co - pep_score)/10) * target_fdr) + } + } + + rm(list = c("cnms")) + + + # --- initialization + # metric for selecting high-quality training PSMs + if (!"pep_expect" %in% svm_feats) + svm_feats <- c("pep_expect", svm_feats) + + if (!"pep_score" %in% svm_feats) + svm_feats <- c("pep_score", svm_feats) + + if (!"pep_expect" %in% names(df)) + df <- dplyr::mutate(df, pep_expect = 10^((pep_score_co - pep_score)/10) * target_fdr) + + if (!"pep_prob" %in% names(df)) + df <- dplyr::mutate(df, pep_prob = 10^(-pep_score/fct_score)) + + # note: `df` being altered + if ("pep_exp_z" %in% names(df)) + df[["pep_exp_z"]] <- as.integer(factor(df[["pep_exp_z"]])) + + td <- prep_pepfdr_td(df, + out_path = out_path, + enzyme = enzyme, + nes_fdr_group = nes_fdr_group, + fdr_group = fdr_group) + td <- keep_pepfdr_best(td, cols = c("pep_scan_num", "raw_file")) + td[["y."]] <- as.factor(td[["pep_issig"]]) + + oks <- svm_feats %in% names(df) + + if (!all(oks)) { + warning("SVM features not found: ", svm_feats[!oks]) + svm_feats <- svm_feats[oks] + } + + oks <- unlist(lapply(df[svm_feats], is.numeric)) + + if (!all(oks)) { + warning("Non-numeric features excluded: ", svm_feats[!oks]) + svm_feats <- svm_feats[oks] + } + + for (pf in svm_feats) + td[[paste0(pf, ".")]] <- td[[pf]] + + rm(list = c("pf", "oks")) + + svm_feats <- paste0(svm_feats, ".") + + if ("pep_expect." %in% svm_feats) + td[["pep_expect."]] <- -log10(td[["pep_expect."]]) + + if (FALSE) { + if ("pep_z_expect." 
%in% svm_feats) + td[["pep_z_expect."]] <- -log10(td[["pep_z_expect."]]) + } + + nas <- lapply(svm_feats, function (x) is.na(td[[x]])) + nas <- Reduce(`|`, nas) + td0 <- td[nas, ] + td1 <- td[!nas, ] + + rows <- td1[["pep_isdecoy"]] + ta <- td1[!rows, c("y.", svm_feats), drop = FALSE] + de <- td1[ rows, c("y.", svm_feats), drop = FALSE] + rm(list = c("rows")) + + ## (2) train and test for each of target and decoy sets + oks <- ta[["pep_expect."]] >= median(ta[["pep_expect."]], na.rm = TRUE) + ttrain <- ta[oks, ] + ttest <- ta[!oks, ] + + rows <- sample(c(TRUE, FALSE), nrow(de), replace = TRUE) + dtrain <- de[rows, ] + dtest <- de[!rows, ] + train <- dplyr::bind_rows(ttrain, dtrain) + rm(list = c("dtrain", "rows", "oks")) + + if (cross_valid) { + mses <- vector("numeric", length(costs)) + + folds <- creat_folds(ta[["y."]], k = k) + tests <- trains <- vector("list", k) + + for (i in seq_len(k)) { + tests[[i]] <- ta[folds[[i]], ] + trains[[i]] <- ta[-folds[[i]], ] + } + + n_cores <- min(mzion:::detect_cores(16L), k) + cl <- parallel::makeCluster(getOption("cl.cores", n_cores)) + cvs <- parallel::clusterMap(cl, cv_svm, trains, tests, + MoreArgs = list(costs = costs), + SIMPLIFY = FALSE, USE.NAMES = FALSE) + parallel::stopCluster(cl) + errs <- lapply(cvs, `[[`, "err") + + for (i in seq_along(costs)) + mses[[i]] <- mean(unlist(lapply(errs, `[[`, i))) + + best_co <- costs[which.min(mses)] + } + else + best_co <- def_cost + + message("Regularization cost: ", best_co) + + fit <- tryCatch( + e1071::svm(y. ~ ., data = train, kernel = svm_kernel, + cost = best_co, ...), + error = function (e) NULL) + + if (is.null(fit)) + return(prob_cos) + + pred <- tryCatch( + as.logical(predict(fit, td1[, svm_feats])), + error = function (e) NULL) + + if (is.null(pred)) + return(prob_cos) + + ### + # tdr_bare - target-decoy rate (TDR) based on one feature (pep_len) + # tdr_svm - TDR based on multiple features; initially tdr_svm < tdr_bare -> + # ^target_fdr -> ^tdr_svm ...
tdr_svm == tdr_bare + ### + + tdr_bare <- sum(td1[td1$pep_isdecoy, "pep_issig"])/sum(td1[!td1$pep_isdecoy, "pep_issig"]) + td1[, "pep_issig"] <- as.logical(pred) + tdr_svm <- sum(td1[td1$pep_isdecoy, "pep_issig"])/sum(td1[!td1$pep_isdecoy, "pep_issig"]) + delta <- tdr_svm - tdr_bare + rm(list = c("tdr_svm")) + + # if ((delta > 0) || (abs(delta) <= svm_tol)) return(prob_cos) + + + # --- iteration + fdr0 <- target_fdr + fdr1 <- target_fdr * 5 + fdrm <- (fdr0 + fdr1)/2 + step <- fdr1 - fdr0 + + while((svm_iters > 0L) && (abs(delta) > svm_tol)) { + prob_cos <- calc_pepfdr(target_fdr = fdrm, + fdr_type = fdr_type, + min_len = min_len, + max_len = max_len, + max_pepscores_co = max_pepscores_co, + min_pepscores_co = min_pepscores_co, + enzyme = enzyme, + fdr_group = fdr_group, + nes_fdr_group = nes_fdr_group, + out_path = out_path) + df <- post_pepfdr(prob_cos, out_path) # also updates pepfdr.rds + + if (!"pep_delta" %in% names(df)) + df$pep_delta <- df$pep_exp_mr - df$pep_calc_mr + + if (!"pep_expect" %in% names(df)) + df <- dplyr::mutate(df, pep_expect = 10^((pep_score_co - pep_score)/10) * fdrm) + + if (!"pep_prob" %in% names(df)) + df <- dplyr::mutate(df, pep_prob = 10^(-pep_score/fct_score)) + + td <- prep_pepfdr_td(df, + out_path = out_path, + enzyme = enzyme, + nes_fdr_group = nes_fdr_group, + fdr_group = fdr_group) + td <- keep_pepfdr_best(td, cols = c("pep_scan_num", "raw_file")) + td[["y."]] <- as.factor(td[["pep_issig"]]) + + svm_feats <- gsub("\\.", "", svm_feats) + + for (pf in svm_feats) + td[[paste0(pf, ".")]] <- td[[pf]] + + rm(list = c("pf")) + svm_feats <- paste0(svm_feats, ".") + + if ("pep_expect." 
%in% svm_feats) + td[["pep_expect."]] <- -log10(td[["pep_expect."]]) + + nas <- lapply(svm_feats, function (x) is.na(td[[x]])) + nas <- Reduce(`|`, nas) + + if (length(nas)) { + td0 <- td[nas, ] + td1 <- td[!nas, ] + } + else { + td0 <- NULL + td1 <- td + } + + rows <- td1[["pep_isdecoy"]] + ta <- td1[!rows, c("y.", svm_feats), drop = FALSE] + de <- td1[ rows, c("y.", svm_feats), drop = FALSE] + rm(list = c("rows")) + + ## (2) train and test for each of target and decoy sets + oks <- ta[["pep_expect."]] >= median(ta[["pep_expect."]], na.rm = TRUE) + ttrain <- ta[oks, ] + ttest <- ta[!oks, ] + + rows <- sample(c(TRUE, FALSE), nrow(de), replace = TRUE) + dtrain <- de[rows, ] + dtest <- de[!rows, ] + train <- dplyr::bind_rows(ttrain, dtrain) + rm(list = c("dtrain", "rows", "oks")) + + fit <- tryCatch(e1071::svm(y. ~ ., data = train, kernel = svm_kernel, + cost = best_co, ...), + error = function (e) NULL) + + if (is.null(fit)) + return(if (all(prob_cos[, 2] >= prob_cos0[, 2])) prob_cos else prob_cos0) + + pred <- tryCatch( + as.logical(predict(fit, td1[, svm_feats])), + error = function (e) NULL) + + if (is.null(pred)) + return(if (all(prob_cos[, 2] >= prob_cos0[, 2])) prob_cos else prob_cos0) + + td1[, "pep_issig"] <- as.logical(pred) + tdr_svm <- sum(td1[td1$pep_isdecoy, "pep_issig"])/sum(td1[!td1$pep_isdecoy, "pep_issig"]) + delta <- tdr_svm - tdr_bare + + if (delta < 0) { + # next grid + fdr0 <- fdrm + fdrm <- (fdr0 + fdr1)/2 + } + else { + # left-half grid + fdr1 <- fdrm + fdrm <- (fdr0 + fdr1)/2 + step <- fdrm - fdr0 + } + + message("Range of adjusted FDR: ", fdr0, " : ", fdr1) + svm_iters <- svm_iters - 1L + } + + if (all(prob_cos[, 2] >= prob_cos0[, 2])) prob_cos else prob_cos0 +} + + diff --git a/R/quant2.R b/R/quant2.R index a10277c..e5628ec 100644 --- a/R/quant2.R +++ b/R/quant2.R @@ -105,7 +105,7 @@ add_rptrs <- function (df = NULL, quant = "none", out_path = NULL) reporters <- dplyr::bind_rows(reporters) df <- df %>% - tidyr::unite(uniq_id, raw_file,
pep_mod_group, scan_num, sep = ".", + tidyr::unite(uniq_id, raw_file, pep_mod_group, pep_scan_num, sep = ".", remove = FALSE) %>% dplyr::left_join(reporters, by = "uniq_id") %>% dplyr::select(-uniq_id) @@ -396,8 +396,7 @@ add_protacc <- function (df = NULL, out_path = NULL, .path_cache = NULL, #' Helper of annotating decoy peptides. #' #' @param df A data frame. -#' @param prps_fwd The look-ups of forward protein and peptides. -#' @param prps_rev The look-ups of reversed protein and peptides. +#' @param prps The look-ups of protein and peptides. hannot_decoys <- function (df, prps) { # keep prot_acc be the first column diff --git a/R/roadmaps.R b/R/roadmaps.R index f029fb1..2b16015 100644 --- a/R/roadmaps.R +++ b/R/roadmaps.R @@ -128,6 +128,7 @@ # gen_ms2ions_base (for specific pep_seq) # ms2ions_by_type (ion_ladder.R) # byions, czions, axions (ion_ladder.R) +# calc_rev_ms2 (utils_engine.R) # search_mgf # find_ms2_bypep2 # fuzzy_match_one @@ -143,8 +144,10 @@ # gen_ms2ions_a0_vnl0_fnl1 # // early return # gen_ms2ions_base (ms2base.R) +# calc_rev_ms2 (utils_engine.R) # ms2ions_by_type (ion_ladder.R) # byions, czions, axions (ion_ladder.R) +# calc_rev_ms2 (utils_engine.R) # // regular return # ms2ions_by_type (ion_ladder.R) # byions, czions, axions (ion_ladder.R) @@ -175,6 +178,7 @@ # "sim_combn" (vmod_ms2_labels.R) # "match_aas_indexes" (vmods_ms2_labels.R) # "check_ms1_mass_vmods2" (ms2_a1_vnl0_fnl0.R) +# "calc_rev_ms2" (utils_engine.R) # "calc_ms2ions_a1_vnl0_fnl0" (ms2_a1_vnl0_fnl0.R) # "ms2ions_by_type" (ion_ladder.R) # "byions", "czions", "axions" @@ -212,6 +216,7 @@ # "match_aas_indexes" (vmods_ms2_labels.R) # "check_ms1_mass_vmods2" (ms2_a1_vnl0_fnl0.R) # "expand_grid_rows" (utils_engine.R) +# "calc_rev_ms2" (utils_engine.R) # "calc_ms2ions_a1_vnl1_fnl0" # "ms2ions_by_type" (ion_ladder.R) # "byions", "czions", "axions" @@ -243,6 +248,7 @@ # - "sim_combn" (vmod_ms2_labels.R) # - match_aas_indexes # check_ms1_mass_vmods2 (ms2_a1_vnl0_fnl0.R) +# calc_rev_ms2 
(utils_engine.R) # calc_ms2ions_a1_vnl0_fnl1 # ms2ions_by_type (ion_ladder.R) # byions, czions, axions diff --git a/R/scores.R b/R/scores.R index e75a6cd..774c9b4 100644 --- a/R/scores.R +++ b/R/scores.R @@ -259,15 +259,15 @@ calc_probi_byvmods <- function (df, nms, expt_moverzs, expt_ints, N, type_ms2ions = "by", topn_ms2ions = 100L, ppm_ms2 = 20L, soft_secions = FALSE, burn_ins = c(1:2), min_ms2mass = 115L, - d2 = 1E-5, index_mgf_ms2 = FALSE, digits = 4L) + d2 = 1E-5, index_mgf_ms2 = FALSE, + tally_ms2ints = TRUE, digits = 4L) { df_theo <- df$theo m <- length(df_theo) ## df2 - tt2 <- add_seions(df_theo, type_ms2ions = type_ms2ions, digits = digits) - df2 <- match_ex2th2(expt_moverzs, tt2, min_ms2mass, d2, index_mgf_ms2) - + tt2 <- add_seions(df_theo, type_ms2ions = type_ms2ions, digits = digits) + df2 <- match_ex2th2(expt_moverzs, tt2, min_ms2mass, d2, index_mgf_ms2) ith2 <- df2[["ith"]] iex2 <- df2[["iex"]] @@ -303,16 +303,18 @@ calc_probi_byvmods <- function (df, nms, expt_moverzs, expt_ints, y[["theo"]][iex] <- df_theo[ith] y[["idx"]][iex] <- ith - ## 3. join `int2` to `y` - y_idx <- y[["idx"]] - ok_iex <- .Internal(which(!is.na(y_idx))) - y_ith <- y_idx[ok_iex] - y[["int2"]][ok_iex] <- int2[y_ith] - - ## 4. collapses `int2` to `int` - y[["int"]] <- y[["int"]] %+% y[["int2"]] - y[["idx"]] <- y[["int2"]] <- NULL - + # if (tally_ms2ints) { + ## 3. join `int2` to `y` + y_idx <- y[["idx"]] + ok_iex <- .Internal(which(!is.na(y_idx))) + y_ith <- y_idx[ok_iex] + y[["int2"]][ok_iex] <- int2[y_ith] + + ## 4. 
collapses `int2` to `int` + y[["int"]] <- y[["int"]] %+% y[["int2"]] + y[["idx"]] <- y[["int2"]] <- NULL + # } + ### if (soft_secions) { ok_int2 <- .Internal(which(int2 > 0L & is.na(df[["int"]]))) @@ -321,17 +323,19 @@ calc_probi_byvmods <- function (df, nms, expt_moverzs, expt_ints, ok_iex2 <- iex2[match(ok_int2, ith2 %% m)] ok_iex2 <- ifelse(is.na(ok_iex2), m, ok_iex2) y[["int"]][ok_iex2] <- int2[ok_int2] - y[["theo"]][ok_iex2] <- df_theo[ok_int2] + # need to adjust the theoreticals, e.g., (df_theo[ok_int2] + 1.0078)/2; + # for simplicity use expt_moverzs, as long as they are not NA + y[["theo"]][ok_iex2] <- expt_moverzs[ok_iex2] } } ### ## 5. arrange by "-int" ord_int <- order(y[["int"]], decreasing = TRUE, method = "radix", na.last = TRUE) - y_theo <- y[["theo"]][ord_int] - maxi <- .Internal(which(!is.na(y_theo))) - maxi <- maxi[length(maxi)] - y_theo <- y_theo[1:maxi] + y_theo <- y[["theo"]][ord_int] + maxi <- .Internal(which(!is.na(y_theo))) + maxi <- maxi[length(maxi)] + y_theo <- y_theo[1:maxi] ## 6. mutate(k = row_number(), x = k - cumsum(is.na(theo))) k <- 1:maxi @@ -340,8 +344,8 @@ calc_probi_byvmods <- function (df, nms, expt_moverzs, expt_ints, ## 7. filter(!is.na(theo)) # note: x <= k <= x + n ok_y <- !is.na(y_theo) - k <- k[ok_y] - x <- x[ok_y] + k <- k[ok_y] + x <- x[ok_y] ## 8. 
Probability # (to have sufficient counts of noise) @@ -357,14 +361,15 @@ calc_probi_byvmods <- function (df, nms, expt_moverzs, expt_ints, if (length(x_)) { prs <- stats::dhyper(x = x_, m = m, n = N, k = k_) - pr <- min(prs, na.rm = TRUE) + pr <- min(prs, na.rm = TRUE) } - else + else { pr <- .5 - + } + ## outputs list(pep_ivmod = nms, - pep_prob = pr, + pep_prob = pr, pri_matches = list(df), sec_matches = list(df2)) } @@ -383,10 +388,11 @@ calc_probi_bypep <- function (mts, nms, expt_moverzs, expt_ints, N, type_ms2ions = "by", topn_ms2ions = 100L, ppm_ms2 = 20L, soft_secions = FALSE, min_ms2mass = 115L, d2 = 1E-5, - index_mgf_ms2 = FALSE, digits = 4L) + index_mgf_ms2 = FALSE, tally_ms2ints = TRUE, + digits = 4L) { ## for different positions: $TNLAMMR$`0000500`, $TNLAMMR$`0000050` - # the same `pep_seq`, `theo_ms1` for different mod positions + # the same `pep_seq`, `pep_calc_mr` for different mod positions # different `pep_ivmod`, `pep_prob`, `pri_matches`, `sec_matches` # NAMES are hexcodes: 0000000 @@ -404,6 +410,7 @@ calc_probi_bypep <- function (mts, nms, expt_moverzs, expt_ints, min_ms2mass = min_ms2mass, d2 = d2, index_mgf_ms2 = index_mgf_ms2, + tally_ms2ints = tally_ms2ints, digits = digits ), SIMPLIFY = FALSE, @@ -446,7 +453,7 @@ calc_probi <- function (mts, expt_moverzs, expt_ints, N, type_ms2ions = "by", topn_ms2ions = 100L, ppm_ms2 = 20L, soft_secions = FALSE, min_ms2mass = 115L, d2 = 1E-5, index_mgf_ms2 = FALSE, - digits = 4L) + tally_ms2ints = TRUE, digits = 4L) { out <- mapply( calc_probi_bypep, @@ -462,6 +469,7 @@ calc_probi <- function (mts, expt_moverzs, expt_ints, min_ms2mass = min_ms2mass, d2 = d2, index_mgf_ms2 = index_mgf_ms2, + tally_ms2ints = tally_ms2ints, digits = digits ), SIMPLIFY = FALSE, @@ -483,11 +491,11 @@ calc_probi <- function (mts, expt_moverzs, expt_ints, scalc_pepprobs <- function (entry, topn_ms2ions = 100L, type_ms2ions = "by", ppm_ms2 = 20L, soft_secions = FALSE, min_ms2mass = 115L, d2 = 1E-5, index_mgf_ms2 = FALSE, - digits 
= 4L) + tally_ms2ints = TRUE, digits = 4L) { # only one experimental set of values and thus `[[1]]` - expt_moverzs <- entry$ms2_moverz[[1]] - expt_ints <- entry[["ms2_int"]][[1]] + expt_moverzs <- entry[["pep_ms2_moverzs"]][[1]] + expt_ints <- entry[["pep_ms2_ints"]][[1]] ## matches between theoreticals and experimentals @@ -520,9 +528,7 @@ scalc_pepprobs <- function (entry, topn_ms2ions = 100L, type_ms2ions = "by", # (flattens by one level as is a list-column) mts <- entry[["matches"]][[1]] - - # N <- entry$ms2_n[[1]] - topn_ms2ions <- min(topn_ms2ions, entry$ms2_n[[1]]) + topn_ms2ions <- min(topn_ms2ions, entry$pep_n_ms2[[1]]) N <- min(topn_ms2ions * 5L, 500L) out <- calc_probi(mts = mts, @@ -536,6 +542,7 @@ scalc_pepprobs <- function (entry, topn_ms2ions = 100L, type_ms2ions = "by", min_ms2mass = min_ms2mass, d2 = d2, index_mgf_ms2 = index_mgf_ms2, + tally_ms2ints = tally_ms2ints, digits = digits) uniq_id <- .Internal(unlist(entry$uniq_id, recursive = FALSE, use.names = FALSE)) @@ -559,7 +566,8 @@ calc_pepprobs_i <- function (df, topn_ms2ions = 100L, type_ms2ions = "by", ppm_ms2 = 20L, soft_secions = FALSE, out_path = "~/mzion/outs", min_ms2mass = 115L, d2 = 1E-5, - index_mgf_ms2 = FALSE, digits = 4L) + index_mgf_ms2 = FALSE, tally_ms2ints = TRUE, + digits = 4L) { n_rows <- nrow(df) @@ -574,6 +582,7 @@ calc_pepprobs_i <- function (df, topn_ms2ions = 100L, type_ms2ions = "by", min_ms2mass = min_ms2mass, d2 = d2, index_mgf_ms2 = index_mgf_ms2, + tally_ms2ints = tally_ms2ints, digits = digits) df <- .Internal(unlist(df, recursive = FALSE, use.names = FALSE)) @@ -586,7 +595,7 @@ calc_pepprobs_i <- function (df, topn_ms2ions = 100L, type_ms2ions = "by", pep_prob = as.numeric(), pri_matches = list(), sec_matches = list(), - scan_num = as.integer()) + pep_scan_num = as.integer()) } invisible(df) @@ -594,7 +603,8 @@ calc_pepprobs_i <- function (df, topn_ms2ions = 100L, type_ms2ions = "by", #' Calculates the scores of peptides. 
-#' +#' +#' @param tally_ms2ints Logical; tally MS2 intensities or not. #' @inheritParams matchMS #' @import parallel calc_pepscores <- function (topn_ms2ions = 100L, type_ms2ions = "by", @@ -607,6 +617,7 @@ calc_pepscores <- function (topn_ms2ions = 100L, type_ms2ions = "by", soft_secions = FALSE, out_path = "~/mzion/outs", min_ms2mass = 115L, index_mgf_ms2 = FALSE, + tally_ms2ints = TRUE, mgf_path, maxn_vmods_per_pep = 5L, maxn_sites_per_vmod = 3L, maxn_vmods_sitescombi_per_pep = 64L, minn_ms2 = 6L, @@ -693,6 +704,7 @@ calc_pepscores <- function (topn_ms2ions = 100L, type_ms2ions = "by", min_ms2mass = min_ms2mass, d2 = d2, index_mgf_ms2 = index_mgf_ms2, + tally_ms2ints = tally_ms2ints, add_ms2theos = add_ms2theos, add_ms2theos2 = add_ms2theos2, add_ms2moverzs = add_ms2moverzs, @@ -755,7 +767,8 @@ find_targets <- function (out_path, pattern = "^ion_matches_") #' @inheritParams calc_pepscores calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", ppm_ms2 = 20L, soft_secions = FALSE, out_path = NULL, - min_ms2mass = 115L, d2 = 1E-5, index_mgf_ms2 = FALSE, + min_ms2mass = 115L, d2 = 1E-5, index_mgf_ms2 = FALSE, + tally_ms2ints = TRUE, add_ms2theos = FALSE, add_ms2theos2 = FALSE, add_ms2moverzs = FALSE, add_ms2ints = FALSE, n_cores = 16L, digits = 4L) @@ -766,14 +779,14 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", file_lt <- file.path(out_path, "temp", paste0("list_table_", idx, ".rds")) file_sc <- file.path(out_path, "temp", paste0("pepscores_", idx, ".rds")) - cols_a <- c("scan_num", "raw_file") - cols_b <- c("ms2_moverz", "ms2_int", "pri_matches", "sec_matches") + cols_a <- c("pep_scan_num", "raw_file") + cols_b <- c("pep_ms2_moverzs", "pep_ms2_ints", "pri_matches", "sec_matches") cols_lt <- c(cols_a, cols_b) - cols_sc <- c("pep_seq", "ms2_n", "scan_title", "ms1_moverz", "ms1_mass", - "ms1_int", "ms1_charge", "ret_time", "scan_num", "raw_file", - "pep_mod_group", "frame", "pep_fmod", "pep_vmod", "pep_isdecoy", - 
"theo_ms1", "pep_ivmod", "pep_prob", "pep_len", + cols_sc <- c("pep_seq", "pep_n_ms2", "pep_scan_title", "pep_exp_mz", "pep_exp_mr", + "pep_tot_int", "pep_exp_z", "pep_ret_range", "pep_scan_num", "raw_file", + "pep_mod_group", "pep_frame", "pep_fmod", "pep_vmod", "pep_isdecoy", + "pep_calc_mr", "pep_ivmod", "pep_prob", "pep_len", "pep_ms2_moverzs", "pep_ms2_ints", "pep_ms2_theos", "pep_ms2_theos2", "pep_ms2_exptints", "pep_ms2_exptints2", @@ -784,9 +797,23 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", # for localization scores "pep_ms2_ideltas.") + df <- qs::qread(file.path(out_path, "temp", file)) n_rows <- nrow(df) + df <- dplyr::rename(df, + pep_ret_range = ret_time, + pep_scan_title = scan_title, + pep_exp_mz = ms1_moverz, + pep_n_ms2 = ms2_n, + pep_exp_mr = ms1_mass, + pep_tot_int = ms1_int, + pep_scan_num = scan_num, + pep_exp_z = ms1_charge, + pep_ms2_moverzs = ms2_moverz, + pep_ms2_ints = ms2_int, + pep_frame = frame) + if (!n_rows) { dfa <- data.frame(matrix(ncol = length(cols_lt), nrow = 0L)) colnames(dfa) <- cols_lt @@ -801,8 +828,8 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", tempdir <- create_dir(file.path(out_path, "sc_temp")) - df[["uniq_id"]] <- paste(df[["scan_num"]], df[["raw_file"]], sep = "@") - esscols <- c("ms2_moverz", "ms2_int", "matches", "ms2_n", "uniq_id") + df[["uniq_id"]] <- paste(df[["pep_scan_num"]], df[["raw_file"]], sep = "@") + esscols <- c("pep_ms2_moverzs", "pep_ms2_ints", "matches", "pep_n_ms2", "uniq_id") path_df2 <- file.path(tempdir, "df2_sc_temp.rda") df2 <- df[, -which(names(df) %in% esscols), drop = FALSE] qs::qsave(df2, path_df2, preset = "fast") @@ -825,6 +852,7 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", min_ms2mass = min_ms2mass, d2 = d2, index_mgf_ms2 = index_mgf_ms2, + tally_ms2ints = tally_ms2ints, digits = digits) } else { @@ -837,7 +865,7 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", 
"calc_probi_byvmods", "add_seions", "find_ppm_outer_bycombi", "match_ex2th2", "add_primatches"), - envir = environment(mzion:::scalc_pepprobs)) + envir = environment(mzion::matchMS)) if (n_rows > max_rows) { dfs <- suppressWarnings(chunksplit(df, ceiling(n_rows/max_rows), "row")) @@ -869,6 +897,7 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", min_ms2mass = min_ms2mass, d2 = d2, index_mgf_ms2 = index_mgf_ms2, + tally_ms2ints = tally_ms2ints, digits = digits) probs[[i]] <- dplyr::bind_rows(probs[[i]]) } @@ -896,6 +925,7 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", min_ms2mass = min_ms2mass, d2 = d2, index_mgf_ms2 = index_mgf_ms2, + tally_ms2ints = tally_ms2ints, digits = digits) } else { @@ -909,6 +939,7 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", min_ms2mass = min_ms2mass, d2 = d2, index_mgf_ms2 = index_mgf_ms2, + tally_ms2ints = tally_ms2ints, digits = digits) parallel::stopCluster(cl) @@ -956,7 +987,7 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", else { cl <- parallel::makeCluster(getOption("cl.cores", n_cores)) parallel::clusterExport(cl, list("add_primatches"), - envir = environment(mzion:::add_primatches)) + envir = environment(mzion::matchMS)) max_theos <- 500000L @@ -1011,6 +1042,7 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", message("\tCompleted theoretical MS2 m/z and intensity values: ", Sys.time()) df[["pep_isdecoy"]] <- ifelse(is.na(df[["pep_ivmod"]]), TRUE, FALSE) + df <- dplyr::rename(df, pep_calc_mr = theo_ms1) if (!all(cols_sc %in% names(df))) stop("Developer needs to update the columns of peptide scores.") @@ -1026,6 +1058,9 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", #' Adds sequences of primary and secondary matches. #' +#' Applied to both targets and decoys as feature "pep_ms2_deltas_mean" may be +#' used in SVM-Percolator. +#' #' @param df A data frame. 
#' @inheritParams matchMS add_primatches <- function (df, add_ms2theos = FALSE, add_ms2theos2 = FALSE, @@ -1089,19 +1124,19 @@ add_primatches <- function (df, add_ms2theos = FALSE, add_ms2theos2 = FALSE, d1s[[i]] <- .Internal(paste0(list(ds1), collapse = ";", recycle0 = FALSE)) d2s[[i]] <- .Internal(paste0(list(ds2), collapse = ";", recycle0 = FALSE)) - p1s[[i]] <- .Internal(paste0(list(ps1), collapse = ";", recycle0 = FALSE)) - p2s[[i]] <- .Internal(paste0(list(ps2), collapse = ";", recycle0 = FALSE)) + p1s[[i]] <- .Internal(paste0(list(ps1), collapse = ";", recycle0 = FALSE)) + p2s[[i]] <- .Internal(paste0(list(ps2), collapse = ";", recycle0 = FALSE)) iys1[[i]] <- .Internal(paste0(list(iy1[ps1]), collapse = ";", recycle0 = FALSE)) iys2[[i]] <- .Internal(paste0(list(iy2[ps2]), collapse = ";", recycle0 = FALSE)) me1s[[i]] <- me1 sd1s[[i]] <- sd1 - m1s[[i]] <- mt1$m - m2s[[i]] <- mt2$m + m1s[[i]] <- mt1$m + m2s[[i]] <- mt2$m p1s.[[i]] <- ps1 - if (i %% 5000L == 0L) gc() + # if (i %% 5000L == 0L) gc() } if (index_mgf_ms2) { @@ -1157,15 +1192,15 @@ collapse_vecs <- function (vecs, nm = "theo", sep = ";") #' @param df A results after pep_scores. 
post_pepscores <- function (df) { - df[["scan_num"]] <- as.character(df[["scan_num"]]) + df[["pep_scan_num"]] <- as.character(df[["pep_scan_num"]]) df[["pep_len"]] <- stringr::str_length(df[["pep_seq"]]) - df[["scan_title"]] <- as.character(df[["scan_title"]]) - df[["ms1_moverz"]] <- as.numeric(df[["ms1_moverz"]]) - df[["ms1_mass"]] <- as.numeric(df[["ms1_mass"]]) - df[["ms1_int"]] <- as.numeric(df[["ms1_int"]]) - df[["ms1_charge"]] <- as.character(df[["ms1_charge"]]) - df[["ret_time"]] <- as.integer(df[["ret_time"]]) - df[["ms2_n"]] <- as.integer(df[["ms2_n"]]) + df[["pep_scan_title"]] <- as.character(df[["pep_scan_title"]]) + df[["pep_exp_mz"]] <- as.numeric(df[["pep_exp_mz"]]) + df[["pep_exp_mr"]] <- as.numeric(df[["pep_exp_mr"]]) + df[["pep_tot_int"]] <- as.numeric(df[["pep_tot_int"]]) + df[["pep_exp_z"]] <- as.character(df[["pep_exp_z"]]) + df[["pep_ret_range"]] <- as.integer(df[["pep_ret_range"]]) + df[["pep_n_ms2"]] <- as.integer(df[["pep_n_ms2"]]) df[["pep_fmod"]] <- as.character(df[["pep_fmod"]]) df[["pep_vmod"]] <- as.character(df[["pep_vmod"]]) @@ -1182,29 +1217,32 @@ post_pepscores <- function (df) find_pepscore_co1 <- function (td, target_fdr = 0.01) { target <- dplyr::filter(td, !pep_isdecoy) - decoy <- dplyr::filter(td, pep_isdecoy) + decoy <- dplyr::filter(td, pep_isdecoy) nt <- nrow(target) nd <- nrow(decoy) - if (nd <= 5L) return(NA) + if (nd <= 5L) + return(NA_real_) n <- nt + nd - lambt <- nt/(n) + lambt <- nt / n lambd <- 1 - lambt vecd <- log2(decoy$pep_score) sigmad <- sd(vecd, na.rm = TRUE) mud <- mean(vecd, na.rm = TRUE) - if (is.na(sigmad)) return(NA) + if (is.na(sigmad)) + return(NA_real_) xs <- seq(mud + sigmad, mud + 3*sigmad, 0.014355293) for (i in seq_along(xs)) { y <- (1 - pnorm(xs[i], mud, sigmad)) * nd / n - if (y <= target_fdr) break + if (y <= target_fdr) + break } 2^(xs[i]) @@ -1220,29 +1258,32 @@ find_pepscore_co1 <- function (td, target_fdr = 0.01) find_pepscore_co2 <- function (td, target_fdr = 0.01) { target <-
dplyr::filter(td, !pep_isdecoy) - decoy <- dplyr::filter(td, pep_isdecoy) + decoy <- dplyr::filter(td, pep_isdecoy) nt <- nrow(target) nd <- nrow(decoy) - if (nd <= 5L) return(NA) + if (nd <= 5L) + return(NA_real_) n <- nt + nd - lambt <- nt/(n) + lambt <- nt / n lambd <- 1 - lambt vecd <- decoy$pep_score sigmad <- sd(vecd, na.rm = TRUE) mud <- mean(vecd, na.rm = TRUE) - if (is.na(sigmad)) return(NA) + if (is.na(sigmad)) + return(NA_real_) xs <- seq( mud + 4*sigmad, mud + sigmad, -.1) for (i in seq_along(xs)) { y <- (1 - plnorm(xs[i], mud, sigmad, lower.tail = FALSE)) * nd / n - if (y >= target_fdr) break + if (y >= target_fdr) + break } xs[i] @@ -1261,40 +1302,13 @@ probco_bypeplen <- function (len, td, fdr_type = "protein", target_fdr = 0.01, min_pepscores_co = 0, out_path) { td <- dplyr::filter(td, pep_len == len) + td <- sub_td_byfdrtype(td, fdr_type) - if (fdr_type %in% c("peptide", "protein")) { - if (fdr_type == "protein") { - td <- dplyr::arrange(td, pep_seq, pep_prob) - td <- dplyr::group_by(td, pep_seq) - td <- dplyr::filter(td, row_number() == 1L) - td <- dplyr::ungroup(td) - } - else { - # td[["pep_ivmod2"]] <- gsub(" [\\(\\[]\\d+[\\)\\[]$", "", td[["pep_ivmod"]]) - td[["pep_ivmod2"]] <- gsub(" .*", "", td[["pep_ivmod"]]) - td[["pep_seq_mod"]] <- ifelse(is.na(td[["pep_ivmod2"]]), td[["pep_seq"]], - paste0(td[["pep_seq"]], ".", td[["pep_ivmod2"]])) - td <- dplyr::arrange(td, pep_seq_mod, pep_prob) - td <- dplyr::group_by(td, pep_seq_mod) - td <- dplyr::filter(td, row_number() == 1L) - td <- dplyr::ungroup(td) - td[["pep_ivmod2"]] <- NULL - td[["pep_seq_mod"]] <- NULL - } - } - - td <- dplyr::select(td, pep_prob, pep_isdecoy) - td <- dplyr::arrange(td, pep_prob) - td <- dplyr::mutate(td, total = row_number()) - td <- dplyr::mutate(td, decoy = cumsum(pep_isdecoy)) - td <- dplyr::mutate(td, fdr = decoy/total) - td <- dplyr::mutate(td, pep_score = -log10(pep_prob) * 10) - count <- nrow(td) if (count < (1 / target_fdr)) { - if (count <= 10L) # changed from 
20L - return(NA) + if (count <= 10L) + return(NA_real_) best_co <- tryCatch( (find_pepscore_co1(td, target_fdr) + find_pepscore_co2(td, target_fdr))/2, @@ -1426,13 +1440,12 @@ probco_bypeplen <- function (len, td, fdr_type = "protein", target_fdr = 0.01, rm(list = c("df", "fit")) prob_co <- 10^(-best_co/10) - } + } else { best_co <- tryCatch( (find_pepscore_co1(td, target_fdr) + find_pepscore_co2(td, target_fdr))/2, - error = function(e) NA - ) - + error = function(e) NA) + prob_co <- 10^(-best_co/10) } @@ -1442,6 +1455,42 @@ probco_bypeplen <- function (len, td, fdr_type = "protein", target_fdr = 0.01, } +#' Subsets targets and decoys by fdr_type. +#' +#' @param td A data frame of targets and decoys. +#' @inheritParams matchMS +sub_td_byfdrtype <- function (td, fdr_type) +{ + if (fdr_type %in% c("peptide", "protein")) { + if (fdr_type == "protein") { + td <- dplyr::arrange(td, pep_seq, pep_prob) + td <- dplyr::group_by(td, pep_seq) + td <- dplyr::filter(td, row_number() == 1L) + td <- dplyr::ungroup(td) + } + else { + # td[["pep_ivmod2"]] <- gsub(" [\\(\\[]\\d+[\\)\\[]$", "", td[["pep_ivmod"]]) + td[["pep_ivmod2"]] <- gsub(" .*", "", td[["pep_ivmod"]]) + td[["pep_seq_mod"]] <- ifelse(is.na(td[["pep_ivmod2"]]), td[["pep_seq"]], + paste0(td[["pep_seq"]], ".", td[["pep_ivmod2"]])) + td <- dplyr::arrange(td, pep_seq_mod, pep_prob) + td <- dplyr::group_by(td, pep_seq_mod) + td <- dplyr::filter(td, row_number() == 1L) + td <- dplyr::ungroup(td) + td[["pep_ivmod2"]] <- NULL + td[["pep_seq_mod"]] <- NULL + } + } + + td <- dplyr::select(td, pep_prob, pep_isdecoy) + td <- dplyr::arrange(td, pep_prob) + td <- dplyr::mutate(td, total = row_number()) + td <- dplyr::mutate(td, decoy = cumsum(pep_isdecoy)) + td <- dplyr::mutate(td, fdr = decoy/total) + td <- dplyr::mutate(td, pep_score = -log10(pep_prob) * 10) +} + + #' Find the suitable pep_len values for the fitting of probability cut-offs. 
#' #' Recursively decrease the value of \code{min_count} by half until some indexes @@ -1492,54 +1541,31 @@ find_probco_valley <- function (prob_cos, guess = 12L) } -#' Calculates the cut-off score at a peptide FDR. -#' -#' Needs \code{min_len} and \code{max_len} since the target-decoy pair may not -#' cover all \code{pep_len} values. -#' -#' @param target_fdr Numeric; the levels of false-discovery rate (FDR). -#' @param fdr_type Character string; the type of FDR for controlling. -#' @inheritParams matchMS -#' @examples -#' \donttest{ -#' library(mzion) -#' -#' if (FALSE) { -#' prob_cos <- calc_pepfdr(target_fdr = .01, -#' fdr_type = "protein", -#' min_len = 7L, -#' max_len = 50L, -#' out_path = "~/mzion/bi_1") -#' } +#' Prepares target-decoy data. #' -#' } -calc_pepfdr <- function (target_fdr = .01, fdr_type = "protein", - min_len = 7L, max_len = 40L, - max_pepscores_co = 50, min_pepscores_co = 0, - enzyme = "trypsin_p", - fdr_group = "base", - nes_fdr_group = "base", - out_path) +#' @param td A data frame of targets and decoys (for Percolator). +#' @inheritParams matchMS +prep_pepfdr_td <- function (td = NULL, out_path, enzyme = "trypsin_p", + nes_fdr_group = "base", fdr_group = "base") { - message("Calculating peptide FDR.") - - fct_score <- 10 - files <- list.files(path = file.path(out_path, "temp"), pattern = "^pepscores_", full.names = TRUE) + if (!length(files)) + stop("Score results not found.", call. = FALSE) + top3s <- gsub(paste0("^.*pepscores_", "(\\d+)\\.rds$"), "\\1", files[which_topx2(file.size(files), 3L)[1:3]]) - + top3s <- top3s[!is.na(top3s)] + max_i <- gsub(paste0("^.*pepscores_", "(\\d+)\\.rds$"), "\\1", files[which.max(file.size(files))[[1]]]) - if (!length(files)) - stop("Score results not found.", call. 
= FALSE) - - td <- lapply(files, qs::qread) - td <- td[lapply(td, nrow) > 0L] # otherwise, error with bind_rows - td <- dplyr::bind_rows(td) + if (is.null(td)) { + td <- lapply(files, qs::qread) + td <- td[lapply(td, nrow) > 0L] # otherwise, error with bind_rows + td <- dplyr::bind_rows(td) + } enzyme <- tolower(enzyme) is_nes <- enzyme == "noenzyme" || grepl("^semi", enzyme) @@ -1584,20 +1610,22 @@ calc_pepfdr <- function (target_fdr = .01, fdr_type = "protein", if (!nrow(td)) stop("Found nothing: empty targets and decoys.") - if (!sum(td[["pep_isdecoy"]])) { - warning("No decoys found.") - seqs <- min_len:max(td[["pep_len"]], na.rm = TRUE) - prob_cos <- rep(.5, length(seqs)) - - return(data.frame(pep_len = seqs, pep_prob_co = prob_cos)) - } - + td +} + + +#' Keeps the best entries of targets and decoys. +#' +#' @param td A data frame of targets and decoys. +#' @param cols Columns for grouping. +keep_pepfdr_best <- function (td, cols = c("pep_scan_num", "raw_file")) +{ # keeps separated best hits for targets and decoys # two lists of "TRUE" and "FALSE" td <- split(td, td[["pep_isdecoy"]]) td <- lapply(td, function (x) { - x <- dplyr::group_by(x, scan_num, raw_file) + x <- dplyr::group_by_at(x, cols) x <- dplyr::arrange(x, pep_prob) x <- dplyr::filter(x, row_number() == 1L) x <- dplyr::ungroup(x) @@ -1607,19 +1635,74 @@ calc_pepfdr <- function (target_fdr = .01, fdr_type = "protein", ok <- is.na(fastmatch::fmatch(reverse_seqs(td[["FALSE"]][["pep_seq"]]), td[["TRUE"]][["pep_seq"]])) td[["FALSE"]] <- td[["FALSE"]][ok, ] - + # keeps the best hit for each `scan_num` td <- if (nrow(td[["FALSE"]])) dplyr::bind_rows(td[c("TRUE", "FALSE")]) else td[["TRUE"]] - td <- dplyr::group_by(td, scan_num, raw_file) + td <- dplyr::group_by_at(td, cols) td <- dplyr::arrange(td, pep_prob) td <- dplyr::filter(td, row_number() == 1L) td <- dplyr::ungroup(td) - gc() +} + + +#' Calculates the cut-off score at a peptide FDR. 
+#' +#' Needs \code{min_len} and \code{max_len} since the target-decoy pair may not +#' cover all \code{pep_len} values. +#' +#' @param target_fdr Numeric; the levels of false-discovery rate (FDR). +#' @param fdr_type Character string; the type of FDR for controlling. +#' @param fct_score A trivial factor converting p-values to scores. +#' @inheritParams matchMS +#' @examples +#' \donttest{ +#' library(mzion) +#' +#' if (FALSE) { +#' prob_cos <- calc_pepfdr(target_fdr = .01, +#' fdr_type = "protein", +#' min_len = 7L, +#' max_len = 50L, +#' out_path = "~/mzion/bi_1") +#' } +#' +#' } +calc_pepfdr <- function (target_fdr = .01, fdr_type = "protein", + min_len = 7L, max_len = 40L, + max_pepscores_co = 50, min_pepscores_co = 0, + enzyme = "trypsin_p", + fdr_group = "base", + nes_fdr_group = "base", + fct_score = 10, + out_path) +{ + message("Calculating peptide FDR.") + td <- prep_pepfdr_td(out_path = out_path, + enzyme = enzyme, + nes_fdr_group = nes_fdr_group, + fdr_group = fdr_group) + + # back-compatibility to new column keys (e.g. scan_num -> pep_scan_num) + if (!"pep_scan_num" %in% names(td)) + stop("Seems like reprocessing of results from an earlier version.\n", + " Please delete old \"temp\\pep_score[...].rds\".") + + if (!sum(td[["pep_isdecoy"]])) { + warning("No decoys found.") + seqs <- min_len:max(td[["pep_len"]], na.rm = TRUE) + prob_cos <- rep(.5, length(seqs)) + + return(data.frame(pep_len = seqs, pep_prob_co = prob_cos)) + } + + td <- keep_pepfdr_best(td) + qs::qsave(td, file.path(out_path, "temp", "td_pepfdr.rds"), preset = "fast") + # --- all_lens <- sort(unique(td$pep_len)) @@ -2530,7 +2613,7 @@ calc_peploc <- function (x = NULL, out_path = NULL, mod_indexes = NULL, # For simplicity `pep_seq` uses interchangeably with `uniq_id` and # `pep_seq_mod` with `uniq_id2` where everything is on top of the same - # pep_isdecoy, scan_num, raw_file. + # pep_isdecoy, pep_scan_num, raw_file. 
# # uniq_id --- can differentiate the same `pep_seq` at different mod locations & NL # uniq_id2 --- can differentiate the same `pep_seq_mod` at different NLs @@ -2546,7 +2629,7 @@ calc_peploc <- function (x = NULL, out_path = NULL, mod_indexes = NULL, message("\tRank peptides by neutral losses.") x[, pep_isdecoy := as.integer(pep_isdecoy)] - x[, uniq_id := paste(pep_isdecoy, scan_num, raw_file, pep_seq, sep = ".")] + x[, uniq_id := paste(pep_isdecoy, pep_scan_num, raw_file, pep_seq, sep = ".")] x[, "pep_ivmod2" := gsub(" [\\(\\[]\\d+[\\)\\[]$", "", pep_ivmod)] x[, uniq_id2 := paste(uniq_id, pep_ivmod2, sep = ".")] @@ -2746,7 +2829,7 @@ calc_peploc <- function (x = NULL, out_path = NULL, mod_indexes = NULL, message("\tSubset peptide sequences by query: \"topn_seqs_per_query <= ", topn_seqs_per_query, "\".") - x0[, uniq_id3 := paste(pep_isdecoy, scan_num, raw_file, sep = ".")] + x0[, uniq_id3 := paste(pep_isdecoy, pep_scan_num, raw_file, sep = ".")] if (para) { x0 <- x0[order(x0[["uniq_id3"]]), ] diff --git a/R/silac.R b/R/silac.R index ab7498e..a3ab67b 100644 --- a/R/silac.R +++ b/R/silac.R @@ -377,8 +377,7 @@ matchMS_noenzyme <- function (this_call = NULL, min_len = 7L, max_len = 40L, file.copy(file.path(out_paths[[1]], "Calls"), out_path, recursive = TRUE) combine_ion_matches(out_path, out_paths, type = "ion_matches_") combine_ion_matches(out_path, out_paths, type = "reporters_") - combine_ion_matches(out_path, out_paths, type = "ion_matches_rev_") - + this_call$bypass_noenzyme <- TRUE this_call$bypass_pepmasses <- TRUE this_call$bypass_bin_ms1 <- TRUE @@ -440,19 +439,16 @@ combine_ion_matches <- function (out_path, out_paths, type = "ion_matches_") out_path_temp <- create_dir(file.path(out_path, "temp")) out_paths_temp <- lapply(out_paths, function(x) file.path(x, "temp")) - pat <- paste0(type, "[0-9]+\\.rds$") + pat <- paste0(type, "[0-9]+\\.rds$") pat2 <- paste0(type, "([0-9]+)\\.rds$") files_mts <- local({ xs <- list.files(out_paths_temp[[1]], pattern = pat) - 
if (length(xs)) { - idxes <- sort(as.integer(gsub(pat2, "\\1", xs))) - files <- paste0(type, idxes, ".rds") - } - else { - files <- NULL - } + files <- if (length(xs)) + paste0(type, sort(as.integer(gsub(pat2, "\\1", xs))), ".rds") + else + NULL }) len_mts <- length(files_mts) @@ -515,8 +511,7 @@ comine_PSMsubs <- function (sub_paths, groups, out_path) file.copy(file.path(sub_paths[[1]], "Calls"), out_path, recursive = TRUE) combine_ion_matches(out_path, sub_paths, type = "ion_matches_") suppressWarnings(combine_ion_matches(out_path, sub_paths, type = "reporters_")) - combine_ion_matches(out_path, sub_paths, type = "ion_matches_rev_") - + invisible(NULL) } diff --git a/R/unimods.R b/R/unimods.R index 074cefa..7a518f3 100644 --- a/R/unimods.R +++ b/R/unimods.R @@ -315,6 +315,7 @@ hfind_unimod <- function (xml_files = c("master.xml", "custom.xml"), unimod) #' @export table_unimods <- function (out_nm = "~/mzion/unimods.txt") { + dir.create("~/mzion") files <- c("master.xml", "custom.xml") lapply(files, htable_unimods) %>% diff --git a/R/utils_engine.R b/R/utils_engine.R index 136e0bd..1bfa999 100644 --- a/R/utils_engine.R +++ b/R/utils_engine.R @@ -681,7 +681,7 @@ expand_grid_rows <- function (..., use.names = TRUE) #' #' @param vec A named vector. #' @examples -#' \donttest{ +#' \dontrun{ #' library(mzion) #' library(microbenchmark) #' @@ -708,7 +708,7 @@ count_elements <- function (vec) #' #' @param x A named character vector. #' @examples -#' \donttest{ +#' \dontrun{ #' library(mzion) #' library(microbenchmark) #' @@ -734,7 +734,7 @@ vec_to_list <- function (x) #' @param vec A vector. #' #' @examples -#' \donttest{ +#' \dontrun{ #' ## M #' library(mzion) #' library(microbenchmark) @@ -988,9 +988,9 @@ flatten_list <- function (data, use_names = TRUE) #' Calculates the reversed MS2 from the forward #' #' @param af An sequence of answer of the forward. -#' @param l The number amino acid residues in a peptide. #' @param aas The sequence of amino acid residues. 
-calc_rev_ms2 <- function (af, aas) { +calc_rev_ms2 <- function (af, aas) +{ l <- length(aas) l1 <- l - 1L l2 <- l - 2L diff --git a/R/vmod_ms2_labels.R b/R/vmod_ms2_labels.R index 023b6d8..6757b12 100644 --- a/R/vmod_ms2_labels.R +++ b/R/vmod_ms2_labels.R @@ -528,7 +528,7 @@ find_ms2resids <- function (M, vec) #' @param labs A vector of labels. #' #' @examples -#' \donttest{ +#' \dontrun{ #' library(gtools) #' library(mzion) #' library(dplyr) @@ -611,7 +611,7 @@ find_perm_sets <- function (labs = c("A", "A", "A", "B", "B", "C")) #' @param x A new label not in M for permutation. #' #' @examples -#' \donttest{ +#' \dontrun{ #' library(mzion) #' library(gtools) #' library(dplyr) diff --git a/R/zzz.R b/R/zzz.R index 55829a8..8d09e49 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -2,12 +2,12 @@ .onAttach <- function(libname, pkgname) { packageStartupMessage("Welcome to mzion.\n\n", "============================================================================================\n", - # "NEW features (v1.1.9.0):\n", + # "NEW features (v1.2.4):\n", # "[x] Incompatible with cached results from previous versions.\n\n", "[x] For examples, enter \"?matchMS\".\n", - # "[x] Optimized under R.4.1.3 (tested with R.4.2).\n", + # "[x] Please delete cached \"\temp\pep_score.rds\" for reprocessing wither older versions.\n", - # "[x] Updated SILAC utility for custom chemistry.\n", + # "[x] Added Percolator utility.\n", # "[x] See also package `proteoQ` for downstream data QA and informatics.\n", # "\n", diff --git a/data-raw/annotProteinAccession.R b/data-raw/annotProteinAccession.R index ffccaa7..5bd2a4b 100644 --- a/data-raw/annotProteinAccession.R +++ b/data-raw/annotProteinAccession.R @@ -9,7 +9,7 @@ foo_combine_codes <- function (filepath = file.path("~/Github/mzion/R")) ans <- lapply(file.path(filepath, filenames), readLines) ans <- purrr::reduce(ans, `c`, init = NULL) - writeLines(ans, file.path(filepath, "temp/all - mzion.R")) + writeLines(ans, file.path(filepath, "temp/all_mzion.R")) } 
diff --git a/inst/extdata/precursor_masses.Rmd b/inst/extdata/precursor_masses.Rmd new file mode 100644 index 0000000..4c5f21d --- /dev/null +++ b/inst/extdata/precursor_masses.Rmd @@ -0,0 +1,20 @@ +--- +title: "Methods" +output: html_document +date: "2023-04-11" +--- + +### Specifications of fixed and variable modifications + +The Unimod [@creasydavidm2004] definitions of positions and sites were adopted by Mzion for specifying fixed and variable modifications. The value of a position is in one of "Anywhere", "Protein N-term", "Protein C-term", "Any N-term" or "Any C-term". The last two position labels can be shorthanded as "N-term" and "C-term". A site is a one-letter representation of the twenty amino-acid residues, as well as the terminal sites of "N-term" and "C-term". The general format in specifying a fixed or variable modification is `title (position = site)` where title is a unique character string without space. At a position of "Anywhere", the modification can be shorthanded as `title (site)`, for example, `TMT10plex (K)`. For a terminal modification at any site, it can be abbreviated as `title (position)`, for example, `Acetyl (Protein N-term)` and `TMT10plex (N-term)`. There are circumstances in which both position and site are needed for specifying a modification, for instance, `Gln->pyro-Glu (N-term = Q)`. More examples are available in the help document of the Mzion utility `parse_unimod`. + +*Precursor masses*. Protein entries in FASTA databases were digested *in silico* into peptide sequences according to a full enzymatic specificity, for example tryptic cleavages at C-terminal K or R, at no missed cleavage. Masses of the bare sequences are first obtained by summing over the masses of amino-acid residues therein (temporarily ignoring the masses at sites "N-term", "C-term" and the mass deltas by "Anywhere" variable modifications).
Rolling sums are then applied to obtain the masses of longer sequences by concatenating adjacent pieces of peptide sequences according to the number of allowed missed cleavages, followed by the capping of "N-term" and "C-term" masses. The full sequences are then filtered by the range of lengths specified in the search. Analogous rolling sums are employed to obtain the masses of peptide sequences over a range of lengths for searches at NES. For searches with semi-enzyme specificity, the semi-enzyme sequences and their masses are generated by sequential trimmings of residues from the N- or the C-terminals of full sequences. Next the sequences are dispatched according to the attributes of amino-acid look-ups (Supplementary Fig. 12). For instance, when against a look-up with an attribute of "Oxidation (M)", only sequences with residue M are retained. + +The base module of the look-ups corresponds to the combination of fixed and variable modifications that were specified originally in a search. Subjected to fixed-to-variable coercions at sites "N-term", "C-term" and "Anywhere", the masses in the base module are compensated accordingly and concluded. The masses of various variable terminals are then capped onto the sequences for the remaining modules (there is only one N- and C-terminal in a peptide sequence and simple summation makes no difference whether the terminal sites are coerced or not). Note that coerced "Anywhere" modifications are reverted to fixed modifications when possible (*Compilation of fixed and variable modifications*). The precursor masses are thus compensated by the back-coerced "Anywhere" masses accordingly. Finally, combinatorial variable masses of sequences, including neutral losses, are computed to form the space of precursor masses by modules. + +The modularized precursor masses are further binned according to the tolerance in mass error. The bin width is half of the size of the tolerance.
For instance, at a default precursor mass tolerance of 20 ppm, the theoretical masses are binned at every 10 ppm. Later when matching tandem spectra, +/-20 ppm is the maximum mass error (the maximum distance between two adjacent bins) that can be found between experimentals and theoreticals. Experimental spectra are binned accordingly. Namely, the minimum precursor mass that has been specified in a search is used as the common starting point in the binning. This ensures a one-to-one correspondence of bin indexes between experimentals and theoreticals. Finally, the binned theoretical and experimental precursor masses in the form of neutral masses are stored on disk (and can be reused in a new search at matched search parameters). The decoy sequences are constructed by reversing target peptide sequences with the fixation of both the N- and the C-terminal residues (e.g. PEPTIDE to PDITPEE). + +*Tandem spectra matches*. Experimental tandem spectra are matched to theoreticals by modules of fixed and variable modifications. Under each module, the experimental and theoretical spaces are first subset mutually by bins of overlapping precursor masses. Tandem spectra are then generated for the theoretical precursors and the searches traverse through the bins. To avoid combinatorial explosion, the permutation of variable modifications and their sites is adapted from the recursive version in the R package gtools, using "for" loops instead. + +When matching the primary MS2 species (e.g., *b* and *y* ions), both experimental and theoretical MS2 values, $x$, are converted to indexes $\left\lceil \textbf{ln}(x/x_0)/\textbf{ln}(1+d) \right\rceil$ where $x_0$ is the minimum MS2 *m*/*z* supplied in a search and $d$ is a bin width, for example 10 ppm. The experimental and theoretical values with mass difference smaller than $d$ can be off by +/-1 in bin indexes during the numeric-to-integer conversions. Additional matches with a 1-increment or 1-decrement of theoretical indexes are carried out.
The one-to-one correspondence between experimental and theoretical *m*/*z* values, as well as experimental MS2 intensities, are necessarily maintained. These allow later enrichment analyses of peptide scores based on the number of matches at descending MS2 ion intensity. Disparity between experimental and theoretical values can occasionally occur due to the numeric-to-integer conversions (for example, multiple experimental indexes match to the same theoretical index). In rare events of this kind, computationally more expensive outer products are performed for matches between the original experimental and theoretical *m*/*z* values. + diff --git a/man/add_aamasses_motifs.Rd b/man/add_aamasses_motifs.Rd index 318bfb5..8f5791d 100644 --- a/man/add_aamasses_motifs.Rd +++ b/man/add_aamasses_motifs.Rd @@ -11,14 +11,12 @@ add_aamasses_motifs(aa_masses, mod_motifs, positions_sites) acid residues.} \item{mod_motifs}{The motifs to restrict \code{Anywhere} variable - modification.
For example, provided the \code{Anywhere} variable +modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and +\code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = +c("NM", "MP"))}, +variable modifications will only be considered at sites that satisfy the +motifs.} \item{positions_sites}{Named list of positions (in names) and sites (in values).} diff --git a/man/add_fixed_masses.Rd b/man/add_fixed_masses.Rd index 6f4853a..e82a24d 100644 --- a/man/add_fixed_masses.Rd +++ b/man/add_fixed_masses.Rd @@ -13,14 +13,12 @@ add_fixed_masses(mods, aa_masses, mod_motifs = NULL) acid residues.} \item{mod_motifs}{The motifs to restrict \code{Anywhere} variable - modification. For example, provided the \code{Anywhere} variable - modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and - - \code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = - c("NM", "MP"))} - - variable modifications will only be considered at sites that satisfy the - motifs.} +modification. For example, provided the \code{Anywhere} variable +modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and +\code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = +c("NM", "MP"))}, +variable modifications will only be considered at sites that satisfy the +motifs.} } \description{ Adds fixed masses diff --git a/man/add_one_permlab.Rd b/man/add_one_permlab.Rd index 60149b8..5404be5 100644 --- a/man/add_one_permlab.Rd +++ b/man/add_one_permlab.Rd @@ -15,7 +15,7 @@ add_one_permlab(M, x) Helper in making permuation table. 
} \examples{ -\donttest{ +\dontrun{ library(mzion) library(gtools) library(dplyr) diff --git a/man/add_primatches.Rd b/man/add_primatches.Rd index 2e0b9ff..5ef8655 100644 --- a/man/add_primatches.Rd +++ b/man/add_primatches.Rd @@ -38,10 +38,11 @@ add_primatches( \item{add_ms2ints}{Logical; if TRUE, adds the sequence of experimental MS2 intensity values (\code{pep_ms2_ints}).} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the @@ -52,5 +53,6 @@ intensity values (\code{pep_ms2_ints}).} pep_ms2_deltas_sd} are nullified in the outputs.} } \description{ -Adds sequences of primary and secondary matches. +Applied to both targets and decoys as feature "pep_ms2_deltas_mean" may be +used in SVM-Percolator. } diff --git a/man/add_raw_ids.Rd b/man/add_raw_ids.Rd index 4948ef2..b394cb5 100644 --- a/man/add_raw_ids.Rd +++ b/man/add_raw_ids.Rd @@ -17,10 +17,7 @@ add_raw_ids(df, mgf_path) Proteome Discoverer or (3) Bruker's DataAnalysis. With MSConvert, the default \code{titleMaker} is required for correct - parsing (don't think it can be altered by users, but just in case). 
- - Individuality in MGF files are slightly preferred to take advantage of - parallel reading of the files.} + parsing (don't think it can be altered by users, but just in case).} } \description{ Currently only used with psmC.txt during matchMS. An inverse function of diff --git a/man/add_term_mass.Rd b/man/add_term_mass.Rd index ef1e5e6..5aa1b83 100644 --- a/man/add_term_mass.Rd +++ b/man/add_term_mass.Rd @@ -13,8 +13,8 @@ add_term_mass(peps, aa_masses, min_mass = 200L, max_mass = 4500L) acid residues.} \item{min_mass}{A positive integer; the minimum precursor mass for -interrogation. The default is an arbitrarily low value. The primary guard -against low molecular-weight precursors is \code{min_len}.} +interrogation. The default is an arbitrarily low value (the primary guard +against low molecular-weight precursors is \code{min_len}).} \item{max_mass}{A positive integer; the maximum precursor mass for interrogation.} diff --git a/man/add_var_masses.Rd b/man/add_var_masses.Rd index b088280..2276ca5 100644 --- a/man/add_var_masses.Rd +++ b/man/add_var_masses.Rd @@ -25,14 +25,12 @@ of SILAC for details. Can be but not typically used in standard alone searches of labeled residues.} \item{mod_motifs}{The motifs to restrict \code{Anywhere} variable - modification. For example, provided the \code{Anywhere} variable - modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and - - \code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = - c("NM", "MP"))} - - variable modifications will only be considered at sites that satisfy the - motifs.} +modification. 
For example, provided the \code{Anywhere} variable +modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and +\code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = +c("NM", "MP"))}, +variable modifications will only be considered at sites that satisfy the +motifs.} \item{anywhere_coerce_sites}{The sites of coerced Anywhere modifications; for example, \code{K}.} diff --git a/man/adj_base_masses.Rd b/man/adj_base_masses.Rd index 112429f..1cdb822 100644 --- a/man/adj_base_masses.Rd +++ b/man/adj_base_masses.Rd @@ -23,8 +23,8 @@ fixed to variable modification.} of fixed to variable modification.} \item{min_mass}{A positive integer; the minimum precursor mass for -interrogation. The default is an arbitrarily low value. The primary guard -against low molecular-weight precursors is \code{min_len}.} +interrogation. The default is an arbitrarily low value (the primary guard +against low molecular-weight precursors is \code{min_len}).} \item{max_mass}{A positive integer; the maximum precursor mass for interrogation.} diff --git a/man/calc_aamasses.Rd b/man/calc_aamasses.Rd index d431b6c..b5f1100 100644 --- a/man/calc_aamasses.Rd +++ b/man/calc_aamasses.Rd @@ -28,14 +28,12 @@ of SILAC for details. Can be but not typically used in standard alone searches of labeled residues.} \item{mod_motifs}{The motifs to restrict \code{Anywhere} variable - modification. For example, provided the \code{Anywhere} variable - modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and - - \code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = - c("NM", "MP"))} - - variable modifications will only be considered at sites that satisfy the - motifs.} +modification. 
For example, provided the \code{Anywhere} variable +modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and +\code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = +c("NM", "MP"))}, +variable modifications will only be considered at sites that satisfy the +motifs.} \item{maxn_vmods_setscombi}{Integer; the maximum number of combinatorial variable modifications and neutral losses.} diff --git a/man/calc_monopep.Rd b/man/calc_monopep.Rd index 8dcf7ff..14a5e7b 100644 --- a/man/calc_monopep.Rd +++ b/man/calc_monopep.Rd @@ -28,8 +28,8 @@ peptide.} variable modifications per site in a per peptide sequence.} \item{min_mass}{A positive integer; the minimum precursor mass for -interrogation. The default is an arbitrarily low value. The primary guard -against low molecular-weight precursors is \code{min_len}.} +interrogation. The default is an arbitrarily low value (the primary guard +against low molecular-weight precursors is \code{min_len}).} \item{max_mass}{A positive integer; the maximum precursor mass for interrogation.} diff --git a/man/calc_monopeptide.Rd b/man/calc_monopeptide.Rd index e119ef5..7cadcc3 100644 --- a/man/calc_monopeptide.Rd +++ b/man/calc_monopeptide.Rd @@ -35,8 +35,8 @@ peptide.} variable modifications per site in a per peptide sequence.} \item{min_mass}{A positive integer; the minimum precursor mass for -interrogation. The default is an arbitrarily low value. The primary guard -against low molecular-weight precursors is \code{min_len}.} +interrogation. 
The default is an arbitrarily low value (the primary guard +against low molecular-weight precursors is \code{min_len}).} \item{max_mass}{A positive integer; the maximum precursor mass for interrogation.} diff --git a/man/calc_pepfdr.Rd b/man/calc_pepfdr.Rd index 021512e..cc1f1e8 100644 --- a/man/calc_pepfdr.Rd +++ b/man/calc_pepfdr.Rd @@ -14,6 +14,7 @@ calc_pepfdr( enzyme = "trypsin_p", fdr_group = "base", nes_fdr_group = "base", + fct_score = 10, out_path ) } @@ -26,7 +27,7 @@ calc_pepfdr( for considerations. Shorter peptides will be excluded. The default is 7.} \item{max_len}{A positive integer; the maximum length of peptide sequences -for considerations. Longer peptides will be excluded.} +for considerations. Longer peptides will be excluded. The default is 40.} \item{max_pepscores_co}{A positive numeric; the upper limit in the cut-offs of peptide scores for discriminating significant and insignificant @@ -41,7 +42,7 @@ enzyme will be used to generate peptide sequences from protein entries. The default is \code{Trypsin_P}. See also parameter \code{custom_enzyme}.} \item{fdr_group}{A character string; the modification group(s) for uses in -peptide FDR controls. The value is in one of c("all", "base"). The +peptide FDR controls. The value is in one of \code{c("all", "base")}. The \code{base} corresponds to the modification group with the largest number of matches.} @@ -56,6 +57,8 @@ will be used at \code{"all_cterm_nontryptic"}. The same applied to difference of only peptides from the \code{base} group being used. 
See also parameter \code{fdr_group}.} +\item{fct_score}{A trivial factor converting p-values to scores.} + \item{out_path}{A file path of outputs.} } \description{ diff --git a/man/calc_peploc.Rd b/man/calc_peploc.Rd index 5be57a9..a90cfd3 100644 --- a/man/calc_peploc.Rd +++ b/man/calc_peploc.Rd @@ -22,12 +22,15 @@ calc_peploc( modifications.} \item{locmods}{Among \code{varmods} for the consideration of localization -probabilities; for instance, \code{locmods = NULL} for nothing, -\code{locmods = c("Phospho (S)", "Phospho (T)", "Phospho (Y)")} for -phosphopeptides, \code{locmods = "Acetyl (K)"} for lysine acetylation. -\code{fixedmods} that were coerced to \code{varmods} will be added -automatically to \code{locmods}. For convenience, the default is set to -look for applicable peptide phosphorylation.} + probabilities; for instance, \code{locmods = NULL} for nothing, + \code{locmods = c("Phospho (S)", "Phospho (T)", "Phospho (Y)")} for + phosphopeptides, \code{locmods = "Acetyl (K)"} for lysine acetylation. + \code{fixedmods} that were coerced to \code{varmods} will be added + automatically to \code{locmods}. + + For convenience, the default is set to look for applicable peptide + phosphorylation (and may encounter warning messages if the data type is + different to the default).} \item{topn_mods_per_seq}{Positive integer; a threshold to discard variable modifications under the same peptide match with scores beyond the top-n. diff --git a/man/calc_pepmasses2.Rd b/man/calc_pepmasses2.Rd index a661c1d..d7a029b 100644 --- a/man/calc_pepmasses2.Rd +++ b/man/calc_pepmasses2.Rd @@ -68,14 +68,12 @@ of SILAC for details. Can be but not typically used in standard alone searches of labeled residues.} \item{mod_motifs}{The motifs to restrict \code{Anywhere} variable - modification. 
For example, provided the \code{Anywhere} variable - modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and - - \code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = - c("NM", "MP"))} - - variable modifications will only be considered at sites that satisfy the - motifs.} +modification. For example, provided the \code{Anywhere} variable +modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and +\code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = +c("NM", "MP"))}, +variable modifications will only be considered at sites that satisfy the +motifs.} \item{enzyme}{A character string; the proteolytic specificity of the assumed enzyme will be used to generate peptide sequences from proteins. The enzyme @@ -129,8 +127,8 @@ be excluded.} \item{max_miss}{The maximum number of mis-cleavages per peptide sequence.} \item{min_mass}{A positive integer; the minimum precursor mass for -interrogation. The default is an arbitrarily low value. The primary guard -against low molecular-weight precursors is \code{min_len}.} +interrogation. The default is an arbitrarily low value (the primary guard +against low molecular-weight precursors is \code{min_len}).} \item{max_mass}{A positive integer; the maximum precursor mass for interrogation.} diff --git a/man/calc_pepprobs_i.Rd b/man/calc_pepprobs_i.Rd index 87d9600..8df03a9 100644 --- a/man/calc_pepprobs_i.Rd +++ b/man/calc_pepprobs_i.Rd @@ -14,6 +14,7 @@ calc_pepprobs_i( min_ms2mass = 115L, d2 = 1e-05, index_mgf_ms2 = FALSE, + tally_ms2ints = TRUE, digits = 4L ) } @@ -31,10 +32,11 @@ and y-ions.} \item{ppm_ms2}{A positive integer; the mass tolerance of MS2 species. The default is 20.} -\item{soft_secions}{Depreciated. Logical; if TRUE, collapses the intensities -of secondary ions to primary ions at the absence of the primaries. The -default is FALSE. 
For instance, the signal of \code{b5^*} will be ignored -if its primary ion \code{b5} is not matched.} +\item{soft_secions}{Impacts on search performance not yet assessed. Logical; +if TRUE, collapses the intensities of secondary ions to primary ions even +when the primaries are absent. The default is FALSE. For instance, the +signal of \code{b5^*} will be ignored if its primary ion \code{b5} is not +matched.} \item{out_path}{A file path of outputs.} @@ -43,10 +45,11 @@ interrogation. The default is 110.} \item{d2}{Bin width in ppm divided by 1E6.} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the @@ -56,6 +59,8 @@ interrogation. The default is 110.} the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, pep_ms2_deltas_sd} are nullified in the outputs.} +\item{tally_ms2ints}{Logical; tally MS2 intensities or not.} + \item{digits}{A non-negative integer; the number of decimal places to be used. 
The default is 4.} } diff --git a/man/calc_pepscores.Rd b/man/calc_pepscores.Rd index a43b735..0ef7f87 100644 --- a/man/calc_pepscores.Rd +++ b/man/calc_pepscores.Rd @@ -16,6 +16,7 @@ calc_pepscores( out_path = "~/mzion/outs", min_ms2mass = 115L, index_mgf_ms2 = FALSE, + tally_ms2ints = TRUE, mgf_path, maxn_vmods_per_pep = 5L, maxn_sites_per_vmod = 3L, @@ -62,25 +63,27 @@ levels of PSM, peptide or protein. The default is 0.01. See also argument for considerations. Shorter peptides will be excluded. The default is 7.} \item{max_len}{A positive integer; the maximum length of peptide sequences -for considerations. Longer peptides will be excluded.} +for considerations. Longer peptides will be excluded. The default is 40.} \item{ppm_ms2}{A positive integer; the mass tolerance of MS2 species. The default is 20.} -\item{soft_secions}{Depreciated. Logical; if TRUE, collapses the intensities -of secondary ions to primary ions at the absence of the primaries. The -default is FALSE. For instance, the signal of \code{b5^*} will be ignored -if its primary ion \code{b5} is not matched.} +\item{soft_secions}{Impacts on search performance not yet assessed. Logical; +if TRUE, collapses the intensities of secondary ions to primary ions even +when the primaries are absent. The default is FALSE. For instance, the +signal of \code{b5^*} will be ignored if its primary ion \code{b5} is not +matched.} \item{out_path}{A file path of outputs.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation. The default is 110.} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +\item{index_mgf_ms2}{A low-priority feature. 
Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the @@ -90,6 +93,8 @@ interrogation. The default is 110.} the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, pep_ms2_deltas_sd} are nullified in the outputs.} +\item{tally_ms2ints}{Logical; tally MS2 intensities or not.} + \item{mgf_path}{A file path to a list of MGF files. The experimenter needs to supply the files. @@ -98,10 +103,7 @@ interrogation. The default is 110.} Proteome Discoverer or (3) Bruker's DataAnalysis. With MSConvert, the default \code{titleMaker} is required for correct - parsing (don't think it can be altered by users, but just in case). - - Individuality in MGF files are slightly preferred to take advantage of - parallel reading of the files.} + parsing (don't think it can be altered by users, but just in case).} \item{maxn_vmods_per_pep}{A non-negative integer; the maximum number of \code{Anywhere} (non-terminal) variable modifications per peptide. The diff --git a/man/calc_probi.Rd b/man/calc_probi.Rd index ed32e48..d7ec1e8 100644 --- a/man/calc_probi.Rd +++ b/man/calc_probi.Rd @@ -16,6 +16,7 @@ calc_probi( min_ms2mass = 115L, d2 = 1e-05, index_mgf_ms2 = FALSE, + tally_ms2ints = TRUE, digits = 4L ) } @@ -39,20 +40,22 @@ searches.} \item{ppm_ms2}{A positive integer; the mass tolerance of MS2 species. The default is 20.} -\item{soft_secions}{Depreciated. Logical; if TRUE, collapses the intensities -of secondary ions to primary ions at the absence of the primaries. The -default is FALSE. 
For instance, the signal of \code{b5^*} will be ignored -if its primary ion \code{b5} is not matched.} +\item{soft_secions}{Impacts on search performance not yet assessed. Logical; +if TRUE, collapses the intensities of secondary ions to primary ions even +when the primaries are absent. The default is FALSE. For instance, the +signal of \code{b5^*} will be ignored if its primary ion \code{b5} is not +matched.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation. The default is 110.} \item{d2}{Bin width in ppm divided by 1E6.} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the @@ -62,6 +65,8 @@ interrogation. The default is 110.} the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, pep_ms2_deltas_sd} are nullified in the outputs.} +\item{tally_ms2ints}{Logical; tally MS2 intensities or not.} + \item{digits}{A non-negative integer; the number of decimal places to be used. 
The default is 4.} } diff --git a/man/calc_probi_bypep.Rd b/man/calc_probi_bypep.Rd index ba2ed56..c0ed2ea 100644 --- a/man/calc_probi_bypep.Rd +++ b/man/calc_probi_bypep.Rd @@ -17,6 +17,7 @@ calc_probi_bypep( min_ms2mass = 115L, d2 = 1e-05, index_mgf_ms2 = FALSE, + tally_ms2ints = TRUE, digits = 4L ) } @@ -42,20 +43,22 @@ searches.} \item{ppm_ms2}{A positive integer; the mass tolerance of MS2 species. The default is 20.} -\item{soft_secions}{Depreciated. Logical; if TRUE, collapses the intensities -of secondary ions to primary ions at the absence of the primaries. The -default is FALSE. For instance, the signal of \code{b5^*} will be ignored -if its primary ion \code{b5} is not matched.} +\item{soft_secions}{Impacts on search performance not yet assessed. Logical; +if TRUE, collapses the intensities of secondary ions to primary ions even +when the primaries are absent. The default is FALSE. For instance, the +signal of \code{b5^*} will be ignored if its primary ion \code{b5} is not +matched.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation. The default is 110.} \item{d2}{Bin width in ppm divided by 1E6.} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the @@ -65,6 +68,8 @@ interrogation. 
The default is 110.} the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, pep_ms2_deltas_sd} are nullified in the outputs.} +\item{tally_ms2ints}{Logical; tally MS2 intensities or not.} + \item{digits}{A non-negative integer; the number of decimal places to be used. The default is 4.} } diff --git a/man/calc_probi_byvmods.Rd b/man/calc_probi_byvmods.Rd index 2af15ed..96044f7 100644 --- a/man/calc_probi_byvmods.Rd +++ b/man/calc_probi_byvmods.Rd @@ -18,6 +18,7 @@ calc_probi_byvmods( min_ms2mass = 115L, d2 = 1e-05, index_mgf_ms2 = FALSE, + tally_ms2ints = TRUE, digits = 4L ) } @@ -44,10 +45,11 @@ searches.} \item{ppm_ms2}{A positive integer; the mass tolerance of MS2 species. The default is 20.} -\item{soft_secions}{Depreciated. Logical; if TRUE, collapses the intensities -of secondary ions to primary ions at the absence of the primaries. The -default is FALSE. For instance, the signal of \code{b5^*} will be ignored -if its primary ion \code{b5} is not matched.} +\item{soft_secions}{Impacts on search performance not yet assessed. Logical; +if TRUE, collapses the intensities of secondary ions to primary ions even +when the primaries are absent. The default is FALSE. For instance, the +signal of \code{b5^*} will be ignored if its primary ion \code{b5} is not +matched.} \item{burn_ins}{The range of burn-ins where inputs will be excluded from probablity assessments.} @@ -57,10 +59,11 @@ interrogation. The default is 110.} \item{d2}{Bin width in ppm divided by 1E6.} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. 
The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the @@ -70,6 +73,8 @@ interrogation. The default is 110.} the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, pep_ms2_deltas_sd} are nullified in the outputs.} +\item{tally_ms2ints}{Logical; tally MS2 intensities or not.} + \item{digits}{A non-negative integer; the number of decimal places to be used. The default is 4.} } diff --git a/man/calc_rev_ms2.Rd b/man/calc_rev_ms2.Rd index 05cce6c..733afc1 100644 --- a/man/calc_rev_ms2.Rd +++ b/man/calc_rev_ms2.Rd @@ -10,8 +10,6 @@ calc_rev_ms2(af, aas) \item{af}{An sequence of answer of the forward.} \item{aas}{The sequence of amino acid residues.} - -\item{l}{The number amino acid residues in a peptide.} } \description{ Calculates the reversed MS2 from the forward diff --git a/man/calc_semipepmasses.Rd b/man/calc_semipepmasses.Rd index bdd7edd..bd955cd 100644 --- a/man/calc_semipepmasses.Rd +++ b/man/calc_semipepmasses.Rd @@ -22,7 +22,7 @@ calc_semipepmasses( for considerations. Shorter peptides will be excluded. The default is 7.} \item{max_len}{A positive integer; the maximum length of peptide sequences -for considerations. Longer peptides will be excluded.} +for considerations. Longer peptides will be excluded. The default is 40.} \item{aa_masses}{An amino-acid mass lookup.} diff --git a/man/calc_tmtint.Rd b/man/calc_tmtint.Rd index 56b0314..da733f2 100644 --- a/man/calc_tmtint.Rd +++ b/man/calc_tmtint.Rd @@ -21,10 +21,11 @@ and \code{tmt10} for \code{tmt8} etc.} \item{ppm_reporters}{The mass tolerance of MS2 reporter ions.} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. 
The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the diff --git a/man/calcpepsc.Rd b/man/calcpepsc.Rd index 33316c7..e66d9b9 100644 --- a/man/calcpepsc.Rd +++ b/man/calcpepsc.Rd @@ -14,6 +14,7 @@ calcpepsc( min_ms2mass = 115L, d2 = 1e-05, index_mgf_ms2 = FALSE, + tally_ms2ints = TRUE, add_ms2theos = FALSE, add_ms2theos2 = FALSE, add_ms2moverzs = FALSE, @@ -36,10 +37,11 @@ and y-ions.} \item{ppm_ms2}{A positive integer; the mass tolerance of MS2 species. The default is 20.} -\item{soft_secions}{Depreciated. Logical; if TRUE, collapses the intensities -of secondary ions to primary ions at the absence of the primaries. The -default is FALSE. For instance, the signal of \code{b5^*} will be ignored -if its primary ion \code{b5} is not matched.} +\item{soft_secions}{Impacts on search performance not yet assessed. Logical; +if TRUE, collapses the intensities of secondary ions to primary ions even +when the primaries are absent. The default is FALSE. For instance, the +signal of \code{b5^*} will be ignored if its primary ion \code{b5} is not +matched.} \item{out_path}{A file path of outputs.} @@ -48,10 +50,11 @@ interrogation. The default is 110.} \item{d2}{Bin width in ppm divided by 1E6.} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. 
+\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the @@ -61,6 +64,8 @@ interrogation. The default is 110.} the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, pep_ms2_deltas_sd} are nullified in the outputs.} +\item{tally_ms2ints}{Logical; tally MS2 intensities or not.} + \item{add_ms2theos}{Logical. If true, adds the sequence of primary theoretical MS2 m/z values (\code{pep_ms2_theos}). The sequence order at a given \code{type_ms2ions} is: diff --git a/man/calib_ms1masses.Rd b/man/calib_ms1masses.Rd index 1c25a5e..649ca39 100644 --- a/man/calib_ms1masses.Rd +++ b/man/calib_ms1masses.Rd @@ -26,10 +26,7 @@ calib_ms1masses( Proteome Discoverer or (3) Bruker's DataAnalysis. With MSConvert, the default \code{titleMaker} is required for correct - parsing (don't think it can be altered by users, but just in case). - - Individuality in MGF files are slightly preferred to take advantage of - parallel reading of the files.} + parsing (don't think it can be altered by users, but just in case).} \item{out_path}{A file path of outputs.} @@ -37,8 +34,8 @@ calib_ms1masses( default is 20.} \item{min_mass}{A positive integer; the minimum precursor mass for -interrogation. The default is an arbitrarily low value. The primary guard -against low molecular-weight precursors is \code{min_len}.} +interrogation. 
The default is an arbitrarily low value (the primary guard +against low molecular-weight precursors is \code{min_len}).} \item{max_mass}{A positive integer; the maximum precursor mass for interrogation.} diff --git a/man/checkMGF.Rd b/man/checkMGF.Rd index 091b448..60fc13e 100644 --- a/man/checkMGF.Rd +++ b/man/checkMGF.Rd @@ -15,10 +15,7 @@ checkMGF(mgf_path = NULL, grp_args = NULL, error = c("stop", "warn")) Proteome Discoverer or (3) Bruker's DataAnalysis. With MSConvert, the default \code{titleMaker} is required for correct - parsing (don't think it can be altered by users, but just in case). - - Individuality in MGF files are slightly preferred to take advantage of - parallel reading of the files.} + parsing (don't think it can be altered by users, but just in case).} \item{grp_args}{The names of arguments in \code{par_groups}.} diff --git a/man/check_fdr_group.Rd b/man/check_fdr_group.Rd index 98cb5c2..dec570d 100644 --- a/man/check_fdr_group.Rd +++ b/man/check_fdr_group.Rd @@ -8,9 +8,11 @@ check_fdr_group(fdr_group = c("base", "all", "top3"), oks = c("base", "all")) } \arguments{ \item{fdr_group}{A character string; the modification group(s) for uses in -peptide FDR controls. The value is in one of c("all", "base"). The +peptide FDR controls. The value is in one of \code{c("all", "base")}. The \code{base} corresponds to the modification group with the largest number of matches.} + +\item{oks}{A vector of allowed modification groups.} } \description{ Not yet used. Takes values of integers or character strings. 
diff --git a/man/check_locmods.Rd b/man/check_locmods.Rd index e87c265..75688a0 100644 --- a/man/check_locmods.Rd +++ b/man/check_locmods.Rd @@ -21,12 +21,15 @@ check_locmods(fixedmods, varmods, locmods) \code{position} and \code{site}.} \item{locmods}{Among \code{varmods} for the consideration of localization -probabilities; for instance, \code{locmods = NULL} for nothing, -\code{locmods = c("Phospho (S)", "Phospho (T)", "Phospho (Y)")} for -phosphopeptides, \code{locmods = "Acetyl (K)"} for lysine acetylation. -\code{fixedmods} that were coerced to \code{varmods} will be added -automatically to \code{locmods}. For convenience, the default is set to -look for applicable peptide phosphorylation.} + probabilities; for instance, \code{locmods = NULL} for nothing, + \code{locmods = c("Phospho (S)", "Phospho (T)", "Phospho (Y)")} for + phosphopeptides, \code{locmods = "Acetyl (K)"} for lysine acetylation. + \code{fixedmods} that were coerced to \code{varmods} will be added + automatically to \code{locmods}. + + For convenience, the default is set to look for applicable peptide + phosphorylation (and may encounter warning messages if the data type is + different to the default).} } \description{ Coerced \code{fixedmods} not considered. diff --git a/man/check_mod_motifs.Rd b/man/check_mod_motifs.Rd index ba41efd..cba09a5 100644 --- a/man/check_mod_motifs.Rd +++ b/man/check_mod_motifs.Rd @@ -8,14 +8,12 @@ check_mod_motifs(mod_motifs, mods) } \arguments{ \item{mod_motifs}{The motifs to restrict \code{Anywhere} variable - modification. For example, provided the \code{Anywhere} variable - modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and - - \code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = - c("NM", "MP"))} - - variable modifications will only be considered at sites that satisfy the - motifs.} +modification. 
For example, provided the \code{Anywhere} variable +modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and +\code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = +c("NM", "MP"))}, +variable modifications will only be considered at sites that satisfy the +motifs.} \item{mods}{A concatenated list of fixed and variable modifications.} } diff --git a/man/count_elements.Rd b/man/count_elements.Rd index dfaf294..4152566 100644 --- a/man/count_elements.Rd +++ b/man/count_elements.Rd @@ -13,7 +13,7 @@ count_elements(vec) A faster alternative to \code{table}. } \examples{ -\donttest{ +\dontrun{ library(mzion) library(microbenchmark) diff --git a/man/creat_folds.Rd b/man/creat_folds.Rd new file mode 100644 index 0000000..97bfae5 --- /dev/null +++ b/man/creat_folds.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/percolator.R +\name{creat_folds} +\alias{creat_folds} +\title{Creates folds for cross validation.} +\usage{ +creat_folds(y, k = 10L, list = TRUE, returnTrain = FALSE) +} +\arguments{ +\item{y}{The labels.} + +\item{k}{The number of folds.} + +\item{list}{Logical; should the result be a list or not.} + +\item{returnTrain}{Logical; return training sets or not (test sets).} +} +\description{ +From package caret. +} diff --git a/man/cv_svm.Rd b/man/cv_svm.Rd new file mode 100644 index 0000000..a2a89fd --- /dev/null +++ b/man/cv_svm.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/percolator.R +\name{cv_svm} +\alias{cv_svm} +\title{Calculates cross-validation errors.} +\usage{ +cv_svm(train, test, costs = c(0.1, 1, 1, 5, 50), ...) +} +\arguments{ +\item{train}{Training set.} + +\item{test}{Test set.} + +\item{costs}{A vector of costs.} + +\item{...}{Additional arguments for svm.} +} +\description{ +Optimizes the regularization cost for svm. 
+} diff --git a/man/extract_mgf_rptrs.Rd b/man/extract_mgf_rptrs.Rd index 21ad5c5..463f3d7 100644 --- a/man/extract_mgf_rptrs.Rd +++ b/man/extract_mgf_rptrs.Rd @@ -34,9 +34,9 @@ The default is \eqn{126.1}.} The default is \eqn{135.2}.} \item{exclude_reporter_region}{Logical; if TRUE, excludes MS2 ions in the -region of TMT reporter ions. The default is FALSE. The argument affects -only TMT data. The range of TMT reporter ions is given by -\code{tmt_reporter_lower} and \code{tmt_reporter_upper}.} +region of TMT reporter ions. The default is FALSE. The corresponding range +of TMT reporter ions is informed by \code{tmt_reporter_lower} and +\code{tmt_reporter_upper}. The argument affects only TMT data.} } \description{ Also purges MS2 m-over-z and intensity when applicable. diff --git a/man/finalize_aamasses.Rd b/man/finalize_aamasses.Rd index 0012474..836c937 100644 --- a/man/finalize_aamasses.Rd +++ b/man/finalize_aamasses.Rd @@ -16,14 +16,12 @@ of SILAC for details. Can be but not typically used in standard alone searches of labeled residues.} \item{mod_motifs}{The motifs to restrict \code{Anywhere} variable - modification. For example, provided the \code{Anywhere} variable - modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and - - \code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = - c("NM", "MP"))} - - variable modifications will only be considered at sites that satisfy the - motifs.} +modification. For example, provided the \code{Anywhere} variable +modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and +\code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = +c("NM", "MP"))}, +variable modifications will only be considered at sites that satisfy the +motifs.} } \description{ Replaces interim fixed and variable modifications with the finals. 
Results in diff --git a/man/find_aa_masses.Rd b/man/find_aa_masses.Rd index 5e092ed..1a0d68d 100644 --- a/man/find_aa_masses.Rd +++ b/man/find_aa_masses.Rd @@ -29,14 +29,12 @@ of SILAC for details. Can be but not typically used in standard alone searches of labeled residues.} \item{mod_motifs}{The motifs to restrict \code{Anywhere} variable - modification. For example, provided the \code{Anywhere} variable - modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and - - \code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = - c("NM", "MP"))} - - variable modifications will only be considered at sites that satisfy the - motifs.} +modification. For example, provided the \code{Anywhere} variable +modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and +\code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = +c("NM", "MP"))}, +variable modifications will only be considered at sites that satisfy the +motifs.} \item{maxn_vmods_setscombi}{Integer; the maximum number of combinatorial variable modifications and neutral losses.} diff --git a/man/find_ms2_bypep.Rd b/man/find_ms2_bypep.Rd index 45782fd..d09e6ad 100644 --- a/man/find_ms2_bypep.Rd +++ b/man/find_ms2_bypep.Rd @@ -32,10 +32,11 @@ interrogation. The default is 110.} \item{minn_ms2}{Integer; the minimum number of MS2 ions for consideration as a hit.} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. 
At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the diff --git a/man/find_perm_sets.Rd b/man/find_perm_sets.Rd index 9f99138..2c3c121 100644 --- a/man/find_perm_sets.Rd +++ b/man/find_perm_sets.Rd @@ -13,7 +13,7 @@ find_perm_sets(labs = c("A", "A", "A", "B", "B", "C")) A (faster alternative) to \link[gtools]{permutations} with duplicated labels. } \examples{ -\donttest{ +\dontrun{ library(gtools) library(mzion) library(dplyr) diff --git a/man/frames_adv.Rd b/man/frames_adv.Rd index 1ca7756..dd6ff0d 100644 --- a/man/frames_adv.Rd +++ b/man/frames_adv.Rd @@ -103,10 +103,11 @@ a hit.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation. The default is 110.} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the diff --git a/man/hannot_decoys.Rd b/man/hannot_decoys.Rd index a308a56..4653dcc 100644 --- a/man/hannot_decoys.Rd +++ b/man/hannot_decoys.Rd @@ -9,9 +9,7 @@ hannot_decoys(df, prps) \arguments{ \item{df}{A data frame.} -\item{prps_fwd}{The look-ups of forward protein and peptides.} - -\item{prps_rev}{The look-ups of reversed protein and peptides.} +\item{prps}{The look-ups of protein and peptides.} } \description{ Helper of annotating decoy peptides. 
diff --git a/man/hcalc_tmtint.Rd b/man/hcalc_tmtint.Rd index f90c381..0910a0f 100644 --- a/man/hcalc_tmtint.Rd +++ b/man/hcalc_tmtint.Rd @@ -25,10 +25,11 @@ ions. The default is 10.} \item{out_path}{A file path of outputs.} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the diff --git a/man/hms1_a0_vnl0_fnl1.Rd b/man/hms1_a0_vnl0_fnl1.Rd index 255cd98..ccb9a0f 100644 --- a/man/hms1_a0_vnl0_fnl1.Rd +++ b/man/hms1_a0_vnl0_fnl1.Rd @@ -24,8 +24,8 @@ corresponds to the combination without NLs (all zeros).} acid residues.} \item{min_mass}{A positive integer; the minimum precursor mass for -interrogation. The default is an arbitrarily low value. The primary guard -against low molecular-weight precursors is \code{min_len}.} +interrogation. The default is an arbitrarily low value (the primary guard +against low molecular-weight precursors is \code{min_len}).} \item{max_mass}{A positive integer; the maximum precursor mass for interrogation.} diff --git a/man/hms1_a1_vnl0_fnl0.Rd b/man/hms1_a1_vnl0_fnl0.Rd index ffa8541..5123c79 100644 --- a/man/hms1_a1_vnl0_fnl0.Rd +++ b/man/hms1_a1_vnl0_fnl0.Rd @@ -47,8 +47,8 @@ default is 5.} \item{ms1vmods}{The set of all possible MS1 vmod labels at a given aa_masses.} \item{min_mass}{A positive integer; the minimum precursor mass for -interrogation. 
The default is an arbitrarily low value. The primary guard -against low molecular-weight precursors is \code{min_len}.} +interrogation. The default is an arbitrarily low value (the primary guard +against low molecular-weight precursors is \code{min_len}).} \item{max_mass}{A positive integer; the maximum precursor mass for interrogation.} diff --git a/man/hsemipeps_byprots.Rd b/man/hsemipeps_byprots.Rd index 4db86db..805c92b 100644 --- a/man/hsemipeps_byprots.Rd +++ b/man/hsemipeps_byprots.Rd @@ -13,7 +13,7 @@ under a protein, the value is a mass and the name is a peptide sequence.} \item{min_len}{The minimum length of peptide sequences for consideration.} \item{max_len}{A positive integer; the maximum length of peptide sequences -for considerations. Longer peptides will be excluded.} +for considerations. Longer peptides will be excluded. The default is 40.} \item{aa_masses}{A named list containing the (mono-isotopic) masses of amino acid residues.} diff --git a/man/keep_pepfdr_best.Rd b/man/keep_pepfdr_best.Rd new file mode 100644 index 0000000..d11e923 --- /dev/null +++ b/man/keep_pepfdr_best.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/scores.R +\name{keep_pepfdr_best} +\alias{keep_pepfdr_best} +\title{Keeps the best entries of targets and decoys.} +\usage{ +keep_pepfdr_best(td, cols = c("pep_scan_num", "raw_file")) +} +\arguments{ +\item{td}{A data frame of targets and decoys.} + +\item{cols}{Columns for grouping.} +} +\description{ +Keeps the best entries of targets and decoys. +} diff --git a/man/load_mgfs.Rd b/man/load_mgfs.Rd index deee981..75913e3 100644 --- a/man/load_mgfs.Rd +++ b/man/load_mgfs.Rd @@ -44,10 +44,7 @@ load_mgfs( Proteome Discoverer or (3) Bruker's DataAnalysis. With MSConvert, the default \code{titleMaker} is required for correct - parsing (don't think it can be altered by users, but just in case). 
- - Individuality in MGF files are slightly preferred to take advantage of - parallel reading of the files.} + parsing (don't think it can be altered by users, but just in case).} \item{min_mass}{A minimum mass of precursors for considerations.} @@ -95,14 +92,15 @@ The default is \eqn{126.1}.} The default is \eqn{135.2}.} \item{exclude_reporter_region}{Logical; if TRUE, excludes MS2 ions in the -region of TMT reporter ions. The default is FALSE. The argument affects -only TMT data. The range of TMT reporter ions is given by -\code{tmt_reporter_lower} and \code{tmt_reporter_upper}.} - -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +region of TMT reporter ions. The default is FALSE. The corresponding range +of TMT reporter ions is informed by \code{tmt_reporter_lower} and +\code{tmt_reporter_upper}. The argument affects only TMT data.} + +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the diff --git a/man/map_raw_n_scan.Rd b/man/map_raw_n_scan.Rd index 0baf074..ab39ecb 100644 --- a/man/map_raw_n_scan.Rd +++ b/man/map_raw_n_scan.Rd @@ -17,10 +17,7 @@ map_raw_n_scan(df, mgf_path) Proteome Discoverer or (3) Bruker's DataAnalysis. With MSConvert, the default \code{titleMaker} is required for correct - parsing (don't think it can be altered by users, but just in case). 
- - Individuality in MGF files are slightly preferred to take advantage of - parallel reading of the files.} + parsing (don't think it can be altered by users, but just in case).} } \description{ Maps raw_file and scan_title from indexes to real values. diff --git a/man/matchMS.Rd b/man/matchMS.Rd index 1ce0e96..24d227e 100644 --- a/man/matchMS.Rd +++ b/man/matchMS.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/msmsmatches.R \name{matchMS} \alias{matchMS} -\title{Searches for MS ions.} +\title{An integrated facility for searches of mass spectrometry data.} \usage{ matchMS( out_path = "~/mzion/outs", @@ -84,6 +84,15 @@ matchMS( add_ms2theos2 = FALSE, add_ms2moverzs = FALSE, add_ms2ints = FALSE, + svm_reproc = FALSE, + svm_kernel = "radial", + svm_feats = c("pep_score", "pep_ret_range", "pep_delta", "pep_n_ms2", "pep_expect", + "pep_exp_mz", "pep_exp_mr", "pep_tot_int", "pep_n_matches2", "pep_ms2_deltas_mean"), + svm_cv = TRUE, + svm_k = 3L, + svm_costs = c(0.1, 0.3, 1, 3, 10), + svm_def_cost = 1, + svm_iters = 10L, digits = 4L, ... ) @@ -99,10 +108,7 @@ matchMS( Proteome Discoverer or (3) Bruker's DataAnalysis. With MSConvert, the default \code{titleMaker} is required for correct - parsing (don't think it can be altered by users, but just in case). - - Individuality in MGF files are slightly preferred to take advantage of - parallel reading of the files.} + parsing (don't think it can be altered by users, but just in case).} \item{fasta}{Character string(s) to the name(s) of fasta file(s) with prepended directory path. The experimenter needs to supply the files.} @@ -140,22 +146,23 @@ of SILAC for details. Can be but not typically used in standard alone searches of labeled residues.} \item{locmods}{Among \code{varmods} for the consideration of localization -probabilities; for instance, \code{locmods = NULL} for nothing, -\code{locmods = c("Phospho (S)", "Phospho (T)", "Phospho (Y)")} for -phosphopeptides, \code{locmods = "Acetyl (K)"} for lysine acetylation. 
-\code{fixedmods} that were coerced to \code{varmods} will be added -automatically to \code{locmods}. For convenience, the default is set to -look for applicable peptide phosphorylation.} - -\item{mod_motifs}{The motifs to restrict \code{Anywhere} variable - modification. For example, provided the \code{Anywhere} variable - modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and + probabilities; for instance, \code{locmods = NULL} for nothing, + \code{locmods = c("Phospho (S)", "Phospho (T)", "Phospho (Y)")} for + phosphopeptides, \code{locmods = "Acetyl (K)"} for lysine acetylation. + \code{fixedmods} that were coerced to \code{varmods} will be added + automatically to \code{locmods}. - \code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = - c("NM", "MP"))} + For convenience, the default is set to look for applicable peptide + phosphorylation (and may encounter warning messages if the data type is + different to the default).} - variable modifications will only be considered at sites that satisfy the - motifs.} +\item{mod_motifs}{The motifs to restrict \code{Anywhere} variable +modification. For example, provided the \code{Anywhere} variable +modifications containing \code{c("Oxidation (M)", "Deamidated (N)")} and +\code{mod_motifs = list(`Deamidated (N)` = c("NG", "NM"), `Oxidation (M)` = +c("NM", "MP"))}, +variable modifications will only be considered at sites that satisfy the +motifs.} \item{enzyme}{A character string; the proteolytic specificity of the assumed enzyme will be used to generate peptide sequences from protein entries. The @@ -233,14 +240,14 @@ default is 64.} for considerations. Shorter peptides will be excluded. The default is 7.} \item{max_len}{A positive integer; the maximum length of peptide sequences -for considerations. Longer peptides will be excluded.} +for considerations. Longer peptides will be excluded. 
The default is 40.} \item{max_miss}{A non-negative integer; the maximum number of mis-cleavages per peptide sequence for considerations. The default is 2.} \item{min_mass}{A positive integer; the minimum precursor mass for -interrogation. The default is an arbitrarily low value. The primary guard -against low molecular-weight precursors is \code{min_len}.} +interrogation. The default is an arbitrarily low value (the primary guard +against low molecular-weight precursors is \code{min_len}).} \item{max_mass}{A positive integer; the maximum precursor mass for interrogation.} @@ -256,16 +263,17 @@ envelopes. Nevertheless, by setting \code{n_13c = 1}, some increases in the number of PSMs may be readily achieved at a relatively small cost of search time.} -\item{par_groups}{Parameter(s) of \code{matchMS} multiplied by sets of values -in groups. Multiple searches will be performed separately against the -parameter groups. For instance with one set of samples in SILAC light and -the other in SILAC heavy, the experimenters may specify two arguments for -parameter \code{mgf_path} and two arguments for parameter \code{fixedmods} -that link to the respective samples. In this way, there is no need to -search against, e.g. heavy-isotope-labeled K8R10 with the light samples and -vice versa. Note that results will be combined at the end, with the group -names indicated under column \code{pep_group}. The default is NULL without -grouped searches. See the examples under SILAC and Group searches.} +\item{par_groups}{A low-priority feature. Parameter(s) of \code{matchMS} +multiplied by sets of values in groups. Multiple searches will be performed +separately against the parameter groups. For instance with one set of +samples in SILAC light and the other in SILAC heavy, the experimenters may +specify two arguments for parameter \code{mgf_path} and two arguments for +parameter \code{fixedmods} that link to the respective samples. In this +way, there is no need to search against, e.g.
heavy-isotope-labeled K8R10 +with the light samples and vice versa. Note that results will be combined +at the end, with the group names indicated under column \code{pep_group}. +The default is NULL without grouped searches. See the examples under SILAC +and Group searches.} \item{silac_mix}{A list of labels indicating SILAC groups in samples. The parameter is most relevant for SILAC experiments where peptides of heavy, @@ -298,14 +306,15 @@ The default is \eqn{126.1}.} The default is \eqn{135.2}.} \item{exclude_reporter_region}{Logical; if TRUE, excludes MS2 ions in the -region of TMT reporter ions. The default is FALSE. The argument affects -only TMT data. The range of TMT reporter ions is given by -\code{tmt_reporter_lower} and \code{tmt_reporter_upper}.} +region of TMT reporter ions. The default is FALSE. The corresponding range +of TMT reporter ions is informed by \code{tmt_reporter_lower} and +\code{tmt_reporter_upper}. The argument affects only TMT data.} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the @@ -338,7 +347,7 @@ levels of PSM, peptide or protein. The default is 0.01. See also argument peptide} with the additional filtration of data at \code{prot_tier == 1}.} \item{fdr_group}{A character string; the modification group(s) for uses in -peptide FDR controls. 
The value is in one of c("all", "base"). The +peptide FDR controls. The value is in one of \code{c("all", "base")}. The \code{base} corresponds to the modification group with the largest number of matches.} @@ -371,10 +380,11 @@ pep_score_cutoff} under a protein will be used to represent the threshold of a protein enrichment score. For more conserved thresholds, the statistics of \code{"max"} may be considered.} -\item{soft_secions}{Depreciated. Logical; if TRUE, collapses the intensities -of secondary ions to primary ions at the absence of the primaries. The -default is FALSE. For instance, the signal of \code{b5^*} will be ignored -if its primary ion \code{b5} is not matched.} +\item{soft_secions}{Impacts on search performance not yet assessed. Logical; +if TRUE, collapses the intensities of secondary ions to primary ions even +when the primaries are absent. The default is FALSE. For instance, the +signal of \code{b5^*} will be ignored if its primary ion \code{b5} is not +matched.} \item{topn_mods_per_seq}{Positive integer; a threshold to discard variable modifications under the same peptide match with scores beyond the top-n. @@ -417,11 +427,11 @@ if its primary ion \code{b5} is not matched.} Tier-3: one significant peptide per protein and protein scores below significance thresholds.} -\item{max_n_prots}{A positive integer to threshold the maximum number of - protein entries before coercing \code{fdr_type} from \code{psm} or - \code{peptide} to \code{protein}. The argument has no effect if - \code{fdr_type} is already \code{protein}. In general, there is no need to - change the default. +\item{max_n_prots}{Softly deprecated. A positive integer to threshold the + maximum number of protein entries before coercing \code{fdr_type} from + \code{psm} or \code{peptide} to \code{protein}. The argument has no effect + if \code{fdr_type} is already \code{protein}. In general, there is no need + to change the default.
Note that for memory efficiency proteins at tiers 1, 2 and 3 are grouped separately. Further note that there is no tier-2 proteins at @@ -518,6 +528,26 @@ at \code{calib_ms1mass = FALSE}.} \item{add_ms2ints}{Logical; if TRUE, adds the sequence of experimental MS2 intensity values (\code{pep_ms2_ints}).} +\item{svm_reproc}{Logical; if TRUE, reprocesses peptide data for significance +thresholds with a support vector machine (SVM) approach analogous to +\href{https://www.nature.com/articles/nmeth1113}{Percolator}.} + +\item{svm_kernel}{The SVM kernel. See also \link[e1071]{svm}.} + +\item{svm_feats}{Features used for SVM classifications.} + +\item{svm_cv}{Logical; if TRUE, performs cross validation for the +regularization cost.} + +\item{svm_k}{A positive integer; specifies the k-number of folds in cross +validation.} + +\item{svm_costs}{The cost constraints for k-fold cross validation.} + +\item{svm_def_cost}{The default cost for SVM.} + +\item{svm_iters}{The number of iterations in SVM learning.} + \item{digits}{A non-negative integer; the number of decimal places to be used. The default is 4.} @@ -528,7 +558,7 @@ A list of complete PSMs in \code{psmC.txt}; a list of quality PSMs in \code{psmQ.txt}. } \description{ -Database searches of MSMS data. +Database searches of MS/MS data (DDA). } \section{\code{Output columns}}{ \code{system.file("extdata", @@ -604,9 +634,6 @@ matchMS( out_path = "~/mzion/examples", ) -# (from protein to PSM FDR) -reproc_psmC(out_path = "~/mzion/examples", fdr_type = "psm", - combine_tier_three = TRUE) # TMT-16plex, phospho matchMS( @@ -617,7 +644,6 @@ matchMS( locmods = c("Phospho (S)", "Phospho (T)", "Phospho (Y)"), quant = "tmt16", fdr_type = "psm", - combine_tier_three = TRUE, out_path = "~/mzion/examples", ) @@ -630,11 +656,11 @@ matchMS( ppm_ms2 = 40, quant = "none", fdr_type = "protein", - out_path = "~/mzion/examples_pasef", + out_path = "~/mzion/examples", ) # Wrapper of matchMS(enzyme = noenzyme, ...)
without sectional searches -# by ranges of peptide lengths +# by ranges of peptide lengths matchMS_NES( fasta = c("~/mzion/dbs/fasta/refseq/refseq_hs_2013_07.fasta", "~/mzion/dbs/fasta/refseq/refseq_mm_2013_07.fasta", diff --git a/man/matchMS_noenzyme.Rd b/man/matchMS_noenzyme.Rd index 778b736..6132130 100644 --- a/man/matchMS_noenzyme.Rd +++ b/man/matchMS_noenzyme.Rd @@ -25,7 +25,7 @@ matchMS_noenzyme( for considerations. Shorter peptides will be excluded. The default is 7.} \item{max_len}{A positive integer; the maximum length of peptide sequences -for considerations. Longer peptides will be excluded.} +for considerations. Longer peptides will be excluded. The default is 40.} \item{fasta}{Character string(s) to the name(s) of fasta file(s) with prepended directory path. The experimenter needs to supply the files.} @@ -40,10 +40,7 @@ prepended directory path. The experimenter needs to supply the files.} Proteome Discoverer or (3) Bruker's DataAnalysis. With MSConvert, the default \code{titleMaker} is required for correct - parsing (don't think it can be altered by users, but just in case). - - Individuality in MGF files are slightly preferred to take advantage of - parallel reading of the files.} + parsing (don't think it can be altered by users, but just in case).} \item{noenzyme_maxn}{Non-negative integer; the maximum number of peptide lengths for sectional searches at \code{noenzyme} specificity. The argument diff --git a/man/matchMS_par_groups.Rd b/man/matchMS_par_groups.Rd index 7965d0a..edfc241 100644 --- a/man/matchMS_par_groups.Rd +++ b/man/matchMS_par_groups.Rd @@ -13,16 +13,17 @@ matchMS_par_groups( ) } \arguments{ -\item{par_groups}{Parameter(s) of \code{matchMS} multiplied by sets of values -in groups. Multiple searches will be performed separately against the -parameter groups. 
For instance with one set of samples in SILAC light and -the other in SILAC heavy, the experimenters may specify two arguments for -parameter \code{mgf_path} and two arguments for parameter \code{fixedmods} -that link to the respective samples. In this way, there is no need to -search against, e.g. heavy-isotope-labeled K8R10 with the light samples and -vice versa. Note that results will be combined at the end, with the group -names indicated under column \code{pep_group}. The default is NULL without -grouped searches. See the examples under SILAC and Group searches.} +\item{par_groups}{A low-priority feature. Parameter(s) of \code{matchMS} +multiplied by sets of values in groups. Multiple searches will be performed +separately against the parameter groups. For instance with one set of +samples in SILAC light and the other in SILAC heavy, the experimenters may +specify two arguments for parameter \code{mgf_path} and two arguments for +parameter \code{fixedmods} that link to the respective samples. In this +way, there is no need to search against, e.g. heavy-isotope-labeled K8R10 +with the light samples and vice versa. Note that results will be combined +at the end, with the group names indicated under column \code{pep_group}. +The default is NULL without grouped searches. See the examples under SILAC +and Group searches.} \item{grp_args}{The names of arguments in \code{par_groups}.} diff --git a/man/matchMS_silac_mix.Rd b/man/matchMS_silac_mix.Rd index b1b4e39..f39e0e7 100644 --- a/man/matchMS_silac_mix.Rd +++ b/man/matchMS_silac_mix.Rd @@ -31,10 +31,7 @@ SILAC.} Proteome Discoverer or (3) Bruker's DataAnalysis. With MSConvert, the default \code{titleMaker} is required for correct - parsing (don't think it can be altered by users, but just in case).
- - Individuality in MGF files are slightly preferred to take advantage of - parallel reading of the files.} + parsing (don't think it can be altered by users, but just in case).} \item{aa_masses}{An amino acid look-ups.} } diff --git a/man/match_ex2th2.Rd b/man/match_ex2th2.Rd index 2a3e81d..0167be8 100644 --- a/man/match_ex2th2.Rd +++ b/man/match_ex2th2.Rd @@ -16,10 +16,11 @@ interrogation. The default is 110.} \item{d}{Bin size, e.g., \eqn{20 ppm / 2 * 1E-6}.} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the diff --git a/man/ms1_a0_vnl0_fnl1.Rd b/man/ms1_a0_vnl0_fnl1.Rd index 0ad9cce..d81bf6a 100644 --- a/man/ms1_a0_vnl0_fnl1.Rd +++ b/man/ms1_a0_vnl0_fnl1.Rd @@ -28,8 +28,8 @@ corresponds to the combination without NLs (all zeros).} acid residues.} \item{min_mass}{A positive integer; the minimum precursor mass for -interrogation. The default is an arbitrarily low value. The primary guard -against low molecular-weight precursors is \code{min_len}.} +interrogation. 
The default is an arbitrarily low value (the primary guard +against low molecular-weight precursors is \code{min_len}).} \item{max_mass}{A positive integer; the maximum precursor mass for interrogation.} diff --git a/man/ms1_a1_vnl0_fnl0.Rd b/man/ms1_a1_vnl0_fnl0.Rd index 731c8ac..285f3c4 100644 --- a/man/ms1_a1_vnl0_fnl0.Rd +++ b/man/ms1_a1_vnl0_fnl0.Rd @@ -51,8 +51,8 @@ default is 5.} \item{ms1vmods}{The set of all possible MS1 vmod labels at a given aa_masses.} \item{min_mass}{A positive integer; the minimum precursor mass for -interrogation. The default is an arbitrarily low value. The primary guard -against low molecular-weight precursors is \code{min_len}.} +interrogation. The default is an arbitrarily low value (the primary guard +against low molecular-weight precursors is \code{min_len}).} \item{max_mass}{A positive integer; the maximum precursor mass for interrogation.} diff --git a/man/ms2match.Rd b/man/ms2match.Rd index f6c442e..9e13afe 100644 --- a/man/ms2match.Rd +++ b/man/ms2match.Rd @@ -50,10 +50,7 @@ ms2match( Proteome Discoverer or (3) Bruker's DataAnalysis. With MSConvert, the default \code{titleMaker} is required for correct - parsing (don't think it can be altered by users, but just in case). - - Individuality in MGF files are slightly preferred to take advantage of - parallel reading of the files.} + parsing (don't think it can be altered by users, but just in case).} \item{aa_masses_all}{A list of amino acid lookups for all the combination of fixed and variable modifications.} @@ -107,8 +104,8 @@ at \code{calib_ms1mass = FALSE}.} default is 20.} \item{min_mass}{A positive integer; the minimum precursor mass for -interrogation. The default is an arbitrarily low value. The primary guard -against low molecular-weight precursors is \code{min_len}.} +interrogation. 
The default is an arbitrarily low value (the primary guard +against low molecular-weight precursors is \code{min_len}).} \item{max_mass}{A positive integer; the maximum precursor mass for interrogation.} @@ -174,15 +171,16 @@ sets of combinatorial variable modifications. The default is 512.} for considerations. Shorter peptides will be excluded. The default is 7.} \item{max_len}{A positive integer; the maximum length of peptide sequences -for considerations. Longer peptides will be excluded.} +for considerations. Longer peptides will be excluded. The default is 40.} \item{max_miss}{A non-negative integer; the maximum number of mis-cleavages per peptide sequence for considerations. The default is 2.} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the diff --git a/man/ms2match_base.Rd b/man/ms2match_base.Rd index dac0f3d..ec1cad8 100644 --- a/man/ms2match_base.Rd +++ b/man/ms2match_base.Rd @@ -167,10 +167,7 @@ modifications.} Proteome Discoverer or (3) Bruker's DataAnalysis. With MSConvert, the default \code{titleMaker} is required for correct - parsing (don't think it can be altered by users, but just in case). 
- - Individuality in MGF files are slightly preferred to take advantage of - parallel reading of the files.} + parsing (don't think it can be altered by users, but just in case).} \item{out_path}{A file path of outputs.} @@ -213,10 +210,11 @@ default is 20.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation. The default is 110.} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the diff --git a/man/perco_svm.Rd b/man/perco_svm.Rd new file mode 100644 index 0000000..018d812 --- /dev/null +++ b/man/perco_svm.Rd @@ -0,0 +1,107 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/percolator.R +\name{perco_svm} +\alias{perco_svm} +\title{Percolator} +\usage{ +perco_svm( + prob_cos = NULL, + out_path = NULL, + df = NULL, + target_fdr = 0.01, + fdr_type = "protein", + min_len = 7L, + max_len = 40L, + max_pepscores_co = 50, + min_pepscores_co = 0, + enzyme = "trypsin_p", + fdr_group = "base", + nes_fdr_group = "base", + fct_score = 10, + k = 10, + cross_valid = FALSE, + costs = c(0.1, 0.3, 1, 3, 10), + def_cost = 1L, + svm_kernel = "radial", + svm_feats = c("pep_score", "pep_ret_range", "pep_delta", "pep_n_ms2", "pep_expect", + "pep_exp_mz", "pep_exp_mr", "pep_tot_int", "pep_n_matches2", "pep_ms2_deltas_mean"), + svm_iters = 10L, + svm_tol = 
1e-04, + ... +) +} +\arguments{ +\item{prob_cos}{Probability cut-offs (as a function of pep_len).} + +\item{out_path}{A file path of outputs.} + +\item{df}{A data frame of \code{psmC.txt}.} + +\item{target_fdr}{A numeric; the targeted false-discovery rate (FDR) at the +levels of PSM, peptide or protein. The default is 0.01. See also argument +\code{fdr_type}.} + +\item{fdr_type}{A character string; the type of FDR control. The value is in + one of c("protein", "peptide", "psm"). The default is \code{protein}. + + Note that \code{fdr_type = protein} is comparable to \code{fdr_type = + peptide} with the additional filtration of data at \code{prot_tier == 1}.} + +\item{min_len}{A positive integer; the minimum length of peptide sequences +for considerations. Shorter peptides will be excluded. The default is 7.} + +\item{max_len}{A positive integer; the maximum length of peptide sequences +for considerations. Longer peptides will be excluded. The default is 40.} + +\item{max_pepscores_co}{A positive numeric; the upper limit in the cut-offs +of peptide scores for discriminating significant and insignificant +identities.} + +\item{min_pepscores_co}{A non-negative numeric; the lower limit in the +cut-offs of peptide scores for discriminating significant and insignificant +identities.} + +\item{enzyme}{A character string; the proteolytic specificity of the assumed +enzyme will be used to generate peptide sequences from protein entries. The +default is \code{Trypsin_P}. See also parameter \code{custom_enzyme}.} + +\item{fdr_group}{A character string; the modification group(s) for uses in +peptide FDR controls. The value is in one of \code{c("all", "base")}. The +\code{base} corresponds to the modification group with the largest number +of matches.} + +\item{nes_fdr_group}{A character string in one of \code{c("all", +"all_cterm_tryptic", "all_cterm_nontryptic", "base", "base_cterm_tryptic", +"base_cterm_nontryptic")}.
All peptides will be used in the classifications +of targets and decoys at \code{"all"}. Peptides with the chemistry of +C-terminal K or R will be used at \code{"all_cterm_tryptic"} (peptides from +protein C-terminals being excluded). Peptides without C-terminal K or R +will be used at \code{"all_cterm_nontryptic"}. The same applied to +\code{"base_cterm_tryptic"} and \code{"base_cterm_nontryptic"} with the +difference of only peptides from the \code{base} group being used. See also +parameter \code{fdr_group}.} + +\item{fct_score}{The factor in converting probability p-values to scores. The +value is always 10.} + +\item{k}{The k-folds for cross validation.} + +\item{cross_valid}{Logical; to perform cross validations or not.} + +\item{costs}{The costs for cross validations.} + +\item{def_cost}{The default cost.} + +\item{svm_kernel}{The SVM kernel. See also \link[e1071]{svm}.} + +\item{svm_feats}{Features used for SVM classifications.} + +\item{svm_iters}{The number of iterations.} + +\item{svm_tol}{Tolerance in FDR.} + +\item{...}{Not currently used.} +} +\description{ +Percolator +} diff --git a/man/prep_pepfdr_td.Rd b/man/prep_pepfdr_td.Rd new file mode 100644 index 0000000..59ab9af --- /dev/null +++ b/man/prep_pepfdr_td.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/scores.R +\name{prep_pepfdr_td} +\alias{prep_pepfdr_td} +\title{Prepares target-decoy data.} +\usage{ +prep_pepfdr_td( + td = NULL, + out_path, + enzyme = "trypsin_p", + nes_fdr_group = "base", + fdr_group = "base" +) +} +\arguments{ +\item{td}{A data frame of targets and decoys (for Percolator).} + +\item{out_path}{A file path of outputs.} + +\item{enzyme}{A character string; the proteolytic specificity of the assumed +enzyme will be used to generate peptide sequences from protein entries. The +default is \code{Trypsin_P}. 
See also parameter \code{custom_enzyme}.} + +\item{nes_fdr_group}{A character string in one of \code{c("all", +"all_cterm_tryptic", "all_cterm_nontryptic", "base", "base_cterm_tryptic", +"base_cterm_nontryptic")}. All peptides will be used in the classifications +of targets and decoys at \code{"all"}. Peptides with the chemistry of +C-terminal K or R will be used at \code{"all_cterm_tryptic"} (peptides from +protein C-terminals being excluded). Peptides without C-terminal K or R +will be used at \code{"all_cterm_nontryptic"}. The same applied to +\code{"base_cterm_tryptic"} and \code{"base_cterm_nontryptic"} with the +difference of only peptides from the \code{base} group being used. See also +parameter \code{fdr_group}.} + +\item{fdr_group}{A character string; the modification group(s) for uses in +peptide FDR controls. The value is in one of \code{c("all", "base")}. The +\code{base} corresponds to the modification group with the largest number +of matches.} +} +\description{ +Prepares target-decoy data. +} diff --git a/man/proc_mgf_chunks.Rd b/man/proc_mgf_chunks.Rd index c5edc7b..2f6fc15 100644 --- a/man/proc_mgf_chunks.Rd +++ b/man/proc_mgf_chunks.Rd @@ -117,14 +117,15 @@ The default is \eqn{126.1}.} The default is \eqn{135.2}.} \item{exclude_reporter_region}{Logical; if TRUE, excludes MS2 ions in the -region of TMT reporter ions. The default is FALSE. The argument affects -only TMT data. The range of TMT reporter ions is given by -\code{tmt_reporter_lower} and \code{tmt_reporter_upper}.} - -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +region of TMT reporter ions. The default is FALSE. The corresponding range +of TMT reporter ions is informed by \code{tmt_reporter_lower} and +\code{tmt_reporter_upper}. 
The argument affects only TMT data.} + +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the diff --git a/man/proc_mgfs.Rd b/man/proc_mgfs.Rd index b34f852..75fb5e6 100644 --- a/man/proc_mgfs.Rd +++ b/man/proc_mgfs.Rd @@ -117,14 +117,15 @@ The default is \eqn{126.1}.} The default is \eqn{135.2}.} \item{exclude_reporter_region}{Logical; if TRUE, excludes MS2 ions in the -region of TMT reporter ions. The default is FALSE. The argument affects -only TMT data. The range of TMT reporter ions is given by -\code{tmt_reporter_lower} and \code{tmt_reporter_upper}.} - -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +region of TMT reporter ions. The default is FALSE. The corresponding range +of TMT reporter ions is informed by \code{tmt_reporter_lower} and +\code{tmt_reporter_upper}. The argument affects only TMT data.} + +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. 
At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the diff --git a/man/proc_mzml.Rd b/man/proc_mzml.Rd index ee4ea3f..6dccbec 100644 --- a/man/proc_mzml.Rd +++ b/man/proc_mzml.Rd @@ -63,14 +63,15 @@ The default is \eqn{126.1}.} The default is \eqn{135.2}.} \item{exclude_reporter_region}{Logical; if TRUE, excludes MS2 ions in the -region of TMT reporter ions. The default is FALSE. The argument affects -only TMT data. The range of TMT reporter ions is given by -\code{tmt_reporter_lower} and \code{tmt_reporter_upper}.} - -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +region of TMT reporter ions. The default is FALSE. The corresponding range +of TMT reporter ions is informed by \code{tmt_reporter_lower} and +\code{tmt_reporter_upper}. The argument affects only TMT data.} + +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. 
At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the diff --git a/man/psmC2Q.Rd b/man/psmC2Q.Rd index b1c0940..dff4bb5 100644 --- a/man/psmC2Q.Rd +++ b/man/psmC2Q.Rd @@ -49,11 +49,11 @@ or decoy peptides, as well as decoy proteins.} Tier-3: one significant peptide per protein and protein scores below significance thresholds.} -\item{max_n_prots}{A positive integer to threshold the maximum number of - protein entries before coercing \code{fdr_type} from \code{psm} or - \code{peptide} to \code{protein}. The argument has no effect if - \code{fdr_type} is already \code{protein}. In general, there is no need to - change the default. +\item{max_n_prots}{Softly deprecated. A positive integer to threshold the + maximum number of protein entries before coercing \code{fdr_type} from + \code{psm} or \code{peptide} to \code{protein}. The argument has no effect + if \code{fdr_type} is already \code{protein}. In general, there is no need + to change the default. Note that for memory efficiency proteins at tiers 1, 2 and 3 are grouped separately. Further note that there is no tier-2 proteins at diff --git a/man/purge_search_space.Rd b/man/purge_search_space.Rd index a1591ee..5f88b01 100644 --- a/man/purge_search_space.Rd +++ b/man/purge_search_space.Rd @@ -27,10 +27,7 @@ theoretical peptides.} Proteome Discoverer or (3) Bruker's DataAnalysis. With MSConvert, the default \code{titleMaker} is required for correct - parsing (don't think it can be altered by users, but just in case).
- - Individuality in MGF files are slightly preferred to take advantage of - parallel reading of the files.} + parsing (don't think it can be altered by users, but just in case).} \item{n_cores}{The number of CPU cores.} diff --git a/man/readMGF.Rd b/man/readMGF.Rd index bf56347..8cdd6e4 100644 --- a/man/readMGF.Rd +++ b/man/readMGF.Rd @@ -74,14 +74,15 @@ The default is \eqn{126.1}.} The default is \eqn{135.2}.} \item{exclude_reporter_region}{Logical; if TRUE, excludes MS2 ions in the -region of TMT reporter ions. The default is FALSE. The argument affects -only TMT data. The range of TMT reporter ions is given by -\code{tmt_reporter_lower} and \code{tmt_reporter_upper}.} - -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +region of TMT reporter ions. The default is FALSE. The corresponding range +of TMT reporter ions is informed by \code{tmt_reporter_lower} and +\code{tmt_reporter_upper}. The argument affects only TMT data.} + +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the diff --git a/man/read_mgf_chunks.Rd b/man/read_mgf_chunks.Rd index 3a6bc9d..87adffd 100644 --- a/man/read_mgf_chunks.Rd +++ b/man/read_mgf_chunks.Rd @@ -117,14 +117,15 @@ The default is \eqn{126.1}.} The default is \eqn{135.2}.} \item{exclude_reporter_region}{Logical; if TRUE, excludes MS2 ions in the -region of TMT reporter ions. The default is FALSE. 
The argument affects -only TMT data. The range of TMT reporter ions is given by -\code{tmt_reporter_lower} and \code{tmt_reporter_upper}.} - -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +region of TMT reporter ions. The default is FALSE. The corresponding range +of TMT reporter ions is informed by \code{tmt_reporter_lower} and +\code{tmt_reporter_upper}. The argument affects only TMT data.} + +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the diff --git a/man/read_mzml.Rd b/man/read_mzml.Rd index 07e434d..46e71f2 100644 --- a/man/read_mzml.Rd +++ b/man/read_mzml.Rd @@ -28,14 +28,15 @@ The default is \eqn{126.1}.} The default is \eqn{135.2}.} \item{exclude_reporter_region}{Logical; if TRUE, excludes MS2 ions in the -region of TMT reporter ions. The default is FALSE. The argument affects -only TMT data. The range of TMT reporter ions is given by -\code{tmt_reporter_lower} and \code{tmt_reporter_upper}.} +region of TMT reporter ions. The default is FALSE. The corresponding range +of TMT reporter ions is informed by \code{tmt_reporter_lower} and +\code{tmt_reporter_upper}. The argument affects only TMT data.} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. 
The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the diff --git a/man/readmzML.Rd b/man/readmzML.Rd index b9f024f..74afac0 100644 --- a/man/readmzML.Rd +++ b/man/readmzML.Rd @@ -74,14 +74,15 @@ The default is \eqn{126.1}.} The default is \eqn{135.2}.} \item{exclude_reporter_region}{Logical; if TRUE, excludes MS2 ions in the -region of TMT reporter ions. The default is FALSE. The argument affects -only TMT data. The range of TMT reporter ions is given by -\code{tmt_reporter_lower} and \code{tmt_reporter_upper}.} - -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +region of TMT reporter ions. The default is FALSE. The corresponding range +of TMT reporter ions is informed by \code{tmt_reporter_lower} and +\code{tmt_reporter_upper}. The argument affects only TMT data.} + +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. 
At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the diff --git a/man/reproc_psmC.Rd b/man/reproc_psmC.Rd index 2ad1991..e535aba 100644 --- a/man/reproc_psmC.Rd +++ b/man/reproc_psmC.Rd @@ -45,11 +45,11 @@ reproc_psmC( Tier-3: one significant peptide per protein and protein scores below significance thresholds.} -\item{max_n_prots}{A positive integer to threshold the maximum number of - protein entries before coercing \code{fdr_type} from \code{psm} or - \code{peptide} to \code{protein}. The argument has no effect if - \code{fdr_type} is already \code{protein}. In general, there is no need to - change the default. +\item{max_n_prots}{Softly deprecated. A positive integer to threshold the + maximum number of protein entries before coercing \code{fdr_type} from + \code{psm} or \code{peptide} to \code{protein}. The argument has no effect + if \code{fdr_type} is already \code{protein}. In general, there is no need + to change the default. Note that for memory efficiency proteins at tiers 1, 2 and 3 are grouped separately. Further note that there is no tier-2 proteins at @@ -66,4 +66,8 @@ Protein grouping from \code{psmC.txt} to \code{psmQ.txt}. \details{ May solve some memory shortage issues for large data sets by restarting An Rstudio session. + +The score cut-offs differ among the \code{fdr_type} values of "psm", +"peptide" and "protein". An experimenter needs to match the value of +\code{fdr_type}. } diff --git a/man/rm_char_in_nfirst.Rd b/man/rm_char_in_nfirst.Rd index 52ce8ff..7b702e1 100644 --- a/man/rm_char_in_nfirst.Rd +++ b/man/rm_char_in_nfirst.Rd @@ -16,7 +16,7 @@ in values.} (max_miss + 1L) * 2L}.} \item{max_len}{A positive integer; the maximum length of peptide sequences -for considerations. Longer peptides will be excluded.} +for considerations. Longer peptides will be excluded.
The default is 40.} } \description{ Removes a starting character from the first \code{n} entries. diff --git a/man/save_ms1calib.Rd b/man/save_ms1calib.Rd index 10558bc..ac7eb8e 100644 --- a/man/save_ms1calib.Rd +++ b/man/save_ms1calib.Rd @@ -20,10 +20,7 @@ default is 20.} Proteome Discoverer or (3) Bruker's DataAnalysis. With MSConvert, the default \code{titleMaker} is required for correct - parsing (don't think it can be altered by users, but just in case). - - Individuality in MGF files are slightly preferred to take advantage of - parallel reading of the files.} + parsing (don't think it can be altered by users, but just in case).} } \description{ Saves the \code{ppm_ms1} before and after calibration. diff --git a/man/scalc_pepprobs.Rd b/man/scalc_pepprobs.Rd index b637417..d1a94b9 100644 --- a/man/scalc_pepprobs.Rd +++ b/man/scalc_pepprobs.Rd @@ -13,6 +13,7 @@ scalc_pepprobs( min_ms2mass = 115L, d2 = 1e-05, index_mgf_ms2 = FALSE, + tally_ms2ints = TRUE, digits = 4L ) } @@ -30,20 +31,22 @@ and y-ions.} \item{ppm_ms2}{A positive integer; the mass tolerance of MS2 species. The default is 20.} -\item{soft_secions}{Depreciated. Logical; if TRUE, collapses the intensities -of secondary ions to primary ions at the absence of the primaries. The -default is FALSE. For instance, the signal of \code{b5^*} will be ignored -if its primary ion \code{b5} is not matched.} +\item{soft_secions}{Impacts on search performance not yet assessed. Logical; +if TRUE, collapses the intensities of secondary ions to primary ions even +when the primaries are absent. The default is FALSE. For instance, the +signal of \code{b5^*} will be ignored if its primary ion \code{b5} is not +matched.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation. The default is 110.} \item{d2}{Bin width in ppm divided by 1E6.} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. 
The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the @@ -53,6 +56,8 @@ interrogation. The default is 110.} the fields of \code{pep_ms2_deltas, pep_ms2_deltas2, pep_ms2_deltas_mean, pep_ms2_deltas_sd} are nullified in the outputs.} +\item{tally_ms2ints}{Logical; tally MS2 intensities or not.} + \item{digits}{A non-negative integer; the number of decimal places to be used. The default is 4.} } diff --git a/man/search_mgf.Rd b/man/search_mgf.Rd index fff8549..44bfe3e 100644 --- a/man/search_mgf.Rd +++ b/man/search_mgf.Rd @@ -56,10 +56,11 @@ default is 20.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation. The default is 110.} -\item{index_mgf_ms2}{Logical; if TRUE, converts upfrontly MS2 m-over-z values - from numeric to integers as opposed to in-situ conversion during ion - matches. The default is FALSE. The \code{index_mgf_ms2 = TRUE} might be - useful for very large MS files by reducing RAM footprints. +\item{index_mgf_ms2}{A low-priority feature. Logical; if TRUE, converts + upfrontly MS2 m-over-z values from numeric to integers as opposed to + \emph{in-situ} conversion during ion matches. The default is FALSE. The + \code{index_mgf_ms2 = TRUE} might be useful for very large MS files by + reducing RAM footprints. 
At \code{index_mgf_ms2 = TRUE}, the resolution of mass deltas between theoretical and experimental MS2 m-over-z values is limited by the diff --git a/man/semipeps_byprots.Rd b/man/semipeps_byprots.Rd index 9cf43c7..8d798e9 100644 --- a/man/semipeps_byprots.Rd +++ b/man/semipeps_byprots.Rd @@ -13,7 +13,7 @@ semipeps_byprots(vals, min_len = 7L, max_len = 40L, aa_masses) for considerations. Shorter peptides will be excluded. The default is 7.} \item{max_len}{A positive integer; the maximum length of peptide sequences -for considerations. Longer peptides will be excluded.} +for considerations. Longer peptides will be excluded. The default is 40.} \item{aa_masses}{A named list containing the (mono-isotopic) masses of amino acid residues.} diff --git a/man/split_vec.Rd b/man/split_vec.Rd index 9a054c2..a704c69 100644 --- a/man/split_vec.Rd +++ b/man/split_vec.Rd @@ -13,7 +13,7 @@ split_vec(vec) Split a vector by values } \examples{ -\donttest{ +\dontrun{ ## M library(mzion) library(microbenchmark) diff --git a/man/sub_td_byfdrtype.Rd b/man/sub_td_byfdrtype.Rd new file mode 100644 index 0000000..fb9a3b9 --- /dev/null +++ b/man/sub_td_byfdrtype.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/scores.R +\name{sub_td_byfdrtype} +\alias{sub_td_byfdrtype} +\title{Subsets targets and decoys by fdr_type.} +\usage{ +sub_td_byfdrtype(td, fdr_type) +} +\arguments{ +\item{td}{A data frame of targets and decoys.} + +\item{fdr_type}{A character string; the type of FDR control. The value is in + one of c("protein", "peptide", "psm"). The default is \code{protein}. + + Note that \code{fdr_type = protein} is comparable to \code{fdr_type = + peptide} with the additional filtration of data at \code{prot_tier == 1}.} +} +\description{ +Subsets targets and decoys by fdr_type. 
+} diff --git a/man/try_psmC2Q.Rd b/man/try_psmC2Q.Rd index d6200c8..f969ad6 100644 --- a/man/try_psmC2Q.Rd +++ b/man/try_psmC2Q.Rd @@ -48,11 +48,11 @@ or decoy peptides, as well as decoy proteins.} Tier-3: one significant peptide per protein and protein scores below significance thresholds.} -\item{max_n_prots}{A positive integer to threshold the maximum number of - protein entries before coercing \code{fdr_type} from \code{psm} or - \code{peptide} to \code{protein}. The argument has no effect if - \code{fdr_type} is already \code{protein}. In general, there is no need to - change the default. +\item{max_n_prots}{Softly deprecated. A positive integer to threshold the + maximum number of protein entries before coercing \code{fdr_type} from + \code{psm} or \code{peptide} to \code{protein}. The argument has no effect + if \code{fdr_type} is already \code{protein}. In general, there is no need + to change the default. Note that for memory efficiency proteins at tiers 1, 2 and 3 are grouped separately. Further note that there is no tier-2 proteins at diff --git a/man/vec_to_list.Rd b/man/vec_to_list.Rd index e52be85..2c68620 100644 --- a/man/vec_to_list.Rd +++ b/man/vec_to_list.Rd @@ -13,7 +13,7 @@ vec_to_list(x) Split a named character vector to lists. } \examples{ -\donttest{ +\dontrun{ library(mzion) library(microbenchmark) diff --git a/vignettes/README.Rmd b/vignettes/README.Rmd index eb569a1..e11a198 100644 --- a/vignettes/README.Rmd +++ b/vignettes/README.Rmd @@ -73,6 +73,10 @@ devtools::install_github("qzhang503/mzion") ## Help documents Enter `?mzion::matchMS` from an R console. +## Specifications of fixed and variable modifications + +The Unimod definitions of positions and sites were adopted by Mzion for specifying fixed and variable modifications. The value of a position is one of "Anywhere", "Protein N-term", "Protein C-term", "Any N-term" or "Any C-term". The last two position labels can be shortened to "N-term" and "C-term".
A site is a one-letter representation of the twenty amino-acid residues, as well as the terminal sites of "N-term" and "C-term". The general format in specifying a fixed or variable modification is `title (position = site)` where title is a unique character string without spaces. At a position of "Anywhere", the modification can be shortened to `title (site)`, for example, `TMT10plex (K)`. For a terminal modification at any site, it can be abbreviated as `title (position)`, for example, `Acetyl (Protein N-term)` and `TMT10plex (N-term)`. There are circumstances in which both position and site are needed for specifying a modification, for instance, `Gln->pyro-Glu (N-term = Q)`. More examples are available in the help document of the Mzion utility `parse_unimod`. + ## Database searches ``` r @@ -135,3 +139,5 @@ matchMS( - `remove_unimod`: removes a Unimod entry - `remove_unimod_title`: removes a Unimod entry by title - `make_mztab`: prepares a mzTab file from the search results + +