diff --git a/DESCRIPTION b/DESCRIPTION index 8b5e977..7d0bc8e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: proteoM Type: Package Title: Database Searches of Proteomic Mass-spectrometrirc Data -Version: 1.2.2.1 +Version: 1.2.3 Authors@R: person(given = "Qiang", family = "Zhang", diff --git a/R/bin_masses.R b/R/bin_masses.R index e6358db..67eccad 100644 --- a/R/bin_masses.R +++ b/R/bin_masses.R @@ -296,14 +296,11 @@ binTheoSeqs <- function (idxes = NULL, res = NULL, min_mass = 200L, out_dir <- create_dir(gsub("(^.*/).*$", "\\1", out_path)) - out_nms <- gsub("^.*/(.*)\\.[^\\.].*$", "\\1", out_path) %>% - paste(idxes, sep = "_") %>% - paste0(".rds") - - res <- res %>% - lapply(attributes) %>% - lapply(`[[`, "data") + out_nms <- gsub("^.*/(.*)\\.[^\\.].*$", "\\1", out_path) + out_nms <- paste0(out_nms, "_", idxes, ".rds") + res <- lapply(res, attributes) + res <- lapply(res, `[[`, "data") gc() n_cores <- local({ diff --git a/R/fastas.R b/R/fastas.R index 2a4b10a..d1c0eb4 100644 --- a/R/fastas.R +++ b/R/fastas.R @@ -120,15 +120,13 @@ load_fasta <- function (fasta = NULL) oks <- file.exists(fasta) - if (!all(oks)) { - bads <- fasta[!oks] - stop("Missing FASTA file(s): \n", paste(bads, collapse = "\n")) - } + if (!all(oks)) + stop("Missing FASTA file(s): \n", paste(fasta[!oks], collapse = "\n")) - lapply(fasta, function (x) read_fasta(x)) %>% - do.call(`c`, .) 
%>% - `names<-`(gsub(">", "", names(.))) %>% - .[!duplicated(names(.))] + ans <- lapply(fasta, function (x) read_fasta(x)) + ans <- flatten_list(ans) + names(ans) <- gsub(">", "", names(ans)) + ans <- ans[!duplicated(names(ans))] } ### End of also in proteoQ @@ -155,30 +153,26 @@ load_fasta <- function (fasta = NULL) #' fasta_db <- load_fasta2( #' c("~/proteoM/dbs/fasta/uniprot/uniprot_hs_2020_05.fasta", #' "~/proteoM/dbs/fasta/crap/crap.fasta"), -#' c("uniprot_acc", "other") -#' ) +#' c("uniprot_acc", "other")) #' #' # Need `acc_pattern` as "crap" is not one of the default acc_type #' load_fasta2( #' c("~/proteoM/dbs/fasta/uniprot/uniprot_hs_2020_05.fasta", #' "~/proteoM/dbs/fasta/crap/crap.fasta"), -#' c("uniprot_acc", "crap") -#' ) +#' c("uniprot_acc", "crap")) #' #' # ok #' fasta_db2 <- load_fasta2( #' c("~/proteoM/dbs/fasta/uniprot/uniprot_hs_2020_05.fasta", #' "~/proteoM/dbs/fasta/crap/crap.fasta"), #' c("uniprot_acc", "crap"), -#' c("^>..\\|([^\\|]+)\\|[^\\|]+", "(.*)") -#' ) +#' c("^>..\\|([^\\|]+)\\|[^\\|]+", "(.*)")) #' #' fasta_db3 <- load_fasta2( #' c("~/proteoM/dbs/fasta/uniprot/uniprot_hs_2020_05.fasta", #' "~/proteoM/dbs/fasta/crap/crap.fasta"), #' c("my_acc", "crap"), -#' c("^>..\\|([^\\|]+)\\|[^\\|]+", "(.*)") -#' ) +#' c("^>..\\|([^\\|]+)\\|[^\\|]+", "(.*)")) #' #' stopifnot(identical(fasta_db, fasta_db2), #' identical(fasta_db, fasta_db3)) @@ -191,10 +185,8 @@ load_fasta2 <- function (fasta = NULL, acc_type = NULL, acc_pattern = NULL) oks <- file.exists(fasta) - if (!all(oks)) { - bads <- fasta[!oks] - stop("Missing FASTA file(s): \n", paste(bads, collapse = "\n")) - } + if (!all(oks)) + stop("Missing FASTA file(s): \n", paste(fasta[!oks], collapse = "\n")) len_f <- length(fasta) len_a <- length(acc_type) @@ -204,8 +196,7 @@ load_fasta2 <- function (fasta = NULL, acc_type = NULL, acc_pattern = NULL) stop("More accession types than fasta files.") if (len_f < len_p) - stop("More acc_pattern types than fasta files.", - call. 
= FALSE) + stop("More acc_pattern types than fasta files.") if (len_a && (len_a < len_f)) { warning("More fasta files than accession types; ", @@ -219,7 +210,7 @@ load_fasta2 <- function (fasta = NULL, acc_type = NULL, acc_pattern = NULL) acc_pattern <- rep(acc_pattern[1], len_f) } - if (! (is.null(acc_type) || is.null(acc_pattern))) { + if (!(is.null(acc_type) || is.null(acc_pattern))) { acc_type <- acc_type acc_pattern <- acc_pattern } @@ -228,10 +219,12 @@ load_fasta2 <- function (fasta = NULL, acc_type = NULL, acc_pattern = NULL) acc_pattern <- rep("(.*)", len_f) } else if (!is.null(acc_type)) { - acc_pattern <- purrr::map_chr(acc_type, find_acc_pattern) + acc_pattern <- lapply(acc_type, find_acc_pattern) + acc_pattern <- unlist(acc_pattern, recursive = FALSE, use.names = FALSE) } else { - acc_type <- purrr::map_chr(acc_pattern, find_acc_type) + acc_type <- lapply(acc_pattern, find_acc_type) + acc_type <- unlist(acc_type, recursive = FALSE, use.names = FALSE) } if (length(acc_pattern) != len_f) @@ -239,11 +232,11 @@ load_fasta2 <- function (fasta = NULL, acc_type = NULL, acc_pattern = NULL) # Not to USE.NAMES; otherwise fasta names prefix to accession names # this is different to map2 where names are NULL for each fasta_db - mapply(function (x, y) read_fasta(x, y), fasta, acc_pattern, - SIMPLIFY = FALSE, USE.NAMES = FALSE) %>% - do.call(`c`, .) 
%>% - `names<-`(gsub(">", "", names(.))) %>% - .[!duplicated(names(.))] + ans <- mapply(function (x, y) read_fasta(x, y), fasta, acc_pattern, + SIMPLIFY = FALSE, USE.NAMES = FALSE) + ans <- flatten_list(ans) + names(ans) <- gsub(">", "", names(ans)) + ans <- ans[!duplicated(names(ans))] } @@ -262,23 +255,16 @@ find_acc_pattern <- function (acc_type) if (!acc_type %in% oks) stop("`acc_type` is not one of ", paste(oks, collapse = ", ")) - acc_pattern <- if (acc_type == "uniprot_acc") { + if (acc_type == "uniprot_acc") "^>..\\|([^\\|]+)\\|[^\\|]+" - } - else if (acc_type == "uniprot_id") { + else if (acc_type == "uniprot_id") "^>..\\|[^\\|]+\\|([^ ]+) .*" - } - else if (acc_type == "refseq_acc") { + else if (acc_type == "refseq_acc") "^>([^ ]+?) .*" - } - else if (acc_type == "other") { + else if (acc_type == "other") "(.*)" - } - else { + else stop("Unknown `acc_type`.") - } - - invisible(acc_pattern) } @@ -302,22 +288,16 @@ find_acc_type <- function (acc_pattern) pat_rsacc <- "^>([^ ]+?) " pat_other <- "(.*)" - acc_type <- if (acc_pattern == pat_upacc) { + if (acc_pattern == pat_upacc) "uniprot_acc" - } - else if (acc_pattern == pat_upid) { + else if (acc_pattern == pat_upid) "uniprot_id" - } - else if (acc_pattern == pat_rsacc) { + else if (acc_pattern == pat_rsacc) "refseq_acc" - } - else if (acc_pattern == pat_other) { + else if (acc_pattern == pat_other) "other" - } - else { + else stop("Unknown `acc_pattern`.") - } - - invisible(acc_type) } + diff --git a/R/mgfs.R b/R/mgfs.R index f69dff9..bf78f32 100644 --- a/R/mgfs.R +++ b/R/mgfs.R @@ -237,8 +237,7 @@ readMGF <- function (filepath = NULL, filelist = NULL, # (2) parallel five MGF files and parallel chunks in each len <- length(filelist) n_cores <- min(len, detect_cores(32L)) - # n_cores <- min(detect_cores(32L), floor((find_free_mem()/1024)/(sizes * 8)), len) - + if (n_cores == 1L) raw_files <- readlineMGFs(1, filelist, filepath, raw_file) else { @@ -323,10 +322,9 @@ readMGF <- function (filepath = NULL, 
filelist = NULL, post_readmgf <- function (df, min_mass = 200L, max_mass = 4500L, ppm_ms1 = 10L, filepath, out_path) { - df <- df %>% - dplyr::arrange(ms1_mass) %>% - # dplyr::filter(ms1_mass >= min_mass, ms1_mass <= max_mass) %>% - dplyr::mutate(frame = find_ms1_interval(ms1_mass, from = min_mass, ppm = ppm_ms1)) + df <- dplyr::arrange(df, ms1_mass) + # df <- dplyr::filter(df, ms1_mass >= min_mass, ms1_mass <= max_mass) + df <- dplyr::mutate(df, frame = find_ms1_interval(ms1_mass, from = min_mass, ppm = ppm_ms1)) raws_files <- df$raw_file raws <- raws_files[!duplicated.default(raws_files)] @@ -450,8 +448,6 @@ read_mgf_chunks <- function (filepath = "~/proteoM/mgf/temp_1", n_cores <- min(detect_cores(32L), len) cl <- parallel::makeCluster(getOption("cl.cores", n_cores)) - parallel::clusterExport(cl, list("%>%"), envir = environment(magrittr::`%>%`)) - parallel::clusterExport( cl, c("stri_startswith_fixed", @@ -508,7 +504,7 @@ read_mgf_chunks <- function (filepath = "~/proteoM/mgf/temp_1", digits = digits) parallel::stopCluster(cl) - gc() + # gc() out <- dplyr::bind_rows(out) @@ -536,9 +532,10 @@ read_mgf_chunks <- function (filepath = "~/proteoM/mgf/temp_1", # perfect case of no gaps: two lines of "" and "" if (length(ab) > 2L) ab else NULL - }) %>% - unlist(use.names = FALSE) %T>% - write(file.path(filepath, "gaps.mgf")) + }) + + gaps <- unlist(gaps, use.names = FALSE) + write(gaps, file.path(filepath, "gaps.mgf")) local({ nms <- list.files(path = file.path(filepath), pattern = "^.*\\_[ab]f.mgf$") @@ -620,11 +617,10 @@ proc_mgf_chunks <- function (file, topn_ms2ions = 100L, { message("Parsing '", file, "'.") lines <- stringi::stri_read_lines(file) - basename <- gsub("\\.[^.]*$", "", file) - - begins <- which(stringi::stri_startswith_fixed(lines, "BEGIN IONS")) - ends <- which(stringi::stri_endswith_fixed(lines, "END IONS")) + + begins <- .Internal(which(stringi::stri_startswith_fixed(lines, "BEGIN IONS"))) + ends <- 
.Internal(which(stringi::stri_endswith_fixed(lines, "END IONS"))) af <- local({ le <- ends[length(ends)] @@ -708,8 +704,8 @@ proc_mgfs <- function (lines, topn_ms2ions = 100L, { options(digits = 9L) - begins <- which(stringi::stri_startswith_fixed(lines, "BEGIN IONS")) - ends <- which(stringi::stri_endswith_fixed(lines, "END IONS")) + begins <- .Internal(which(stringi::stri_startswith_fixed(lines, "BEGIN IONS"))) + ends <- .Internal(which(stringi::stri_endswith_fixed(lines, "END IONS"))) ## MS1 # (1) m-over-z and intensity @@ -751,7 +747,7 @@ proc_mgfs <- function (lines, topn_ms2ions = 100L, !is.na(ms1_masses)) # timsTOF data may have undetermined charge states - na_rows <- which(is.na(rows)) + na_rows <- .Internal(which(is.na(rows))) if (length(na_rows)) rows[na_rows] <- FALSE begins <- begins[rows] @@ -1452,7 +1448,7 @@ read_mzml <- function (xml_file, tmt_reporter_lower = 126.1, tmt_reporter_upper ## spectrum xml_root <- xml2::read_xml(xml_file) mzML <- xml2::xml_child(xml_root) - idx_run <- which(xml2::xml_name(xml2::xml_children(mzML)) == "run") # 8 + idx_run <- which(xml2::xml_name(xml2::xml_children(mzML)) == "run") run <- xml2::xml_children(mzML)[[idx_run]] idx_specs <- which(xml2::xml_name(xml2::xml_children(run)) == "spectrumList") spec <- xml2::xml_children(xml2::xml_children(run)[[idx_specs]]) @@ -1468,14 +1464,14 @@ read_mzml <- function (xml_file, tmt_reporter_lower = 126.1, tmt_reporter_upper x <- spec[[i]] scan_nums[i] <- gsub(".* scan=(.*)$", "\\1", xml2::xml_attr(x, "id")) xc <- xml2::xml_children(x) - idx_precursor <- grep("precursorList", xc) # 12 + idx_precursor <- grep("precursorList", xc) rm(list = c("x")) if (length(idx_precursor)) { nms <- xml2::xml_attr(xc, "name") - idx_title <- which(nms == "spectrum title") # 10 + idx_title <- .Internal(which(nms == "spectrum title")) idx_scanList <- grep("scanList", xc) # 11 - idx_bin <- grep("binaryDataArrayList", xc) # 13 + idx_bin <- grep("binaryDataArrayList", xc) ## title title <- 
xml2::xml_attr(xc[[idx_title]], "value") @@ -1486,7 +1482,8 @@ read_mzml <- function (xml_file, tmt_reporter_lower = 126.1, tmt_reporter_upper scanList <- xml2::xml_children(xc[[idx_scanList]]) idx_rt <- grep("scan", scanList) # 2 scanList_scan <- xml2::xml_children(scanList[[idx_rt]]) - idx_scan_start <- which(xml2::xml_attr(scanList_scan, "name") == "scan start time") # 1 + idx_scan_start <- + .Internal(which(xml2::xml_attr(scanList_scan, "name") == "scan start time")) ret_times[i] <- xml2::xml_attr(scanList_scan[[idx_scan_start]], "value") rm(list = c("nms", "title", "scanList_scan", "scanList")) diff --git a/R/ms1_precursors.R b/R/ms1_precursors.R index 17f06a6..48b1661 100644 --- a/R/ms1_precursors.R +++ b/R/ms1_precursors.R @@ -549,11 +549,12 @@ calc_pepmasses2 <- function (aa_masses = NULL, min_mass = min_mass, max_mass = max_mass, digits = digits - ) %>% - flatten_list() %>% - unlist(recursive = FALSE, use.names = TRUE) - + ) + parallel::stopCluster(cl) + + fwd_peps[[i]] <- flatten_list(fwd_peps[[i]]) + fwd_peps[[i]] <- unlist(fwd_peps[[i]], recursive = FALSE, use.names = TRUE) gc() } } @@ -591,6 +592,18 @@ calc_pepmasses2 <- function (aa_masses = NULL, "amods+ tmod+ vnl+ fnl+")) if (length(inds)) { + cl <- parallel::makeCluster(getOption("cl.cores", n_cores)) + + parallel::clusterExport( + cl, + c("hms1_a1_vnl0_fnl0", + "ms1_a1_vnl0_fnl0", + "match_mvmods", + "expand_grid_rows", + "recur_flatten", + "delta_ms1_a0_fnl1"), + envir = environment(proteoM:::ms1_a1_vnl0_fnl0)) + for (i in inds) { amods_i <- amods[[i]] aa_masses_i <- aa_masses_all[[i]] @@ -610,19 +623,7 @@ calc_pepmasses2 <- function (aa_masses = NULL, vmods_nl_i = vmods_nl[[i]] fmods_nl_i = fmods_nl[[i]] - - cl <- parallel::makeCluster(getOption("cl.cores", n_cores)) - - parallel::clusterExport( - cl, - c("hms1_a1_vnl0_fnl0", - "ms1_a1_vnl0_fnl0", - "match_mvmods", - "expand_grid_rows", - "recur_flatten", - "delta_ms1_a0_fnl1"), - envir = environment(proteoM:::ms1_a1_vnl0_fnl0)) - + 
fwd_peps[[i]] <- parallel::clusterApply( cl, chunksplit(fwd_peps_i, n_cores, "list"), @@ -636,19 +637,19 @@ calc_pepmasses2 <- function (aa_masses = NULL, ms1vmods = ms1vmods_i, min_mass = min_mass, max_mass = max_mass, - digits = digits - ) %>% - flatten_list() %>% - unlist(recursive = FALSE, use.names = TRUE) - - parallel::stopCluster(cl) - gc() + digits = digits) + + fwd_peps[[i]] <- flatten_list(fwd_peps[[i]]) + fwd_peps[[i]] <- unlist(fwd_peps[[i]], recursive = FALSE, use.names = TRUE) message("\tCompleted peptide masses: ", paste(attributes(aa_masses_i)$fmods, "|", attributes(aa_masses_i)$vmods, collapse = ", ")) } + + parallel::stopCluster(cl) + gc() } suppressWarnings( @@ -1062,6 +1063,15 @@ calc_aamasses <- function (fixedmods = c("TMT6plex (K)", save_mod_indexes(out_path, fixedmods, varmods, f_to_v) qs::qsave(aa_masses_ms1, file.path(out_path, "aa_masses_ms1.rds"), preset = "fast") qs::qsave(aa_masses_all, file.path(out_path, "aa_masses_all.rds"), preset = "fast") + + fmods <- lapply(aa_masses_all, attr, "fmods", exact = TRUE) + vmods <- lapply(aa_masses_all, attr, "vmods", exact = TRUE) + + readr::write_tsv( + data.frame(pep_fmod = unlist(fmods, recursive = FALSE), + pep_vmod = unlist(vmods, recursive = FALSE), + pep_mod_group = seq_along(aa_masses_all)), + file.path(out_path, "summary_mod_groups.txt")) } invisible(aa_masses_all) @@ -1119,10 +1129,10 @@ save_mod_indexes <- function (out_path = NULL, fixedmods, varmods, f_to_v) if (is.null(out_path)) return(NULL) - mod_indexes <- seq_along(c(fixedmods, varmods)) %>% - as.hexmode() %>% - `names<-`(c(fixedmods, varmods)) - + mod_indexes <- seq_along(c(fixedmods, varmods)) + mod_indexes <- as.hexmode(mod_indexes) + names(mod_indexes) <- c(fixedmods, varmods) + is_coerced <- if (length(f_to_v)) names(mod_indexes) %in% f_to_v else @@ -1253,12 +1263,13 @@ find_f_to_v <- function (fixedmods, fmods_ps, vmods_ps) # e.g. 
"N-term" can be matched by both site and position # (no guarantee in the order of coerce_sites; so match names one at a time) - coerce_sites <- unique(c(coerce_asites, coerce_tsites)) %>% - lapply(function (x) { - names(x) <- fixedmods[fmods_ps == x] - x - }) + coerce_sites <- unique(c(coerce_asites, coerce_tsites)) + coerce_sites <- lapply(coerce_sites, function (x) { + names(x) <- fixedmods[fmods_ps == x] + x + }) + unlist(coerce_sites) } @@ -1292,9 +1303,9 @@ find_aamasses_vmodscombi <- function (varmods = NULL, f_to_v = NULL, varmods_comb <- unlist(varmods_comb, recursive = FALSE) vmods_ps <- find_modps(varmods) - vmods_ps_combi <- seq_along(vmods_ps) %>% - lapply(function (x) sim_combn(vmods_ps, x)) %>% - flatten_list() + vmods_ps_combi <- seq_along(vmods_ps) + vmods_ps_combi <- lapply(vmods_ps_combi, function (x) sim_combn(vmods_ps, x)) + vmods_ps_combi <- flatten_list(vmods_ps_combi) ## Remove the combinations without anywhere_coerce_sites # [x] e.g. "TMT6plex (K)" coerced from fixedmod to varmod, @@ -1598,9 +1609,9 @@ check_resunimod <- function (res) check_fmods_pos_site <- function (positions_sites) { if (length(positions_sites) > 1L) { - dups <- purrr::reduce(positions_sites, `c`) %>% - .[duplicated(.)] - + dups <- purrr::reduce(positions_sites, `c`) + dups <- dups[duplicated(dups)] + if (length(dups)) { dups_in_each <- lapply(positions_sites, function (x) x == dups) dup_mods <- names(positions_sites[unlist(dups_in_each)]) diff --git a/R/ms2_a1_vnl1_fnl0.R b/R/ms2_a1_vnl1_fnl0.R index 2c4ea82..8ba4b71 100644 --- a/R/ms2_a1_vnl1_fnl0.R +++ b/R/ms2_a1_vnl1_fnl0.R @@ -277,7 +277,7 @@ gen_ms2ions_a1_vnl1_fnl0 <- function (aa_seq = NULL, ms1_mass = NULL, aam <- aa_masses[aas] ms1vmods <- match_mvmods(aas = aas, ms1vmods = ms1vmods, amods = amods) - oks <- ms1vmods$inds + oks <- ms1vmods[["inds"]] ms2vmods <- ms2vmods[oks] vmods_combi <- find_vmodscombi(aas = aas, ms2vmods = ms2vmods, @@ -295,20 +295,11 @@ gen_ms2ions_a1_vnl1_fnl0 <- function (aa_seq = NULL, 
ms1_mass = NULL, return(NULL) vnl_combi <- lapply(vmods_combi, function (x) expand_grid_rows(vmods_nl[x])) + vnl_combi <- lapply(vnl_combi, function (x) if (length(x) > maxn_vnl_per_seq) x[1:maxn_vnl_per_seq] else x) - ## --- (tentative) to restricts the total number of vnl_combi's - # nrows <- lapply(vnl_combi, function (x) length(attributes(x)$row.names)) # faster than nrow - # nrows <- .Internal(unlist(nrows, recursive = FALSE, use.names = FALSE)) - # counts <- cumsum(nrows) - - # oks <- which(counts <= maxn_vmods_sitescombi_per_pep) - # vnl_combi <- vnl_combi[oks] - # vmods_combi <- vmods_combi[oks] - ## --- - - # 725 us + # theoretical MS2 of forward sequences af <- mapply( calc_ms2ions_a1_vnl1_fnl0, vmods_combi = vmods_combi, @@ -324,15 +315,12 @@ gen_ms2ions_a1_vnl1_fnl0 <- function (aa_seq = NULL, ms1_mass = NULL, SIMPLIFY = FALSE, USE.NAMES = FALSE) - # 360 us - len <- length(aas) - af <- mapply( add_hexcodes_vnl2, ms2ions = af, vmods_combi = vmods_combi, MoreArgs = list( - len = len, + len = length(aas), mod_indexes = mod_indexes), SIMPLIFY = FALSE, USE.NAMES = FALSE) @@ -342,8 +330,9 @@ gen_ms2ions_a1_vnl1_fnl0 <- function (aa_seq = NULL, ms1_mass = NULL, if (length(af) > maxn_vmods_sitescombi_per_pep) af <- af[1:maxn_vmods_sitescombi_per_pep] - # need to fix hex tags which will not be used; - # for time efficiency just leave as + # hexcodes of the reversed entries are not yet reversed; + # they are not used and for time efficiency just leave them as are + # names are `pep_ivmod`; NA is the indicator for reversed entries av <- lapply(af, calc_rev_ms2, aas) names(av) <- NA_character_ c(af, av) @@ -366,25 +355,30 @@ calc_ms2ions_a1_vnl1_fnl0 <- function (vmods_combi, vnl_combi, aam, aa_masses, { # updates vmod masses delta_amod <- aa_masses[vmods_combi] - idxes <- as.numeric(names(vmods_combi)) + idxes <- as.integer(names(vmods_combi)) aam[idxes] <- aam[idxes] + delta_amod # updates vnl masses len <- length(vnl_combi) out <- vector("list", len) - for (i in 
1:len) { - aam_i <- aam - delta_nl <- .Internal(unlist(vnl_combi[[i]], recursive = FALSE, use.names = FALSE)) - aam_i[idxes] <- aam_i[idxes] - delta_nl - out[[i]] <- ms2ions_by_type(aam_i, ntmass, ctmass, type_ms2ions, digits) + # the first vnl masses are always all zeros + out[[1]] <- ms2ions_by_type(aam, ntmass, ctmass, type_ms2ions, digits) + + if (len > 1L) { + for (i in 2:len) { + aam_i <- aam + delta_nl <- .Internal(unlist(vnl_combi[[i]], recursive = FALSE, use.names = FALSE)) + aam_i[idxes] <- aam_i[idxes] - delta_nl + out[[i]] <- ms2ions_by_type(aam_i, ntmass, ctmass, type_ms2ions, digits) + } } invisible(out) } -#' Adds hex codes (with variable NLs). +#' Adds hexcodes (with variable NLs). #' #' To indicate the variable modifications of an amino acid sequence. #' @@ -393,11 +387,9 @@ calc_ms2ions_a1_vnl1_fnl0 <- function (vmods_combi, vnl_combi, aam, aa_masses, #' @inheritParams ms2match add_hexcodes_vnl2 <- function (ms2ions, vmods_combi, len, mod_indexes = NULL) { - # idxes <- .Internal(unlist(vmods_combi, recursive = FALSE, use.names = FALSE)) nms <- names(vmods_combi) hexs <- rep("0", len) - # hexs[as.integer(nms)] <- mod_indexes[idxes] hexs[as.integer(nms)] <- mod_indexes[vmods_combi] hexs <- .Internal(paste0(list(hexs), collapse = "", recycle0 = FALSE)) diff --git a/R/ms2_base.R b/R/ms2_base.R index 04505cf..1895293 100644 --- a/R/ms2_base.R +++ b/R/ms2_base.R @@ -870,7 +870,7 @@ find_ms2_bypep <- function (theos = NULL, expts = NULL, ex = NULL, d = NULL, ## forward matches ps <- th_i %fin% ex | (th_i - 1L) %fin% ex | (th_i + 1L) %fin% ex - ips <- which(ps) + ips <- .Internal(which(ps)) ## "ith = ips" in ascending order, not "iex = ips_12" @@ -894,12 +894,12 @@ find_ms2_bypep <- function (theos = NULL, expts = NULL, ex = NULL, d = NULL, # b-ions y_1 <- th_i[1:mid] ps_1 <- ex %fin% y_1 | ex_bf %fin% y_1 | ex_af %fin% y_1 - ips_1 <- which(ps_1) - + ips_1 <- .Internal(which(ps_1)) + # y-ions y_2 <- th_i[(mid+1L):lth] ps_2 <- ex %fin% y_2 | ex_bf %fin% 
y_2 | ex_af %fin% y_2 - ips_2 <- which(ps_2) + ips_2 <- .Internal(which(ps_2)) # b- and y-ions expt_1 <- expts[ips_1] diff --git a/R/msmsmatches.R b/R/msmsmatches.R index 6879295..e33c875 100644 --- a/R/msmsmatches.R +++ b/R/msmsmatches.R @@ -269,7 +269,7 @@ #' @param fdr_type A character string; the type of FDR control. The value is in #' one of c("protein", "peptide", "psm"). The default is \code{protein}. #' -#' Note that \code{fdr_type = protein} is equivalent to \code{fdr_type = +#' Note that \code{fdr_type = protein} is comparable to \code{fdr_type = #' peptide} with the additional filtration of data at \code{prot_tier == 1}. #' @param fdr_group A character string; the modification group(s) for uses in #' peptide FDR controls. The value is in one of c("all", "base"). The @@ -277,8 +277,7 @@ #' of matches. #' @param max_pepscores_co A positive numeric; the upper limit in the cut-offs #' of peptide scores for discriminating significant and insignificant -#' identities. The default is changed from \code{Inf} to 50 from version -#' 1.1.9.2 on. +#' identities. #' @param min_pepscores_co A non-negative numeric; the lower limit in the #' cut-offs of peptide scores for discriminating significant and insignificant #' identities. @@ -300,10 +299,10 @@ #' pep_score_cutoff} under a protein will be used to represent the threshold #' of a protein enrichment score. For more conserved thresholds, the #' statistics of \code{"max"} may be considered. -#' @param soft_secions Logical; if TRUE, collapses the intensities of secondary -#' ions to primary ions at the absence of the primaries. The default is FALSE. -#' For instance, the signal of \code{b5^*} will be ignored if its primary ion -#' \code{b5} is not matched. +#' @param soft_secions Deprecated. Logical; if TRUE, collapses the intensities +#' of secondary ions to primary ions in the absence of the primaries. The +#' default is FALSE. 
For instance, the signal of \code{b5^*} will be ignored +#' if its primary ion \code{b5} is not matched. #' @param topn_seqs_per_query Positive integer; a threshold to discard peptide #' matches under the same MS query with scores beyond the top-n. #' @@ -319,17 +318,20 @@ #' #' For a variable modification with multiple neutral losses (NL), the #' best-scored NL will be used in the ranking. -#' @param combine_tier_three Logical; if TRUE, combines search results at tiers -#' 1, 2 and 3 to the single output of \code{psmQ.txt}. The default is FALSE in -#' that data will be segregated into the three quality tiers according to the -#' choice of \code{fdr_type}. The (convenience) parameter matters since -#' \href{http://github.com/qzhang503/proteoQ}{proteoQ} will only look for the -#' inputs of \code{psmQ[...].txt}. -#' -#' For instance, if the aim is to bypass the constraint by protein FDR and -#' focus on PSMs that have met the cut-offs specified by \code{target_fdr}, an -#' experimenter may set \code{combine_tier_three = TRUE} and hence pool all -#' significant peptides in \code{psmQ.txt} for downstream proteoQ. +#' @param combine_tier_three Logical; if TRUE, combines search results at tier-3 +#' to tier-1 to form the single output of \code{psmQ.txt}. The default is +#' FALSE in that data will be segregated into the three quality tiers (shown +#' below) by the choice of \code{fdr_type}. Note that the argument takes +#' effect only at the \code{fdr_type} of \code{psm} or \code{peptide} where +#' there are no tier-2 outputs. In general, the tier-3 results correspond to +#' one-hit-wonders and setting \code{combine_tier_three = TRUE} is +#' discouraged. +#' +#' In subproteome analysis, such as phosphoproteome analysis, some proteins +#' may be well established globally, but fail the significance assessment by +#' protein FDR on the local scale. 
In situations like this, it may be suitable +#' to apply \code{fdr_type = "peptide"} or \code{fdr_type = "psm"} rather than +#' setting \code{combine_tier_three = TRUE}. #' #' Tier-1: both proteins and peptides with scores above significance #' thresholds. @@ -382,9 +384,6 @@ #' file folder for disk space or under infrequent events of modified framework #' incurred by the developer. #' -#' Started from version 1.2.1.5, a new data structure of protein-peptide -#' lookups are used. Version 1.2.1.4.1 is the final version using the old data -#' structure. #' @param .path_fasta The parent file path to the theoretical masses of MS1 #' precursors. At the NULL default, the path is \code{gsub("(.*)\\.[^\\.]*$", #' "\\1", get("fasta", envir = environment())[1])}. The parameter is for the @@ -634,17 +633,19 @@ matchMS <- function (out_path = "~/proteoM/outs", "SemiGluN", "SemiAspC", "SemiAspN", "Noenzyme", "Nodigest"), custom_enzyme = c(Cterm = NULL, Nterm = NULL), - nes_fdr_group = c("all", "all_cterm_tryptic", - "all_cterm_nontryptic", "base", - "base_cterm_tryptic", - "base_cterm_nontryptic"), + nes_fdr_group = c("base", "base_cterm_tryptic", + "base_cterm_nontryptic", + "all", "all_cterm_tryptic", + "all_cterm_nontryptic", + "top3", "top3_cterm_tryptic", + "top3_cterm_nontryptic"), noenzyme_maxn = 0L, maxn_fasta_seqs = 200000L, maxn_vmods_setscombi = 512L, maxn_vmods_per_pep = 5L, maxn_sites_per_vmod = 3L, - maxn_fnl_per_seq = 64L, - maxn_vnl_per_seq = 64L, + maxn_fnl_per_seq = 8L, + maxn_vnl_per_seq = 8L, maxn_vmods_sitescombi_per_pep = 64L, min_len = 7L, max_len = 40L, max_miss = 2L, min_mass = 200L, max_mass = 4500L, @@ -669,7 +670,7 @@ matchMS <- function (out_path = "~/proteoM/outs", target_fdr = 0.01, fdr_type = c("protein", "peptide", "psm"), - fdr_group = c("all", "base"), + fdr_group = c("base", "all", "top3"), max_pepscores_co = 50, min_pepscores_co = 0, max_protscores_co = Inf, max_protnpep_co = 10L, @@ -931,10 +932,13 @@ matchMS <- function (out_path = 
"~/proteoM/outs", } # fdr_group + + # for future supports of character strings or integers (mod_groups) + # fdr_group <- check_fdr_group(fdr_group, eval(this_fml[["fdr_group"]])) oks <- eval(this_fml[["fdr_group"]]) fdr_group <- substitute(fdr_group) - if (length(fdr_group) > 1L) + if (length(fdr_group) > 1L && identical(eval(fdr_group), oks)) fdr_group <- oks[1] else { fdr_group <- as.character(fdr_group) @@ -942,12 +946,12 @@ matchMS <- function (out_path = "~/proteoM/outs", if (!fdr_group %in% oks) stop("Incorrect `fdr_group`.") } - + # nes_fdr_group oks <- eval(this_fml[["nes_fdr_group"]]) nes_fdr_group <- substitute(nes_fdr_group) - if (length(nes_fdr_group) > 1L) + if (length(nes_fdr_group) > 1L && identical(eval(nes_fdr_group), oks)) nes_fdr_group <- oks[1] else { nes_fdr_group <- as.character(nes_fdr_group) @@ -1284,7 +1288,7 @@ matchMS <- function (out_path = "~/proteoM/outs", calc_pepscores(topn_ms2ions = topn_ms2ions, type_ms2ions = type_ms2ions, target_fdr = target_fdr, - fdr_type = fdr_type, + # fdr_type = fdr_type, # not used min_len = min_len, max_len = max_len, ppm_ms2 = ppm_ms2, @@ -1449,7 +1453,7 @@ matchMS <- function (out_path = "~/proteoM/outs", df <- try_psmC2Q(df, out_path = out_path, - fdr_type = fdr_type, + fdr_type = fdr_type, # for workflow controls combine_tier_three = combine_tier_three, max_n_prots = max_n_prots) @@ -1587,6 +1591,7 @@ reproc_psmC <- function (out_path = NULL, fdr_type = "protein", #' @param fct A factor for data splitting into chunks. May consider a greater #' value for a larger data set. 
#' @inheritParams matchMS +#' @importFrom fastmatch %fin% psmC2Q <- function (df = NULL, out_path = NULL, fdr_type = "protein", combine_tier_three = FALSE, max_n_prots = 60000L, fct = 4L) @@ -1595,7 +1600,7 @@ psmC2Q <- function (df = NULL, out_path = NULL, fdr_type = "protein", # if (!all(df[["pep_issig"]])) stop("Developer: filter data by \"pep_issig\" first.") # if (any(df[["pep_isdecoy"]])) stop("Developer: remove decoy peptide first.") - # if (any(grepl("^-", df["prot_acc"]))) stop("Developr: remove decoy proteins first.") + # if (any(grepl("^-", df["prot_acc"]))) stop("Developer: remove decoy proteins first.") message("\n=================================\n", "prot_tier prot_issig prot_n_pep \n", @@ -1605,26 +1610,22 @@ psmC2Q <- function (df = NULL, out_path = NULL, fdr_type = "protein", "=================================\n") # Set aside one-hit wonders - df3 <- df %>% - dplyr::filter(!prot_issig, prot_n_pep == 1L) %>% - dplyr::mutate(prot_tier = 3L) - + df3 <- dplyr::filter(df, !prot_issig, prot_n_pep == 1L) + df3 <- dplyr::mutate(df3, prot_tier = 3L) + df <- dplyr::bind_rows( dplyr::filter(df, prot_issig), - dplyr::filter(df, !prot_issig, prot_n_pep >= 2L) - ) %>% - dplyr::mutate(prot_tier = ifelse(prot_issig, 1L, 2L)) - + dplyr::filter(df, !prot_issig, prot_n_pep >= 2L)) + + df <- dplyr::mutate(df, prot_tier = ifelse(prot_issig, 1L, 2L)) + # the same peptide can be present in all three protein tiers; # steps up if pep_seq(s) in tier 3 also in tiers 1, 2 if (FALSE) { rows <- df3$pep_seq %in% df$pep_seq - df <- dplyr::bind_rows(df, df3[rows, ]) df3 <- df3[!rows, ] - rm(list = "rows") - gc() } # Protein groups @@ -1641,12 +1642,12 @@ psmC2Q <- function (df = NULL, out_path = NULL, fdr_type = "protein", fdr_type <- "protein" df2 <- dplyr::filter(df, prot_tier == 2L) - df <- dplyr::filter(df, prot_tier == 1L) + df <- dplyr::filter(df, prot_tier == 1L) } else { if (fdr_type == "protein") { df2 <- dplyr::filter(df, prot_tier == 2L) - df <- dplyr::filter(df, 
prot_tier == 1L) + df <- dplyr::filter(df, prot_tier == 1L) } else { message("No tier-2 outputs at `fdr_type = ", fdr_type, "`.") @@ -1665,7 +1666,7 @@ psmC2Q <- function (df = NULL, out_path = NULL, fdr_type = "protein", # df may have both prot_tier 1 and 2 if fdr_type != "protein" df_tier12 <- unique(df[, c("prot_acc", "prot_tier")]) - df <- unique(df[, c("prot_acc", "pep_seq")]) + df <- unique(df [, c("prot_acc", "pep_seq")]) df2 <- unique(df2[, c("prot_acc", "pep_seq")]) df3 <- unique(df3[, c("prot_acc", "pep_seq")]) gc() @@ -1680,26 +1681,25 @@ psmC2Q <- function (df = NULL, out_path = NULL, fdr_type = "protein", else df <- make_zero_df(nms) - if (nrow(df2)) - df2 <- groupProts(df2, out_path = file.path(out_path, "temp2"), fct = fct) + df2 <- if (nrow(df2)) + groupProts(df2, out_path = file.path(out_path, "temp2"), fct = fct) else - df2 <- make_zero_df(nms) + make_zero_df(nms) - if (nrow(df3)) - df3 <- groupProts(df3, out_path = file.path(out_path, "temp3"), fct = fct) + df3 <- if (nrow(df3)) + groupProts(df3, out_path = file.path(out_path, "temp3"), fct = fct) else - df3 <- make_zero_df(nms) + make_zero_df(nms) rm(list = c("nms", "df_tier12")) # Cleanup dfC <- suppressWarnings( - read_tsv(file.path(out_path, "psmC.txt"), col_types = get_proteoM_coltypes()) - ) %>% - dplyr::filter(pep_issig, !pep_isdecoy, !grepl("^-", prot_acc)) %>% - tidyr::unite(uniq_id, prot_acc, pep_seq, sep = ".", remove = FALSE) - - df <- post_psmC2Q(df, dfC, tier = NULL) + read_tsv(file.path(out_path, "psmC.txt"), col_types = get_proteoM_coltypes())) + dfC <- dplyr::filter(dfC, pep_issig, !pep_isdecoy, !grepl("^-", prot_acc)) + dfC <- tidyr::unite(dfC, uniq_id, prot_acc, pep_seq, sep = ".", remove = FALSE) + + df <- post_psmC2Q(df, dfC, tier = NULL) df2 <- post_psmC2Q(df2, dfC, tier = 2L) df3 <- post_psmC2Q(df3, dfC, tier = 3L) @@ -1721,11 +1721,11 @@ psmC2Q <- function (df = NULL, out_path = NULL, fdr_type = "protein", } if (combine_tier_three) { - df <- list(df, df2, df3) %>% - 
dplyr::bind_rows() %>% - dplyr::arrange(prot_acc, pep_seq) %T>% - readr::write_tsv(file.path(out_path, "psmQ.txt")) - + df3 <- df3[!df3[["pep_seq"]] %fin% df[["pep_seq"]], ] + df <- dplyr::bind_rows(list(df, df3)) # df2 should have no rows + df <- dplyr::arrange(df, prot_acc, pep_seq) + readr::write_tsv(df, file.path(out_path, "psmQ.txt")) + local({ file_t2 <- file.path(out_path, "psmT2.txt") file_t3 <- file.path(out_path, "psmT3.txt") @@ -1742,22 +1742,18 @@ psmC2Q <- function (df = NULL, out_path = NULL, fdr_type = "protein", }) } else { - df <- df %>% - dplyr::arrange(prot_acc, pep_seq) %T>% - readr::write_tsv(file.path(out_path, "psmQ.txt")) - + df <- dplyr::arrange(df, prot_acc, pep_seq) + readr::write_tsv(df, file.path(out_path, "psmQ.txt")) + if (nrow(df2)) { - df2 <- df2[names(df)] %>% - dplyr::mutate(prot_hit_num = prot_hit_num + max) %T>% - readr::write_tsv(file.path(out_path, "psmT2.txt")) - - max <- max(df2$prot_hit_num, na.rm = TRUE) + df2 <- dplyr::mutate(df2[names(df)], prot_hit_num = prot_hit_num + max) + readr::write_tsv(df2, file.path(out_path, "psmT2.txt")) + max <- max(df2[["prot_hit_num"]], na.rm = TRUE) } if (nrow(df3)) { - df3 <- df3[names(df)] %>% - dplyr::mutate(prot_hit_num = prot_hit_num + max) %T>% - readr::write_tsv(file.path(out_path, "psmT3.txt")) + df3 <- dplyr::mutate(df3[names(df)], prot_hit_num = prot_hit_num + max) + readr::write_tsv(df3, file.path(out_path, "psmT3.txt")) } } @@ -1776,11 +1772,10 @@ post_psmC2Q <- function (df, dfC, tier = NULL) if (!is.null(tier)) df <- dplyr::mutate(df, prot_tier = tier) - df <- df %>% - tidyr::unite(uniq_id, prot_acc, pep_seq, sep = ".", remove = TRUE) %>% - dplyr::left_join(dfC, by = "uniq_id") %>% - dplyr::select(-uniq_id) - + df <- tidyr::unite(df, uniq_id, prot_acc, pep_seq, sep = ".", remove = TRUE) + df <- dplyr::left_join(df, dfC, by = "uniq_id") + df <- dplyr::select(df, -uniq_id) + ord_prots <- c("prot_acc", "prot_issig") df <- dplyr::bind_cols( @@ -1797,14 +1792,13 @@ post_psmC2Q <- 
function (df, dfC, tier = NULL) ) df <- dplyr::bind_cols( - df %>% .[grepl("^prot_", names(.))], - df %>% .[grepl("^pep_", names(.))], - df %>% .[grepl("^psm_", names(.))], - df %>% .[!grepl("^prot_|^pep_|^psm_", names(.))], + df[grepl("^prot_", names(df))], + df[grepl("^pep_", names(df))], + df[grepl("^psm_", names(df))], + df[!grepl("^prot_|^pep_|^psm_", names(df))], ) - df <- df %>% - dplyr::select(-which(names(.) %in% c("prot_n_psm", "prot_n_pep"))) + df <- dplyr::select(df, -which(names(df) %in% c("prot_n_psm", "prot_n_pep"))) } @@ -1992,3 +1986,32 @@ map_raw_n_scan <- function (df, mgf_path) } +#' Checks the values of \code{fdr_group} +#' +#' Not yet used. Takes values of integers or character strings. +#' +#' @inheritParams matchMS +check_fdr_group <- function (fdr_group = c("base", "all", "top3"), + oks = c("base", "all")) +{ + is_trivial <- all(is.null(fdr_group)) || all(is.na(fdr_group)) || + all(fdr_group == "") + + if (is_trivial) + return(oks[[1]]) + + fdr_group <- unique(fdr_group) + + len <- length(fdr_group) + + if (len > 1L) { + if (all(fdr_group %in% oks)) + fdr_group <- oks[1] + else + fdr_group <- fdr_group[!fdr_group %in% oks] + } + + as.character(fdr_group) +} + + diff --git a/R/scores.R b/R/scores.R index dff320c..1cc2183 100644 --- a/R/scores.R +++ b/R/scores.R @@ -266,16 +266,8 @@ calc_probi_byvmods <- function (df, nms, expt_moverzs, expt_ints, ## df2 tt2 <- add_seions(df_theo, type_ms2ions = type_ms2ions, digits = digits) + df2 <- match_ex2th2(expt_moverzs, tt2, min_ms2mass, d2, index_mgf_ms2) - if (index_mgf_ms2) { - # df2 <- find_ppm_outer_bycombi(expt_moverzs, index_mz(tt2, from = min_ms2mass, d = d2), 2L) - df2 <- match_ex2th2(expt_moverzs, tt2, min_ms2mass, d2, index_mgf_ms2) - } - else { - # df2 <- find_ppm_outer_bycombi(expt_moverzs, tt2, ppm_ms2) - df2 <- match_ex2th2(expt_moverzs, tt2, min_ms2mass, d2, index_mgf_ms2) - } - ith2 <- df2[["ith"]] iex2 <- df2[["iex"]] @@ -313,7 +305,7 @@ calc_probi_byvmods <- function (df, nms, 
expt_moverzs, expt_ints, ## 3. join `int2` to `y` y_idx <- y[["idx"]] - ok_iex <- which(!is.na(y_idx)) + ok_iex <- .Internal(which(!is.na(y_idx))) y_ith <- y_idx[ok_iex] y[["int2"]][ok_iex] <- int2[y_ith] @@ -323,7 +315,7 @@ calc_probi_byvmods <- function (df, nms, expt_moverzs, expt_ints, ### if (soft_secions) { - ok_int2 <- which(int2 > 0L & is.na(df[["int"]])) # 11, 21 + ok_int2 <- .Internal(which(int2 > 0L & is.na(df[["int"]]))) if (length(ok_int2)) { ok_iex2 <- iex2[match(ok_int2, ith2 %% m)] @@ -337,7 +329,7 @@ calc_probi_byvmods <- function (df, nms, expt_moverzs, expt_ints, ## 5. arrange by "-int" ord_int <- order(y[["int"]], decreasing = TRUE, method = "radix", na.last = TRUE) y_theo <- y[["theo"]][ord_int] - maxi <- which(!is.na(y_theo)) + maxi <- .Internal(which(!is.na(y_theo))) maxi <- maxi[length(maxi)] y_theo <- y_theo[1:maxi] @@ -527,8 +519,8 @@ scalc_pepprobs <- function (entry, topn_ms2ions = 100L, type_ms2ions = "by", # 3 331. 331. # (flattens by one level as is a list-column) - mts <- entry$matches[[1]] - + mts <- entry[["matches"]][[1]] + # N <- entry$ms2_n[[1]] topn_ms2ions <- min(topn_ms2ions, entry$ms2_n[[1]]) N <- min(topn_ms2ions * 5L, 500L) @@ -549,7 +541,7 @@ scalc_pepprobs <- function (entry, topn_ms2ions = 100L, type_ms2ions = "by", uniq_id <- .Internal(unlist(entry$uniq_id, recursive = FALSE, use.names = FALSE)) out <- lapply(out, function (x) { - x$uniq_id <- uniq_id + x[["uniq_id"]] <- uniq_id x }) @@ -606,7 +598,11 @@ calc_pepprobs_i <- function (df, topn_ms2ions = 100L, type_ms2ions = "by", #' @inheritParams matchMS #' @import parallel calc_pepscores <- function (topn_ms2ions = 100L, type_ms2ions = "by", - target_fdr = 0.01, fdr_type = "psm", + target_fdr = 0.01, + + # to be deleted later + fdr_type = "protein", + min_len = 7L, max_len = 40L, ppm_ms2 = 20L, soft_secions = FALSE, out_path = "~/proteoM/outs", @@ -632,7 +628,7 @@ calc_pepscores <- function (topn_ms2ions = 100L, type_ms2ions = "by", add = TRUE ) - # Check priors + ## 
Check priors pat_i <- "^ion_matches_" list_i <- find_targets(out_path, pattern = pat_i)$files len_i <- length(list_i) @@ -645,10 +641,13 @@ calc_pepscores <- function (topn_ms2ions = 100L, type_ms2ions = "by", fun_env <- environment() fml_nms <- names(formals(fun)) - # args_except <- c("sys_ram") - # fml_incl <- fml_nms[!fml_nms %in% args_except] - fml_incl <- fml_nms + args_except <- c("fdr_type") + fml_incl <- if (length(args_except)) + fml_nms[!fml_nms %in% args_except] + else + fml_nms + message("[x] For reprocessing (with new score function) ", "delete cached 'pepscores_[...]' and 'calc_pepscores.rda'.\n") @@ -682,14 +681,9 @@ calc_pepscores <- function (topn_ms2ions = 100L, type_ms2ions = "by", rm(list = c("cache_pars", "call_pars")) } - # to be deleted - # n_cores <- detect_cores(16L) - d2 <- calc_threeframe_ppm(ppm_ms2) * 1E-6 - for (fi in list_i) { - message("\tModule: ", fi) - + for (fi in list_i) calcpepsc(file = fi, topn_ms2ions = topn_ms2ions, type_ms2ions = type_ms2ions, @@ -703,18 +697,16 @@ calc_pepscores <- function (topn_ms2ions = 100L, type_ms2ions = "by", add_ms2theos2 = add_ms2theos2, add_ms2moverzs = add_ms2moverzs, add_ms2ints = add_ms2ints, + # slower with 48 cores + n_cores = detect_cores(16L), digits = digits) - - gc() - } - + .savecall <- TRUE invisible(NULL) } - #' Find the index or name of decoy results. #' #' @param pattern The pattern of files. @@ -758,6 +750,7 @@ find_targets <- function (out_path, pattern = "^ion_matches_") #' #' @param file A file name of \code{ion_matches_}. #' @param d2 Bin width in ppm divided by 1E6. +#' @param n_cores The number of CPU cores. 
#' @inheritParams matchMS #' @inheritParams calc_pepscores calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", @@ -765,9 +758,10 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", min_ms2mass = 115L, d2 = 1E-5, index_mgf_ms2 = FALSE, add_ms2theos = FALSE, add_ms2theos2 = FALSE, add_ms2moverzs = FALSE, add_ms2ints = FALSE, - digits = 4L) + n_cores = 16L, digits = 4L) { - # (can be decoy => .*, not \\d+) + message("\tModule: ", file) + idx <- gsub("^ion_matches_(.*)\\.rds$", "\\1", file) file_lt <- file.path(out_path, "temp", paste0("list_table_", idx, ".rds")) file_sc <- file.path(out_path, "temp", paste0("pepscores_", idx, ".rds")) @@ -820,9 +814,6 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", # -> res[[i]] <- NULL # -> length(res) shortened by 1 - # will be used again later - n_cores <- detect_cores(48L) - if (n_rows <= 5000L) { probs <- calc_pepprobs_i( df, @@ -865,9 +856,10 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", for (i in seq_len(len)) { dfi <- suppressWarnings( - chunksplit(qs::qread(file.path(tempdir, nms[[i]])), n_cores, "row")) + # * 4L of smaller hashes for some slow phospho scoring + chunksplit(qs::qread(file.path(tempdir, nms[[i]])), n_cores * 4L, "row")) - probs[[i]] <- parallel::clusterApply(cl, dfi, + probs[[i]] <- parallel::clusterApplyLB(cl, dfi, calc_pepprobs_i, topn_ms2ions = topn_ms2ions, type_ms2ions = type_ms2ions, @@ -890,7 +882,7 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", df <- qs::qread(path_df) } else { - dfs <- suppressWarnings(chunksplit(df, n_cores, "row")) + dfs <- suppressWarnings(chunksplit(df, n_cores * 4L, "row")) # a case that `chunksplit` did not successfully split if (is.data.frame(dfs)) { @@ -907,7 +899,7 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", digits = digits) } else { - probs <- parallel::clusterApply(cl, dfs, + probs <- parallel::clusterApplyLB(cl, dfs, 
calc_pepprobs_i, topn_ms2ions = topn_ms2ions, type_ms2ions = type_ms2ions, @@ -932,8 +924,6 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", if ("matches" %in% names(df)) df <- df[, -which(names(df) == "matches"), drop = FALSE] - gc() - df2 <- qs::qread(path_df2) df <- dplyr::bind_cols(df, df2) rm(list = c("df2", "path_df2")) @@ -946,7 +936,8 @@ calcpepsc <- function (file, topn_ms2ions = 100L, type_ms2ions = "by", df <- df[, -which(names(df) == "uniq_id"), drop = FALSE] df <- post_pepscores(df) - ## Outputs + + ## Add MS2 m/z and intensity values qs::qsave(df[, cols_lt, drop = FALSE], file_lt, preset = "fast") message("\tAdding theoretical MS2 m/z and intensity values: ", Sys.time()) @@ -1266,16 +1257,30 @@ find_pepscore_co2 <- function (td, target_fdr = 0.01) #' @param td A target-decoy pair. #' @param len Numeric; the length of peptides. #' @inheritParams matchMS -probco_bypeplen <- function (len, td, fdr_type = "psm", target_fdr = 0.01, +probco_bypeplen <- function (len, td, fdr_type = "protein", target_fdr = 0.01, min_pepscores_co = 0, out_path) { td <- dplyr::filter(td, pep_len == len) if (fdr_type %in% c("peptide", "protein")) { - td <- dplyr::arrange(td, pep_seq, pep_prob) - td <- dplyr::group_by(td, pep_seq) - td <- dplyr::filter(td, row_number() == 1L) - td <- dplyr::ungroup(td) + if (fdr_type == "protein") { + td <- dplyr::arrange(td, pep_seq, pep_prob) + td <- dplyr::group_by(td, pep_seq) + td <- dplyr::filter(td, row_number() == 1L) + td <- dplyr::ungroup(td) + } + else { + # td[["pep_ivmod2"]] <- gsub(" [\\(\\[]\\d+[\\)\\[]$", "", td[["pep_ivmod"]]) + td[["pep_ivmod2"]] <- gsub(" .*", "", td[["pep_ivmod"]]) + td[["pep_seq_mod"]] <- ifelse(is.na(td[["pep_ivmod2"]]), td[["pep_seq"]], + paste0(td[["pep_seq"]], ".", td[["pep_ivmod2"]])) + td <- dplyr::arrange(td, pep_seq_mod, pep_prob) + td <- dplyr::group_by(td, pep_seq_mod) + td <- dplyr::filter(td, row_number() == 1L) + td <- dplyr::ungroup(td) + td[["pep_ivmod2"]] <- NULL + 
td[["pep_seq_mod"]] <- NULL + } } td <- dplyr::select(td, pep_prob, pep_isdecoy) @@ -1288,7 +1293,7 @@ probco_bypeplen <- function (len, td, fdr_type = "psm", target_fdr = 0.01, count <- nrow(td) if (count < (1 / target_fdr)) { - if (count <= 20L) + if (count <= 10L) # changed from 20L return(NA) best_co <- tryCatch( @@ -1508,12 +1513,12 @@ find_probco_valley <- function (prob_cos, guess = 12L) #' } #' #' } -calc_pepfdr <- function (target_fdr = .01, fdr_type = "psm", +calc_pepfdr <- function (target_fdr = .01, fdr_type = "protein", min_len = 7L, max_len = 40L, max_pepscores_co = 50, min_pepscores_co = 0, enzyme = "trypsin_p", - fdr_group = "all", - nes_fdr_group = "all", + fdr_group = "base", + nes_fdr_group = "base", out_path) { message("Calculating peptide FDR.") @@ -1522,7 +1527,12 @@ calc_pepfdr <- function (target_fdr = .01, fdr_type = "psm", files <- list.files(path = file.path(out_path, "temp"), pattern = "^pepscores_", full.names = TRUE) - max_i <- which.max(file.size(files))[[1]] + + top3s <- gsub(paste0("^.*pepscores_", "(\\d+)\\.rds$"), "\\1", + files[which_topx2(file.size(files), 3L)[1:3]]) + + max_i <- gsub(paste0("^.*pepscores_", "(\\d+)\\.rds$"), "\\1", + files[which.max(file.size(files))[[1]]]) if (!length(files)) stop("Score results not found.", call. 
= FALSE) @@ -1551,6 +1561,12 @@ calc_pepfdr <- function (target_fdr = .01, fdr_type = "psm", td[td[["pep_mod_group"]] == max_i & grepl("[KR]$", td[["pep_seq"]]), ] else if (nes_fdr_group == "base_cterm_nontryptic") td[td[["pep_mod_group"]] == max_i & !grepl("[KR]$", td[["pep_seq"]]), ] + else if (nes_fdr_group == "top3") + td[td[["pep_mod_group"]] %in% top3s, ] + else if (nes_fdr_group == "top3_cterm_tryptic") + td[td[["pep_mod_group"]] %in% top3s & grepl("[KR]$", td[["pep_seq"]]), ] + else if (nes_fdr_group == "top3_cterm_nontryptic") + td[td[["pep_mod_group"]] %in% top3s & !grepl("[KR]$", td[["pep_seq"]]), ] else stop("Invalid argument for \"nes_fdr_group\".") } @@ -1559,6 +1575,8 @@ calc_pepfdr <- function (target_fdr = .01, fdr_type = "psm", td else if (fdr_group == "base") td[td[["pep_mod_group"]] == max_i, ] + else if (fdr_group == "top3") + td[td[["pep_mod_group"]] %in% top3s, ] else stop("Invalid argument for \"fdr_group\".") } @@ -1600,7 +1618,6 @@ calc_pepfdr <- function (target_fdr = .01, fdr_type = "psm", td <- dplyr::arrange(td, pep_prob) td <- dplyr::filter(td, row_number() == 1L) td <- dplyr::ungroup(td) - gc() # --- @@ -1625,7 +1642,7 @@ calc_pepfdr <- function (target_fdr = .01, fdr_type = "psm", return(data.frame(pep_len = seqs, pep_prob_co = prob_cos)) } - counts <- as.numeric(names(prob_cos)) + counts <- as.integer(names(prob_cos)) names(counts) <- all_lens names(prob_cos) <- all_lens @@ -1643,7 +1660,7 @@ calc_pepfdr <- function (target_fdr = .01, fdr_type = "psm", # (At least two non-trivial prob_cos) ######################################### - lens <- find_optlens(all_lens, counts, 128L) + lens <- find_optlens(all_lens, counts, 50L) # changed from 128L # no fittings if (length(lens) <= 3L) { @@ -2428,8 +2445,8 @@ match_ex2th2 <- function (expt, theo, min_ms2mass = 115L, d = 1E-5, { th <- index_mz(theo, from = min_ms2mass, d = d) ex <- if (index_mgf_ms2) expt else index_mz(expt, from = min_ms2mass, d = d) - ith <- which(th %fin% ex | (th - 1L) 
%fin% ex | (th + 1L) %fin% ex) - + ith <- .Internal(which(th %fin% ex | (th - 1L) %fin% ex | (th + 1L) %fin% ex)) + # if: e.g. th[ith+1] = th[ith] + 1 -> can have NA in iex: # th[ith+1] not in ex but th[ith+1] - 1 # OK to keep the NA: @@ -2441,7 +2458,7 @@ match_ex2th2 <- function (expt, theo, min_ms2mass = 115L, d = 1E-5, iex <- fastmatch::fmatch(thi, ex) # indexes before and after - nas <- which(is.na(iex)) + nas <- .Internal(which(is.na(iex))) if (length(nas)) { bf <- fastmatch::fmatch(thi - 1L, ex) @@ -2452,8 +2469,8 @@ match_ex2th2 <- function (expt, theo, min_ms2mass = 115L, d = 1E-5, } else { iex[nas] <- bf[nas] - nas <- which(is.na(iex)) - + nas <- .Internal(which(is.na(iex))) + if (length(nas)) { af <- fastmatch::fmatch(thi + 1L, ex) iex[nas] <- af[nas] diff --git a/R/utils_engine.R b/R/utils_engine.R index 7c6f426..9ca199a 100644 --- a/R/utils_engine.R +++ b/R/utils_engine.R @@ -26,7 +26,7 @@ which_topx <- function(x, n = 50L, ...) xp <- sort(x, partial = p, ...)[p] - which(x > xp) + .Internal(which(x > xp)) } @@ -85,8 +85,8 @@ which_topx2 <- function(x, n = 50L, ...) # or: xp <- sort(x, partial = p, na.last = TRUE)[p] xp <- sort(x, partial = p, ...)[p] - inds <- which(x > xp) - + inds <- .Internal(which(x > xp)) + # in case of ties -> length(inds) < n # detrimental e.g. ms2_n = 500 and n = 100 # -> expect 100 `ms2_moverzs` guaranteed but may be only 99 @@ -98,7 +98,7 @@ which_topx2 <- function(x, n = 50L, ...) 
if (d) { # must exist and length(ties) >= length(d) - ties <- which(x == xp) + ties <- .Internal(which(x == xp)) for (i in seq_len(d)) inds <- insVal(ties[i], inds) } @@ -253,11 +253,11 @@ purge_search_space <- function (i, aa_masses, mgf_path, n_cores, ppm_ms1 = 10L, fmods_nl = NULL) { # loads freshly mgfs (as will be modified) - mgf_frames <- - qs::qread(file.path(mgf_path, "mgf_queries.rds")) %>% - dplyr::group_by(frame) %>% - dplyr::group_split() %>% - setNames(purrr::map_dbl(., function (x) x$frame[1])) + mgf_frames <- qs::qread(file.path(mgf_path, "mgf_queries.rds")) + mgf_frames <- dplyr::group_by(mgf_frames, frame) + mgf_frames <- dplyr::group_split(mgf_frames) + frs <- lapply(mgf_frames, function (x) x[["frame"]][1]) + names(mgf_frames) <- unlist(frs, recursive = FALSE, use.names = FALSE) mgf_frames <- local({ ranges <- seq_along(mgf_frames) @@ -270,12 +270,8 @@ purge_search_space <- function (i, aa_masses, mgf_path, n_cores, ppm_ms1 = 10L, # parses aa_masses nm_fmods <- attr(aa_masses, "fmods", exact = TRUE) nm_vmods <- attr(aa_masses, "vmods", exact = TRUE) - msg_end <- if (grepl("^rev_", i)) " (decoy)." else "." - - message("Matching against: ", - paste0(nm_fmods, - nm_vmods %>% { if (nchar(.) > 0L) paste0(" | ", .) else . 
}, - msg_end)) + message("Matching against: ", + if (nchar(nm_vmods) == 0L) nm_fmods else paste0(nm_fmods, " | ", nm_vmods)) # reads theoretical peptide data .path_bin <- get(".path_bin", envir = .GlobalEnv, inherits = FALSE) @@ -294,8 +290,10 @@ purge_search_space <- function (i, aa_masses, mgf_path, n_cores, ppm_ms1 = 10L, oks <- names(x) %fin% frames_theo x <- x[oks] - empties <- purrr::map_lgl(x, purrr::is_empty) - x[!empties] + ans <- .Internal(unlist(lapply(x, function (y) length(y) > 0L), + recursive = FALSE, use.names = FALSE)) + + x[ans] }) rm(list = "frames_theo") @@ -304,11 +302,11 @@ purge_search_space <- function (i, aa_masses, mgf_path, n_cores, ppm_ms1 = 10L, # preceding and following frames: (o)|range of mgf_frames[[1]]|(o) frames_mgf <- lapply(mgf_frames, function (x) as.integer(names(x))) - mins <- purrr::map_int(frames_mgf, function (x) - if (length(x)) min(x, na.rm = TRUE) else 0L) + mins <- lapply(frames_mgf, function (x) if (length(x)) min(x, na.rm = TRUE) else 0L) + mins <- .Internal(unlist(mins, recursive = FALSE, use.names = FALSE)) - maxs <- purrr::map_int(frames_mgf, function (x) - if (length(x)) max(x, na.rm = TRUE) else 0L) + maxs <- lapply(frames_mgf, function (x) if (length(x)) max(x, na.rm = TRUE) else 0L) + maxs <- .Internal(unlist(maxs, recursive = FALSE, use.names = FALSE)) frames_theo <- as.integer(names(theopeps)) @@ -329,9 +327,11 @@ purge_search_space <- function (i, aa_masses, mgf_path, n_cores, ppm_ms1 = 10L, SIMPLIFY = FALSE, USE.NAMES = FALSE) # (4) removes empties (zero overlap between mgf_frames and theopeps) - oks <- purrr::map_lgl(mgf_frames, function (x) length(x) > 0L) | - purrr::map_lgl(theopeps, function (x) length(x) > 0L) - + ok_mgfs <- lapply(mgf_frames, function (x) length(x) > 0L) + ok_mgfs <- .Internal(unlist(ok_mgfs, recursive = FALSE, use.names = FALSE)) + ok_theos <- lapply(theopeps, function (x) length(x) > 0L) + ok_theos <- .Internal(unlist(ok_theos, recursive = FALSE, use.names = FALSE)) + oks <-
ok_mgfs | ok_theos mgf_frames <- mgf_frames[oks] theopeps <- theopeps[oks] diff --git a/man/add_hexcodes_vnl2.Rd b/man/add_hexcodes_vnl2.Rd index 5277a5d..fcdc41a 100644 --- a/man/add_hexcodes_vnl2.Rd +++ b/man/add_hexcodes_vnl2.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/ms2_a1_vnl1_fnl0.R \name{add_hexcodes_vnl2} \alias{add_hexcodes_vnl2} -\title{Adds hex codes (with variable NLs).} +\title{Adds hexcodes (with variable NLs).} \usage{ add_hexcodes_vnl2(ms2ions, vmods_combi, len, mod_indexes = NULL) } diff --git a/man/add_protacc.Rd b/man/add_protacc.Rd index 6df8132..3c7b04b 100644 --- a/man/add_protacc.Rd +++ b/man/add_protacc.Rd @@ -12,14 +12,10 @@ add_protacc(df = NULL, out_path = NULL, .path_cache = NULL, .path_fasta = NULL) \item{out_path}{An output path.} \item{.path_cache}{The file path of cached search parameters. The parameter - is for the users' awareness of the underlying structure of file folders and - the use of default is suggested. Occasionally experimenters may remove the - file folder for disk space or under infrequent events of modified framework - incurred by the developer. - - Started from version 1.2.1.5, a new data structure of protein-peptide - lookups are used. Version 1.2.1.4.1 is the final version using the old data - structure.} +is for the users' awareness of the underlying structure of file folders and +the use of default is suggested. Occasionally experimenters may remove the +file folder for disk space or under infrequent events of modified framework +incurred by the developer.} \item{.path_fasta}{The parent file path to the theoretical masses of MS1 precursors. At the NULL default, the path is \code{gsub("(.*)\\.[^\\.]*$", diff --git a/man/add_protacc2.Rd b/man/add_protacc2.Rd index 41aa213..6362d73 100644 --- a/man/add_protacc2.Rd +++ b/man/add_protacc2.Rd @@ -17,14 +17,10 @@ add_protacc2( \item{out_path}{An output path.} \item{.path_cache}{The file path of cached search parameters. 
The parameter - is for the users' awareness of the underlying structure of file folders and - the use of default is suggested. Occasionally experimenters may remove the - file folder for disk space or under infrequent events of modified framework - incurred by the developer. - - Started from version 1.2.1.5, a new data structure of protein-peptide - lookups are used. Version 1.2.1.4.1 is the final version using the old data - structure.} +is for the users' awareness of the underlying structure of file folders and +the use of default is suggested. Occasionally experimenters may remove the +file folder for disk space or under infrequent events of modified framework +incurred by the developer.} \item{.path_fasta}{The parent file path to the theoretical masses of MS1 precursors. At the NULL default, the path is \code{gsub("(.*)\\.[^\\.]*$", diff --git a/man/batch_ms2ions.Rd b/man/batch_ms2ions.Rd index 1d5cf29..291016f 100644 --- a/man/batch_ms2ions.Rd +++ b/man/batch_ms2ions.Rd @@ -64,14 +64,10 @@ number of combinatorial variable modifications per peptide sequence. The default is 64.} \item{.path_cache}{The file path of cached search parameters. The parameter - is for the users' awareness of the underlying structure of file folders and - the use of default is suggested. Occasionally experimenters may remove the - file folder for disk space or under infrequent events of modified framework - incurred by the developer. - - Started from version 1.2.1.5, a new data structure of protein-peptide - lookups are used. Version 1.2.1.4.1 is the final version using the old data - structure.} +is for the users' awareness of the underlying structure of file folders and +the use of default is suggested. Occasionally experimenters may remove the +file folder for disk space or under infrequent events of modified framework +incurred by the developer.} \item{.path_fasta}{The parent file path to the theoretical masses of MS1 precursors. 
At the NULL default, the path is \code{gsub("(.*)\\.[^\\.]*$", diff --git a/man/bin_ms1masses.Rd b/man/bin_ms1masses.Rd index 1256f41..482f01d 100644 --- a/man/bin_ms1masses.Rd +++ b/man/bin_ms1masses.Rd @@ -34,14 +34,10 @@ default is 20.} \code{fdr_type = psm} to \code{fdr_type = protein}.} \item{.path_cache}{The file path of cached search parameters. The parameter - is for the users' awareness of the underlying structure of file folders and - the use of default is suggested. Occasionally experimenters may remove the - file folder for disk space or under infrequent events of modified framework - incurred by the developer. - - Started from version 1.2.1.5, a new data structure of protein-peptide - lookups are used. Version 1.2.1.4.1 is the final version using the old data - structure.} +is for the users' awareness of the underlying structure of file folders and +the use of default is suggested. Occasionally experimenters may remove the +file folder for disk space or under infrequent events of modified framework +incurred by the developer.} \item{.path_ms1masses}{The file path to the theoretical masses of MS1 precursors.} diff --git a/man/calc_pepfdr.Rd b/man/calc_pepfdr.Rd index 869998f..93ca125 100644 --- a/man/calc_pepfdr.Rd +++ b/man/calc_pepfdr.Rd @@ -6,14 +6,14 @@ \usage{ calc_pepfdr( target_fdr = 0.01, - fdr_type = "psm", + fdr_type = "protein", min_len = 7L, max_len = 40L, max_pepscores_co = 50, min_pepscores_co = 0, enzyme = "trypsin_p", - fdr_group = "all", - nes_fdr_group = "all", + fdr_group = "base", + nes_fdr_group = "base", out_path ) } @@ -30,8 +30,7 @@ for considerations. Longer peptides will be excluded.} \item{max_pepscores_co}{A positive numeric; the upper limit in the cut-offs of peptide scores for discriminating significant and insignificant -identities. 
The default is changed from \code{Inf} to 50 from version -1.1.9.2 on.} +identities.} \item{min_pepscores_co}{A non-negative numeric; the lower limit in the cut-offs of peptide scores for discriminating significant and insignificant diff --git a/man/calc_pepmasses2.Rd b/man/calc_pepmasses2.Rd index 08845db..62a2dd4 100644 --- a/man/calc_pepmasses2.Rd +++ b/man/calc_pepmasses2.Rd @@ -154,14 +154,10 @@ time.} \code{fdr_type = psm} to \code{fdr_type = protein}.} \item{.path_cache}{The file path of cached search parameters. The parameter - is for the users' awareness of the underlying structure of file folders and - the use of default is suggested. Occasionally experimenters may remove the - file folder for disk space or under infrequent events of modified framework - incurred by the developer. - - Started from version 1.2.1.5, a new data structure of protein-peptide - lookups are used. Version 1.2.1.4.1 is the final version using the old data - structure.} +is for the users' awareness of the underlying structure of file folders and +the use of default is suggested. Occasionally experimenters may remove the +file folder for disk space or under infrequent events of modified framework +incurred by the developer.} \item{.path_fasta}{The parent file path to the theoretical masses of MS1 precursors. At the NULL default, the path is \code{gsub("(.*)\\.[^\\.]*$", diff --git a/man/calc_pepprobs_i.Rd b/man/calc_pepprobs_i.Rd index 2903c28..c4bc09e 100644 --- a/man/calc_pepprobs_i.Rd +++ b/man/calc_pepprobs_i.Rd @@ -31,10 +31,10 @@ and y-ions.} \item{ppm_ms2}{A positive integer; the mass tolerance of MS2 species. The default is 20.} -\item{soft_secions}{Logical; if TRUE, collapses the intensities of secondary -ions to primary ions at the absence of the primaries. The default is FALSE. -For instance, the signal of \code{b5^*} will be ignored if its primary ion -\code{b5} is not matched.} +\item{soft_secions}{Deprecated.
Logical; if TRUE, collapses the intensities +of secondary ions to primary ions at the absence of the primaries. The +default is FALSE. For instance, the signal of \code{b5^*} will be ignored +if its primary ion \code{b5} is not matched.} \item{out_path}{A file path of outputs.} diff --git a/man/calc_pepscores.Rd b/man/calc_pepscores.Rd index a0a1c33..ec00e44 100644 --- a/man/calc_pepscores.Rd +++ b/man/calc_pepscores.Rd @@ -8,7 +8,7 @@ calc_pepscores( topn_ms2ions = 100L, type_ms2ions = "by", target_fdr = 0.01, - fdr_type = "psm", + fdr_type = "protein", min_len = 7L, max_len = 40L, ppm_ms2 = 20L, @@ -55,7 +55,7 @@ levels of PSM, peptide or protein. The default is 0.01. See also argument \item{fdr_type}{A character string; the type of FDR control. The value is in one of c("protein", "peptide", "psm"). The default is \code{protein}. - Note that \code{fdr_type = protein} is equivalent to \code{fdr_type = + Note that \code{fdr_type = protein} is comparable to \code{fdr_type = peptide} with the additional filtration of data at \code{prot_tier == 1}.} \item{min_len}{A positive integer; the minimum length of peptide sequences @@ -67,10 +67,10 @@ for considerations. Longer peptides will be excluded.} \item{ppm_ms2}{A positive integer; the mass tolerance of MS2 species. The default is 20.} -\item{soft_secions}{Logical; if TRUE, collapses the intensities of secondary -ions to primary ions at the absence of the primaries. The default is FALSE. -For instance, the signal of \code{b5^*} will be ignored if its primary ion -\code{b5} is not matched.} +\item{soft_secions}{Deprecated. Logical; if TRUE, collapses the intensities +of secondary ions to primary ions at the absence of the primaries. The +default is FALSE.
For instance, the signal of \code{b5^*} will be ignored +if its primary ion \code{b5} is not matched.} \item{out_path}{A file path of outputs.} diff --git a/man/calc_probi.Rd b/man/calc_probi.Rd index 7ffd6d4..ed32e48 100644 --- a/man/calc_probi.Rd +++ b/man/calc_probi.Rd @@ -39,10 +39,10 @@ searches.} \item{ppm_ms2}{A positive integer; the mass tolerance of MS2 species. The default is 20.} -\item{soft_secions}{Logical; if TRUE, collapses the intensities of secondary -ions to primary ions at the absence of the primaries. The default is FALSE. -For instance, the signal of \code{b5^*} will be ignored if its primary ion -\code{b5} is not matched.} +\item{soft_secions}{Deprecated. Logical; if TRUE, collapses the intensities +of secondary ions to primary ions at the absence of the primaries. The +default is FALSE. For instance, the signal of \code{b5^*} will be ignored +if its primary ion \code{b5} is not matched.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation. The default is 110.} diff --git a/man/calc_probi_bypep.Rd b/man/calc_probi_bypep.Rd index 6e518a3..ba2ed56 100644 --- a/man/calc_probi_bypep.Rd +++ b/man/calc_probi_bypep.Rd @@ -42,10 +42,10 @@ searches.} \item{ppm_ms2}{A positive integer; the mass tolerance of MS2 species. The default is 20.} -\item{soft_secions}{Logical; if TRUE, collapses the intensities of secondary -ions to primary ions at the absence of the primaries. The default is FALSE. -For instance, the signal of \code{b5^*} will be ignored if its primary ion -\code{b5} is not matched.} +\item{soft_secions}{Deprecated. Logical; if TRUE, collapses the intensities +of secondary ions to primary ions at the absence of the primaries. The +default is FALSE. For instance, the signal of \code{b5^*} will be ignored +if its primary ion \code{b5} is not matched.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation.
The default is 110.} diff --git a/man/calc_probi_byvmods.Rd b/man/calc_probi_byvmods.Rd index c4d940d..ab31f8a 100644 --- a/man/calc_probi_byvmods.Rd +++ b/man/calc_probi_byvmods.Rd @@ -44,10 +44,10 @@ searches.} \item{ppm_ms2}{A positive integer; the mass tolerance of MS2 species. The default is 20.} -\item{soft_secions}{Logical; if TRUE, collapses the intensities of secondary -ions to primary ions at the absence of the primaries. The default is FALSE. -For instance, the signal of \code{b5^*} will be ignored if its primary ion -\code{b5} is not matched.} +\item{soft_secions}{Deprecated. Logical; if TRUE, collapses the intensities +of secondary ions to primary ions at the absence of the primaries. The +default is FALSE. For instance, the signal of \code{b5^*} will be ignored +if its primary ion \code{b5} is not matched.} \item{burn_ins}{The range of burn-ins where inputs will be excluded from probablity assessments.} diff --git a/man/calcpepsc.Rd b/man/calcpepsc.Rd index 267d62f..33316c7 100644 --- a/man/calcpepsc.Rd +++ b/man/calcpepsc.Rd @@ -18,6 +18,7 @@ calcpepsc( add_ms2theos2 = FALSE, add_ms2moverzs = FALSE, add_ms2ints = FALSE, + n_cores = 16L, digits = 4L ) } @@ -35,10 +36,10 @@ and y-ions.} \item{ppm_ms2}{A positive integer; the mass tolerance of MS2 species. The default is 20.} -\item{soft_secions}{Logical; if TRUE, collapses the intensities of secondary -ions to primary ions at the absence of the primaries. The default is FALSE. -For instance, the signal of \code{b5^*} will be ignored if its primary ion -\code{b5} is not matched.} +\item{soft_secions}{Deprecated. Logical; if TRUE, collapses the intensities +of secondary ions to primary ions at the absence of the primaries. The +default is FALSE. For instance, the signal of \code{b5^*} will be ignored +if its primary ion \code{b5} is not matched.} \item{out_path}{A file path of outputs.} @@ -82,6 +83,8 @@ interrogation.
The default is 110.} \item{add_ms2ints}{Logical; if TRUE, adds the sequence of experimental MS2 intensity values (\code{pep_ms2_ints}).} +\item{n_cores}{The number of CPU cores.} + \item{digits}{A non-negative integer; the number of decimal places to be used. The default is 4.} } diff --git a/man/check_fdr_group.Rd b/man/check_fdr_group.Rd new file mode 100644 index 0000000..98cb5c2 --- /dev/null +++ b/man/check_fdr_group.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/msmsmatches.R +\name{check_fdr_group} +\alias{check_fdr_group} +\title{Checks the values of \code{fdr_group}} +\usage{ +check_fdr_group(fdr_group = c("base", "all", "top3"), oks = c("base", "all")) +} +\arguments{ +\item{fdr_group}{A character string; the modification group(s) for uses in +peptide FDR controls. The value is in one of c("all", "base"). The +\code{base} corresponds to the modification group with the largest number +of matches.} +} +\description{ +Not yet used. Takes values of integers or character strings. +} diff --git a/man/check_ms2frames.Rd b/man/check_ms2frames.Rd index 3e04e08..495a7a6 100644 --- a/man/check_ms2frames.Rd +++ b/man/check_ms2frames.Rd @@ -23,14 +23,10 @@ developer.} \item{ms1_time}{A cached MS1 time (directory).} \item{.path_cache}{The file path of cached search parameters. The parameter - is for the users' awareness of the underlying structure of file folders and - the use of default is suggested. Occasionally experimenters may remove the - file folder for disk space or under infrequent events of modified framework - incurred by the developer. - - Started from version 1.2.1.5, a new data structure of protein-peptide - lookups are used. Version 1.2.1.4.1 is the final version using the old data - structure.} +is for the users' awareness of the underlying structure of file folders and +the use of default is suggested. 
Occasionally experimenters may remove the +file folder for disk space or under infrequent events of modified framework +incurred by the developer.} \item{path_bin2}{A file path to binned MS2 results.} } diff --git a/man/hbatch_ms2ions.Rd b/man/hbatch_ms2ions.Rd index b286876..2cf9f89 100644 --- a/man/hbatch_ms2ions.Rd +++ b/man/hbatch_ms2ions.Rd @@ -42,14 +42,10 @@ number of combinatorial variable modifications per peptide sequence. The default is 64.} \item{.path_cache}{The file path of cached search parameters. The parameter - is for the users' awareness of the underlying structure of file folders and - the use of default is suggested. Occasionally experimenters may remove the - file folder for disk space or under infrequent events of modified framework - incurred by the developer. - - Started from version 1.2.1.5, a new data structure of protein-peptide - lookups are used. Version 1.2.1.4.1 is the final version using the old data - structure.} +is for the users' awareness of the underlying structure of file folders and +the use of default is suggested. Occasionally experimenters may remove the +file folder for disk space or under infrequent events of modified framework +incurred by the developer.} \item{.path_fasta}{The parent file path to the theoretical masses of MS1 precursors. At the NULL default, the path is \code{gsub("(.*)\\.[^\\.]*$", diff --git a/man/load_fasta2.Rd b/man/load_fasta2.Rd index 956158a..c430e7f 100644 --- a/man/load_fasta2.Rd +++ b/man/load_fasta2.Rd @@ -29,30 +29,26 @@ otherwise, the first value will be used for all \code{fasta} files. 
fasta_db <- load_fasta2( c("~/proteoM/dbs/fasta/uniprot/uniprot_hs_2020_05.fasta", "~/proteoM/dbs/fasta/crap/crap.fasta"), - c("uniprot_acc", "other") -) + c("uniprot_acc", "other")) # Need `acc_pattern` as "crap" is not one of the default acc_type load_fasta2( c("~/proteoM/dbs/fasta/uniprot/uniprot_hs_2020_05.fasta", "~/proteoM/dbs/fasta/crap/crap.fasta"), - c("uniprot_acc", "crap") -) + c("uniprot_acc", "crap")) # ok fasta_db2 <- load_fasta2( c("~/proteoM/dbs/fasta/uniprot/uniprot_hs_2020_05.fasta", "~/proteoM/dbs/fasta/crap/crap.fasta"), c("uniprot_acc", "crap"), - c("^>..\\\\|([^\\\\|]+)\\\\|[^\\\\|]+", "(.*)") -) + c("^>..\\\\|([^\\\\|]+)\\\\|[^\\\\|]+", "(.*)")) fasta_db3 <- load_fasta2( c("~/proteoM/dbs/fasta/uniprot/uniprot_hs_2020_05.fasta", "~/proteoM/dbs/fasta/crap/crap.fasta"), c("my_acc", "crap"), - c("^>..\\\\|([^\\\\|]+)\\\\|[^\\\\|]+", "(.*)") -) + c("^>..\\\\|([^\\\\|]+)\\\\|[^\\\\|]+", "(.*)")) stopifnot(identical(fasta_db, fasta_db2), identical(fasta_db, fasta_db3)) diff --git a/man/make_ms2frames.Rd b/man/make_ms2frames.Rd index 8502287..9abdec5 100644 --- a/man/make_ms2frames.Rd +++ b/man/make_ms2frames.Rd @@ -22,14 +22,10 @@ masses in values.} \item{ms2s}{Lists of MS2 ion series.} \item{.path_cache}{The file path of cached search parameters. The parameter - is for the users' awareness of the underlying structure of file folders and - the use of default is suggested. Occasionally experimenters may remove the - file folder for disk space or under infrequent events of modified framework - incurred by the developer. - - Started from version 1.2.1.5, a new data structure of protein-peptide - lookups are used. Version 1.2.1.4.1 is the final version using the old data - structure.} +is for the users' awareness of the underlying structure of file folders and +the use of default is suggested. 
Occasionally experimenters may remove the +file folder for disk space or under infrequent events of modified framework +incurred by the developer.} \item{path_bin2}{A file path to binned MS2 results.} diff --git a/man/matchMS.Rd b/man/matchMS.Rd index 5fbef33..d655cd4 100644 --- a/man/matchMS.Rd +++ b/man/matchMS.Rd @@ -23,15 +23,16 @@ matchMS( "SemiLysN", "SemiArgC", "SemiLysC_P", "SemiChymotrypsin", "SemiGluC", "SemiGluN", "SemiAspC", "SemiAspN", "Noenzyme", "Nodigest"), custom_enzyme = c(Cterm = NULL, Nterm = NULL), - nes_fdr_group = c("all", "all_cterm_tryptic", "all_cterm_nontryptic", "base", - "base_cterm_tryptic", "base_cterm_nontryptic"), + nes_fdr_group = c("base", "base_cterm_tryptic", "base_cterm_nontryptic", "all", + "all_cterm_tryptic", "all_cterm_nontryptic", "top3", "top3_cterm_tryptic", + "top3_cterm_nontryptic"), noenzyme_maxn = 0L, maxn_fasta_seqs = 200000L, maxn_vmods_setscombi = 512L, maxn_vmods_per_pep = 5L, maxn_sites_per_vmod = 3L, - maxn_fnl_per_seq = 64L, - maxn_vnl_per_seq = 64L, + maxn_fnl_per_seq = 8L, + maxn_vnl_per_seq = 8L, maxn_vmods_sitescombi_per_pep = 64L, min_len = 7L, max_len = 40L, @@ -55,7 +56,7 @@ matchMS( quant = c("none", "tmt6", "tmt10", "tmt11", "tmt16", "tmt18"), target_fdr = 0.01, fdr_type = c("protein", "peptide", "psm"), - fdr_group = c("all", "base"), + fdr_group = c("base", "all", "top3"), max_pepscores_co = 50, min_pepscores_co = 0, max_protscores_co = Inf, @@ -333,7 +334,7 @@ levels of PSM, peptide or protein. The default is 0.01. See also argument \item{fdr_type}{A character string; the type of FDR control. The value is in one of c("protein", "peptide", "psm"). The default is \code{protein}. 
- Note that \code{fdr_type = protein} is equivalent to \code{fdr_type = + Note that \code{fdr_type = protein} is comparable to \code{fdr_type = peptide} with the additional filtration of data at \code{prot_tier == 1}.} \item{fdr_group}{A character string; the modification group(s) for uses in @@ -343,8 +344,7 @@ of matches.} \item{max_pepscores_co}{A positive numeric; the upper limit in the cut-offs of peptide scores for discriminating significant and insignificant -identities. The default is changed from \code{Inf} to 50 from version -1.1.9.2 on.} +identities.} \item{min_pepscores_co}{A non-negative numeric; the lower limit in the cut-offs of peptide scores for discriminating significant and insignificant @@ -371,10 +371,10 @@ pep_score_cutoff} under a protein will be used to represent the threshold of a protein enrichment score. For more conserved thresholds, the statistics of \code{"max"} may be considered.} -\item{soft_secions}{Logical; if TRUE, collapses the intensities of secondary -ions to primary ions at the absence of the primaries. The default is FALSE. -For instance, the signal of \code{b5^*} will be ignored if its primary ion -\code{b5} is not matched.} +\item{soft_secions}{Deprecated. Logical; if TRUE, collapses the intensities +of secondary ions to primary ions at the absence of the primaries. The +default is FALSE. For instance, the signal of \code{b5^*} will be ignored +if its primary ion \code{b5} is not matched.} \item{topn_mods_per_seq}{Positive integer; a threshold to discard variable modifications under the same peptide match with scores beyond the top-n. @@ -393,17 +393,19 @@ For instance, the signal of \code{b5^*} will be ignored if its primary ion and \code{MS raw file name}. Target and decoys matches are treated separately.} -\item{combine_tier_three}{Logical; if TRUE, combines search results at tiers - 1, 2 and 3 to the single output of \code{psmQ.txt}.
The default is FALSE in - that data will be segregated into the three quality tiers according to the - choice of \code{fdr_type}. The (convenience) parameter matters since - \href{http://github.com/qzhang503/proteoQ}{proteoQ} will only look for the - inputs of \code{psmQ[...].txt}. - - For instance, if the aim is to bypass the constraint by protein FDR and - focus on PSMs that have met the cut-offs specified by \code{target_fdr}, an - experimenter may set \code{combine_tier_three = TRUE} and hence pool all - significant peptides in \code{psmQ.txt} for downstream proteoQ. +\item{combine_tier_three}{Logical; if TRUE, combines search results at tier-3 + to tier-1 to form the single output of \code{psmQ.txt}. The default is + FALSE in that data will be segregated into the three quality tiers shown + below by the choice of \code{fdr_type}. Note that the argument affects only + at the \code{fdr_type} of \code{psm} or \code{peptide} and there is no + tier-2 outputs at the \code{fdr_type} of \code{psm} or \code{peptide}. In + general, the one-hit-wonders from the tier-3 should not be used. + + In subproteome analysis, such as phosphoproteome analysis, some proteins + may be well established globally, but fail the significance assessment by + protein FDR on the local scale. In situations like this, it may be suitable + to use \code{fdr_type = "peptide"} or \code{fdr_type = "psm"} and not to + incur \code{combine_tier_three = TRUE}. Tier-1: both proteins and peptides with scores above significance thresholds. @@ -433,14 +435,10 @@ For instance, the signal of \code{b5^*} will be ignored if its primary ion \code{fdr_type = psm} to \code{fdr_type = protein}.} \item{.path_cache}{The file path of cached search parameters. The parameter - is for the users' awareness of the underlying structure of file folders and - the use of default is suggested. 
Occasionally experimenters may remove the - file folder for disk space or under infrequent events of modified framework - incurred by the developer. - - Started from version 1.2.1.5, a new data structure of protein-peptide - lookups are used. Version 1.2.1.4.1 is the final version using the old data - structure.} +is for the users' awareness of the underlying structure of file folders and +the use of default is suggested. Occasionally experimenters may remove the +file folder for disk space or under infrequent events of modified framework +incurred by the developer.} \item{.path_fasta}{The parent file path to the theoretical masses of MS1 precursors. At the NULL default, the path is \code{gsub("(.*)\\.[^\\.]*$", diff --git a/man/probco_bypeplen.Rd b/man/probco_bypeplen.Rd index 45d3d19..7b691c2 100644 --- a/man/probco_bypeplen.Rd +++ b/man/probco_bypeplen.Rd @@ -7,7 +7,7 @@ probco_bypeplen( len, td, - fdr_type = "psm", + fdr_type = "protein", target_fdr = 0.01, min_pepscores_co = 0, out_path @@ -21,7 +21,7 @@ probco_bypeplen( \item{fdr_type}{A character string; the type of FDR control. The value is in one of c("protein", "peptide", "psm"). The default is \code{protein}. - Note that \code{fdr_type = protein} is equivalent to \code{fdr_type = + Note that \code{fdr_type = protein} is comparable to \code{fdr_type = peptide} with the additional filtration of data at \code{prot_tier == 1}.} \item{target_fdr}{A numeric; the targeted false-discovery rate (FDR) at the diff --git a/man/psmC2Q.Rd b/man/psmC2Q.Rd index db54803..e1350f9 100644 --- a/man/psmC2Q.Rd +++ b/man/psmC2Q.Rd @@ -22,20 +22,22 @@ or decoy peptides, as well as decoy proteins.} \item{fdr_type}{A character string; the type of FDR control. The value is in one of c("protein", "peptide", "psm"). The default is \code{protein}. 
- Note that \code{fdr_type = protein} is equivalent to \code{fdr_type = + Note that \code{fdr_type = protein} is comparable to \code{fdr_type = peptide} with the additional filtration of data at \code{prot_tier == 1}.} -\item{combine_tier_three}{Logical; if TRUE, combines search results at tiers - 1, 2 and 3 to the single output of \code{psmQ.txt}. The default is FALSE in - that data will be segregated into the three quality tiers according to the - choice of \code{fdr_type}. The (convenience) parameter matters since - \href{http://github.com/qzhang503/proteoQ}{proteoQ} will only look for the - inputs of \code{psmQ[...].txt}. - - For instance, if the aim is to bypass the constraint by protein FDR and - focus on PSMs that have met the cut-offs specified by \code{target_fdr}, an - experimenter may set \code{combine_tier_three = TRUE} and hence pool all - significant peptides in \code{psmQ.txt} for downstream proteoQ. +\item{combine_tier_three}{Logical; if TRUE, combines search results at tier-3 + to tier-1 to form the single output of \code{psmQ.txt}. The default is + FALSE in that data will be segregated into the three quality tiers shown + below by the choice of \code{fdr_type}. Note that the argument affects only + at the \code{fdr_type} of \code{psm} or \code{peptide} and there is no + tier-2 outputs at the \code{fdr_type} of \code{psm} or \code{peptide}. In + general, the one-hit-wonders from the tier-3 should not be used. + + In subproteome analysis, such as phosphoproteome analysis, some proteins + may be well established globally, but fail the significance assessment by + protein FDR on the local scale. In situations like this, it may be suitable + to use \code{fdr_type = "peptide"} or \code{fdr_type = "psm"} and not to + incur \code{combine_tier_three = TRUE}. Tier-1: both proteins and peptides with scores above significance thresholds. 
diff --git a/man/reproc_psmC.Rd b/man/reproc_psmC.Rd index eb7de3c..c961538 100644 --- a/man/reproc_psmC.Rd +++ b/man/reproc_psmC.Rd @@ -18,20 +18,22 @@ reproc_psmC( \item{fdr_type}{A character string; the type of FDR control. The value is in one of c("protein", "peptide", "psm"). The default is \code{protein}. - Note that \code{fdr_type = protein} is equivalent to \code{fdr_type = + Note that \code{fdr_type = protein} is comparable to \code{fdr_type = peptide} with the additional filtration of data at \code{prot_tier == 1}.} -\item{combine_tier_three}{Logical; if TRUE, combines search results at tiers - 1, 2 and 3 to the single output of \code{psmQ.txt}. The default is FALSE in - that data will be segregated into the three quality tiers according to the - choice of \code{fdr_type}. The (convenience) parameter matters since - \href{http://github.com/qzhang503/proteoQ}{proteoQ} will only look for the - inputs of \code{psmQ[...].txt}. - - For instance, if the aim is to bypass the constraint by protein FDR and - focus on PSMs that have met the cut-offs specified by \code{target_fdr}, an - experimenter may set \code{combine_tier_three = TRUE} and hence pool all - significant peptides in \code{psmQ.txt} for downstream proteoQ. +\item{combine_tier_three}{Logical; if TRUE, combines search results at tier-3 + to tier-1 to form the single output of \code{psmQ.txt}. The default is + FALSE in that data will be segregated into the three quality tiers shown + below by the choice of \code{fdr_type}. Note that the argument affects only + at the \code{fdr_type} of \code{psm} or \code{peptide} and there is no + tier-2 outputs at the \code{fdr_type} of \code{psm} or \code{peptide}. In + general, the one-hit-wonders from the tier-3 should not be used. + + In subproteome analysis, such as phosphoproteome analysis, some proteins + may be well established globally, but fail the significance assessment by + protein FDR on the local scale. 
In situations like this, it may be suitable + to use \code{fdr_type = "peptide"} or \code{fdr_type = "psm"} and not to + incur \code{combine_tier_three = TRUE}. Tier-1: both proteins and peptides with scores above significance thresholds. diff --git a/man/scalc_pepprobs.Rd b/man/scalc_pepprobs.Rd index da17950..b637417 100644 --- a/man/scalc_pepprobs.Rd +++ b/man/scalc_pepprobs.Rd @@ -30,10 +30,10 @@ and y-ions.} \item{ppm_ms2}{A positive integer; the mass tolerance of MS2 species. The default is 20.} -\item{soft_secions}{Logical; if TRUE, collapses the intensities of secondary -ions to primary ions at the absence of the primaries. The default is FALSE. -For instance, the signal of \code{b5^*} will be ignored if its primary ion -\code{b5} is not matched.} +\item{soft_secions}{Deprecated. Logical; if TRUE, collapses the intensities +of secondary ions to primary ions at the absence of the primaries. The +default is FALSE. For instance, the signal of \code{b5^*} will be ignored +if its primary ion \code{b5} is not matched.} \item{min_ms2mass}{A positive integer; the minimum MS2 mass for interrogation. The default is 110.} diff --git a/man/try_psmC2Q.Rd b/man/try_psmC2Q.Rd index 9345c45..c6c7844 100644 --- a/man/try_psmC2Q.Rd +++ b/man/try_psmC2Q.Rd @@ -21,20 +21,22 @@ or decoy peptides, as well as decoy proteins.} \item{fdr_type}{A character string; the type of FDR control. The value is in one of c("protein", "peptide", "psm"). The default is \code{protein}. - Note that \code{fdr_type = protein} is equivalent to \code{fdr_type = + Note that \code{fdr_type = protein} is comparable to \code{fdr_type = peptide} with the additional filtration of data at \code{prot_tier == 1}.} -\item{combine_tier_three}{Logical; if TRUE, combines search results at tiers - 1, 2 and 3 to the single output of \code{psmQ.txt}. The default is FALSE in - that data will be segregated into the three quality tiers according to the - choice of \code{fdr_type}.
The (convenience) parameter matters since - \href{http://github.com/qzhang503/proteoQ}{proteoQ} will only look for the - inputs of \code{psmQ[...].txt}. - - For instance, if the aim is to bypass the constraint by protein FDR and - focus on PSMs that have met the cut-offs specified by \code{target_fdr}, an - experimenter may set \code{combine_tier_three = TRUE} and hence pool all - significant peptides in \code{psmQ.txt} for downstream proteoQ. +\item{combine_tier_three}{Logical; if TRUE, combines search results at tier-3 + to tier-1 to form the single output of \code{psmQ.txt}. The default is + FALSE in that data will be segregated into the three quality tiers shown + below by the choice of \code{fdr_type}. Note that the argument affects only + at the \code{fdr_type} of \code{psm} or \code{peptide} and there is no + tier-2 outputs at the \code{fdr_type} of \code{psm} or \code{peptide}. In + general, the one-hit-wonders from the tier-3 should not be used. + + In subproteome analysis, such as phosphoproteome analysis, some proteins + may be well established globally, but fail the significance assessment by + protein FDR on the local scale. In situations like this, it may be suitable + to use \code{fdr_type = "peptide"} or \code{fdr_type = "psm"} and not to + incur \code{combine_tier_three = TRUE}. Tier-1: both proteins and peptides with scores above significance thresholds.