Skip to content

Commit

Permalink
2023-11-11
Browse files Browse the repository at this point in the history
Fixes bugs in custom enzyme specificity.
  • Loading branch information
qzhang503 committed Nov 11, 2023
1 parent 9d994d1 commit c3612f6
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 41 deletions.
2 changes: 1 addition & 1 deletion R/bin_masses.R
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ set_bin_ncores <- function (len_m, enzyme)
else
min(n_cores, len_m)

if (enzyme == "noenzyme")
if (isTRUE(enzyme == "noenzyme"))
n_cores <- floor(n_cores/2L)

max(1L, n_cores)
Expand Down
28 changes: 16 additions & 12 deletions R/mgfs.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ load_mgfs <- function (out_path, mgf_path, min_mass = 200L, max_mass = 4500L,
ok_pars <- identical(call_pars, cache_pars)

# suboptimal for handling matchMS_noenzyme()
if ((!ok_pars) && enzyme == "noenzyme")
if ((!ok_pars) && isTRUE(enzyme == "noenzyme"))
ok_pars <- TRUE

# checks processed mgfs
Expand Down Expand Up @@ -1164,7 +1164,7 @@ extract_mgf_rptrs <- function (xvals, yvals, quant = "none",
tmt_reporter_upper = 135.2,
exclude_reporter_region = FALSE)
{
if (grepl("^tmt.*\\d+", quant)) {
if (isTRUE(grepl("^tmt.*\\d+", quant))) {
ok_rptrs <- lapply(xvals, function (x) x > tmt_reporter_lower &
x < tmt_reporter_upper)
rptr_moverzs <- mapply(function (x, y) x[y], xvals, ok_rptrs,
Expand Down Expand Up @@ -1312,13 +1312,17 @@ find_mgf_type <- function (file)
scan_pd <- "scans: \"\\d+\""
scan_default_pasef <- NULL

if (grepl(file_msconvert_thermo, ln_tit) && grepl(scan_msconv_thermo, ln_tit))
if (isTRUE(grepl(file_msconvert_thermo, ln_tit)) &&
isTRUE(grepl(scan_msconv_thermo, ln_tit)))
"msconv_thermo"
else if (grepl(file_msconvert_pasef, ln_tit) && grepl(scan_msconv_pasef, ln_tit))
else if (isTRUE(grepl(file_msconvert_pasef, ln_tit)) &&
isTRUE(grepl(scan_msconv_pasef, ln_tit)))
"msconv_pasef"
else if (grepl(file_pd, ln_tit) && grepl(scan_pd, ln_tit))
else if (isTRUE(grepl(file_pd, ln_tit)) &&
isTRUE(grepl(scan_pd, ln_tit)))
"pd"
else if (grepl(file_default_pasef, ln_tit) && is.null(scan_default_pasef))
else if (isTRUE(grepl(file_default_pasef, ln_tit)) &&
isTRUE(is.null(scan_default_pasef)))
"default_pasef"
else
stop("Unkown format of MGFs.")
Expand Down Expand Up @@ -2020,6 +2024,8 @@ proc_mdda <- function (spec, raw_file, idx_sc = 3L, idx_osc = 3L, idx_mslev = 2L
ms1_moverzs <- ms1_ints <- ms1_charges <- vector("list", len)
ms0_moverzs <- ms0_ints <- ms0_charges <- character(len)

is_tmt <- if (isTRUE(grepl("^tmt.*\\d+", quant))) TRUE else FALSE

for (i in 1:len) {
x <- spec[[i]]
ids <- .Internal(strsplit(xml2::xml_attr(x, "id"), " ", fixed = TRUE,
Expand All @@ -2032,9 +2038,7 @@ proc_mdda <- function (spec, raw_file, idx_sc = 3L, idx_osc = 3L, idx_mslev = 2L
xc <- xml2::xml_children(x)
ms_levs[[i]] <- ms_lev <- as.integer(xml2::xml_attr(xc[[idx_mslev]], "value"))
scan_titles[[i]] <- xml2::xml_attr(xc[[idx_title]], "value")

is_tmt <- if (grepl("^tmt.*\\d+", quant)) TRUE else FALSE


if (ms_lev == 2L) {
scanList <- xml2::xml_children(xc[[idx_scanList_2]])
scanList_ret <- xml2::xml_children(scanList[[idx_rt_2]])
Expand Down Expand Up @@ -2262,9 +2266,9 @@ proc_dia <- function (spec, raw_file, is_demux = FALSE, idx_sc = 5L, idx_osc = 3
ms_levs <- msx_ns <- integer(len)
msx_moverzs <- msx_ints <- ms2_charges <- vector("list", len)
ms1_moverzs <- ms1_ints <- ms1_charges <- vector("list", len)
demux <- if (is_demux) rep("0", len) else NULL
demux <- if (is_demux) rep_len("0", len) else NULL

is_tmt <- if (grepl("^tmt.*\\d+", quant)) TRUE else FALSE
is_tmt <- if (isTRUE(grepl("^tmt.*\\d+", quant))) TRUE else FALSE

for (i in 1:len) {
x <- spec[[i]]
Expand Down Expand Up @@ -2477,7 +2481,7 @@ proc_dda <- function (spec, raw_file, idx_sc = 3L, idx_osc = 3L,
msx_moverzs <- msx_ints <- ms2_charges <- vector("list", len)
ms1_moverzs <- ms1_ints <- ms1_charges <- character(len)

is_tmt <- if (grepl("^tmt.*\\d+", quant)) TRUE else FALSE
is_tmt <- if (isTRUE(grepl("^tmt.*\\d+", quant))) TRUE else FALSE

for (i in 1:len) {
x <- spec[[i]]
Expand Down
71 changes: 48 additions & 23 deletions R/ms1_precursors.R
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ calc_pepmasses2 <- function (aa_masses = NULL,
is_fixed_protct <- any(grepl("Protein C-term", fixedmods))

# --- Forward sequences ---
# (enzyme can be NULL)
if (isTRUE(enzyme == "noenzyme")) {
if (max_len > 25L)
warning("May be out of RAM at `max_len = ", max_len, "`.\n",
Expand Down Expand Up @@ -334,7 +335,7 @@ calc_pepmasses2 <- function (aa_masses = NULL,
fwd_peps <- chunksplit(fwd_peps, n_cores, "list")

# (a) Optional semi-enzymatic peptides
if (grepl("^semi", enzyme)) {
if (isTRUE(grepl("^semi", enzyme))) {
message("Generating semi-enzymatic peptides.")

cl <- parallel::makeCluster(getOption("cl.cores", n_cores))
Expand Down Expand Up @@ -778,7 +779,7 @@ find_aa_site <- function (pos_site)
{
nms <- names(pos_site)

site <- if (grepl("[NC]{1}-term", nms))
site <- if (isTRUE(grepl("[NC]{1}-term", nms)))
gsub("(Protein|Any) ([NC]{1}-term)", "\\2", nms)
else
pos_site
Expand Down Expand Up @@ -1955,9 +1956,25 @@ make_fastapeps0 <- function (fasta_db, enzyme = "trypsin_p", custom_enzyme = NUL
inds_m <- grep("^M", fasta_db)

if (is.null(enzyme)) {
patc <- custom_enzyme[["Cterm"]]
patn <- custom_enzyme[["Nterm"]]

# already checked custom_enzyme names in matchMS
if ((len_enz <- length(custom_enzyme)) == 2L) {
patc <- custom_enzyme[["Cterm"]]
patn <- custom_enzyme[["Nterm"]]
}
else if (len_enz == 1L) {
if ((tm_enz <- names(custom_enzyme)) == "Cterm") {
patc <- custom_enzyme[["Cterm"]]
patn <- NULL
}
else if (tm_enz == "Nterm") {
patc <- NULL
patn <- custom_enzyme[["Nterm"]]
}
else {
stop("Custom enzyme terminal is not `Cterm` or `Nterm`.")
}
}

if (!is.null(patc)) {
if (grepl("\\^", patc)) {
fasta_db <- lapply(fasta_db, function (x)
Expand Down Expand Up @@ -2024,55 +2041,55 @@ make_fastapeps0 <- function (fasta_db, enzyme = "trypsin_p", custom_enzyme = NUL
fixed = FALSE, useBytes = FALSE)),
"-"))
}
else if (enzyme == "argc"|| enzyme == "semiargc") {
else if (enzyme == "argc" || enzyme == "semiargc") {
fasta_db <- lapply(fasta_db, function (x)
paste0("-",
.Internal(gsub("([R]{1})", paste0("\\1", "@"), x,
ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)),
"-"))
}
else if (enzyme == "lysc_p"|| enzyme == "semilysc_p") {
else if (enzyme == "lysc_p" || enzyme == "semilysc_p") {
fasta_db <- lapply(fasta_db, function (x)
paste0("-",
.Internal(gsub("([K]{1})([^P]{1})", paste0("\\1", "@", "\\2"), x,
ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)),
"-"))
}
else if (enzyme == "chymotrypsin"|| enzyme == "semichymotrypsin") {
else if (enzyme == "chymotrypsin" || enzyme == "semichymotrypsin") {
fasta_db <- lapply(fasta_db, function (x)
paste0("-",
.Internal(gsub("([FWY]{1})", paste0("\\1", "@"), x,
ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)),
"-"))
}
else if (enzyme == "gluc"|| enzyme == "semigluc") {
else if (enzyme == "gluc" || enzyme == "semigluc") {
fasta_db <- lapply(fasta_db, function (x)
paste0("-",
.Internal(gsub("([E]{1})", paste0("\\1", "@"), x,
ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)),
"-"))
}
else if (enzyme == "glun"|| enzyme == "semiglun") {
else if (enzyme == "glun" || enzyme == "semiglun") {
fasta_db <- lapply(fasta_db, function (x)
paste0("-",
.Internal(gsub("([E]{1})", paste0("@", "\\1"), x,
ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)),
"-"))
}
else if (enzyme == "aspc"|| enzyme == "semiaspc") {
else if (enzyme == "aspc" || enzyme == "semiaspc") {
fasta_db <- lapply(fasta_db, function (x)
paste0("-",
.Internal(gsub("([D]{1})", paste0("\\1", "@"), x,
ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)),
"-"))
}
else if (enzyme == "aspn"|| enzyme == "semiaspn") {
else if (enzyme == "aspn" || enzyme == "semiaspn") {
fasta_db <- lapply(fasta_db, function (x)
paste0("-",
.Internal(gsub("([D]{1})", paste0("@", "\\1"), x,
Expand Down Expand Up @@ -2867,7 +2884,8 @@ calcms1mass_noterm_byprot <- function (prot_peps, aa_masses,
calcms1mass_noterm_bypep <- function (aa_seq, aa_masses, maxn_vmods_per_pep = 5L,
maxn_sites_per_vmod = 3L)
{
if (is.na(aa_seq)) return(NULL)
if (is.na(aa_seq))
return(NULL)

aas <- .Internal(strsplit(aa_seq, "", fixed = TRUE, perl = FALSE,
useBytes = FALSE))
Expand Down Expand Up @@ -2915,15 +2933,22 @@ distri_peps <- function (prps, aa_masses_all, motifs_all, max_miss = 2L,
# semi enzymes: don't know the maximum number of
# C-term peptides that can end with "-"
# (N-term remains the same)
n1 <- if (enzyme == "noenzyme")
Inf
else
(max_miss + 1L) * 2L

n2 <- if (grepl("^semi", enzyme) || enzyme == "noenzyme")
Inf
else
ct_counts(max_miss)

if (is.null(enzyme)) {
n1 <- (max_miss + 1L) * 2L
n2 <- ct_counts(max_miss)
}
else {
n1 <- if (enzyme == "noenzyme")
Inf
else
(max_miss + 1L) * 2L

n2 <- if (grepl("^semi", enzyme) || enzyme == "noenzyme")
Inf
else
ct_counts(max_miss)
}

# ZN207_HUMAN: MGRKKKK (no N-term pep_seq at 2 misses and min_len >= 7L)

Expand All @@ -2945,7 +2970,7 @@ ct_counts <- function (max_miss = 2L)
{
ct <- integer(max_miss)

if (max_miss > 0L) {
if (max_miss) {
ct[1] <- 2L

for (i in 1:max_miss) {
Expand Down
23 changes: 20 additions & 3 deletions R/msmsmatches.R
Original file line number Diff line number Diff line change
Expand Up @@ -829,8 +829,19 @@ matchMS <- function (out_path = "~/mzion/outs",
if ((!is.na(topn_ms2ion_cuts)) && (topn_ms2ion_cuts == "")) topn_ms2ion_cuts <- NA
if (!(is.numeric(ms1_notches) && length(ms1_notches))) ms1_notches <- 0
if (is.null(noenzyme_maxn)) noenzyme_maxn <- 0L
if ((!is.null(custom_enzyme)) && custom_enzyme == "")
custom_enzyme = c(Cterm = NULL, Nterm = NULL)

if (!is.null(custom_enzyme)) {
if (!all(names(custom_enzyme) %in% c("Cterm", "Nterm")))
stop("Custom enzyme terminals need to be `Cterm` and/or `Nterm`.")

if (length(custom_enzyme) > 2L)
stop("Custom enzyme cannot have more than 2 terminals.")

custom_enzyme <- custom_enzyme[custom_enzyme != ""]

if (!length(custom_enzyme))
custom_enzyme = c(Cterm = NULL, Nterm = NULL)
}

oks <- fasta != ""

Expand Down Expand Up @@ -1571,10 +1582,12 @@ matchMS <- function (out_path = "~/mzion/outs",
if (bypass_protacc && file.exists(file_protacc))
df <- qs::qread(file_protacc)
else {
if (enzyme != "noenzyme" || isTRUE(dots[["direct_prot_acc"]]))
if (is.null(enzyme) ||
(enzyme != "noenzyme" || isTRUE(dots[["direct_prot_acc"]]))) {
df <- add_protacc(out_path = out_path,
.path_cache = .path_cache,
.path_fasta = .path_fasta)
}
else {
silac_noenzyme <- if (isTRUE(dots$silac_noenzyme)) TRUE else FALSE

Expand Down Expand Up @@ -2064,6 +2077,10 @@ check_tmt_pars <- function (fixedmods, varmods, quant)

tmts <- fvmods[grepl("^TMT", fvmods)]

if (!length(tmts))
warning("No fixed or variable modifications of TMT were specified at ",
"\"quant = ", quant, "\"")

if (quant == "tmt18") {
ok <- all(grepl("TMTpro18.* |TMT18plex.* ", tmts))

Expand Down
8 changes: 6 additions & 2 deletions R/scores.R
Original file line number Diff line number Diff line change
Expand Up @@ -1757,8 +1757,12 @@ prep_pepfdr_td <- function (td = NULL, out_path, enzyme = "trypsin_p",
top3s <- cts$pep_mod_group[which_topx2(cts$n, 3)[1:3]]
top3s <- top3s[!is.na(top3s)]

enzyme <- tolower(enzyme)
is_nes <- enzyme == "noenzyme" || grepl("^semi", enzyme)
if (is.null(enzyme))
is_nes <- FALSE
else {
enzyme <- tolower(enzyme)
is_nes <- isTRUE(enzyme == "noenzyme") || isTRUE(grepl("^semi", enzyme))
}

if (!is_nes)
if (!nes_fdr_group %in% c("all", "base"))
Expand Down

0 comments on commit c3612f6

Please sign in to comment.