Skip to content

Commit

Permalink
2023-12-01
Browse files Browse the repository at this point in the history
Bug fixes:
- Unplanned removals of mgf or mzML files with their folder name other than mgf or mzML under an output folder.
- Handled the case of mzML with empty MS2.
  • Loading branch information
qzhang503 committed Dec 1, 2023
1 parent b6094ec commit add0966
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 33 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: mzion
Type: Package
Title: Proteomics Database Searches of Mass-spectrometrirc Data.
Version: 1.3.4
Version: 1.3.5
Authors@R:
person(given = "Qiang",
family = "Zhang",
Expand Down
125 changes: 112 additions & 13 deletions R/mgfs.R
Original file line number Diff line number Diff line change
Expand Up @@ -113,13 +113,13 @@ load_mgfs <- function (out_path, mgf_path, min_mass = 200L, max_mass = 4500L,

delete_files(
out_path,
ignores = c("\\.[Rr]$", "\\.(mgf|MGF)$", "\\.mzML$", "\\.xlsx$",
"\\.xls$", "\\.csv$", "\\.txt$", "\\.tsv$", "\\.pars$",
"^mgf$", "^mgfs$", "Calls",
# in case of reprocessing after proteoQ
ignores = c("\\.[Rr]$", "\\.(mgf|MGF)$", "\\.(mzML|mzml)$",
"\\.xlsx$", "\\.xls$", "\\.csv$", "\\.txt$", "\\.pars$",
"^mgf$", "^mgfs$", "^mzML$", "^mzMLs$",
"Calls", "^PSM$", "^Peptide$", "^Protein$",
"fraction_scheme.rda", "label_scheme.rda",
"label_scheme_full.rda"))

fi_mgf <- list.files(path = mgf_path, pattern = "^.*\\.(mgf|MGF)$")
fi_mzml <- list.files(path = mgf_path, pattern = "^.*\\.(mzML|mzml)$")
len_mgf <- length(fi_mgf)
Expand Down Expand Up @@ -1782,36 +1782,102 @@ read_mzml <- function (xml_file, topn_ms2ions = 100L,

if (xml2::xml_attr(xc[[idx_mslev]], "value") == "2") {
idx_precursor_2 <- grep("precursorList", xc)
if (!length(idx_precursor_2)) {
warning("Fields of `precursorList` not found.")
idx_precursor_2 <- 12L
}

idx_scanList_2 <- grep("scanList", xc)
if (!length(idx_scanList_2)) {
warning("Fields of `scanList` not found.")
idx_scanList_2 <- 11L
}

idx_bin_2 <- grep("binaryDataArrayList", xc)
if (!length(idx_bin_2)) {
warning("Fields of `binaryDataArrayList` not found.")
idx_bin_2 <- 13L
}

scanList <- xml2::xml_children(xc[[idx_scanList_2]])

idx_rt_2 <- which(xml2::xml_name(scanList) == "scan")
if (!length(idx_rt_2)) {
warning("Fields of `scan` not found.")
idx_rt_2 <- 2L
}

scanList_ret <- xml2::xml_children(scanList[[idx_rt_2]])
scanList_ret_attrs <- xml2::xml_attr(scanList_ret, "name")
idx_scan_start_2 <- which(scanList_ret_attrs == "scan start time")
ms2_reso <- scanList_ret[[which(scanList_ret_attrs == "mass resolving power")]]
ms2_reso <- as.integer(xml2::xml_attr(ms2_reso, "value"))

idx_scan_start_2 <- which(scanList_ret_attrs == "scan start time") # 1
if (!length(idx_scan_start_2)) {
warning("Fields of `scan start time` not found.")
idx_scan_start_2 <- 1L
}

idx_ms2_reso <- which(scanList_ret_attrs == "mass resolving power")
if (length(idx_ms2_reso)) {
ms2_reso <- scanList_ret[[idx_ms2_reso]]
ms2_reso <- as.integer(xml2::xml_attr(ms2_reso, "value"))
} else {
warning("Fields of `mass resolving power` not found.")
ms2_reso <- 120000L
}

# entire MS2 is empty
if (length(xc) < idx_precursor_2) {
next
}

precursorList <- xml2::xml_children(xc[[idx_precursor_2]])
precursor <- precursorList[[1]] # assume one precursor
precursorc <- xml2::xml_children(precursor)
idx_selectedIonList <- grep("selectedIonList", precursorc)

idx_selectedIonList <- grep("selectedIonList", precursorc)
if (!length(idx_selectedIonList)) {
warning("Fields of `selectedIonList` not found.")
idx_selectedIonList <- 2L
}

idx_isolationWindow <- grep("isolationWindow", precursorc)
if (!length(idx_isolationWindow)) {
warning("Fields of `isolationWindow` not found.")
idx_isolationWindow <- 1L
}

isolationWindowc <- xml2::xml_children(precursorc[[idx_isolationWindow]])
iso_nms <- lapply(isolationWindowc, function (x) xml2::xml_attr(x, "name"))

idx_ctrmz <- which(iso_nms == "isolation window target m/z")
if (!length(idx_ctrmz)) {
warning("Fields of `isolation window target m/z` not found.")
idx_ctrmz <- 1L
}

idx_lwrmz <- which(iso_nms == "isolation window lower offset")
if (!length(idx_lwrmz)) {
warning("Fields of `isolation window lower offset` not found.")
idx_lwrmz <- 2L
}

idx_uprmz <- which(iso_nms == "isolation window upper offset")
if (!length(idx_uprmz)) {
warning("Fields of `isolation window upper offset` not found.")
idx_uprmz <- 3L
}

selectedIon <- xml2::xml_child(precursorc[[idx_selectedIonList]], 1)
selectedIonc <- xml2::xml_children(selectedIon)
selion_nms <- lapply(selectedIonc, function (x) xml2::xml_attr(x, "name"))

idx_moverz <- which(selion_nms == "selected ion m/z")
idx_ms1int <- which(selion_nms == "peak intensity") # DIA: zero intensity
if (!length(idx_moverz)) {
warning("Fields of `selected ion m/z` not found.")
idx_moverz <- 1L
}

## MSConcert: no "peak intensity"
idx_ms1int <- which(selion_nms == "peak intensity") # DIA: zero intensity
if (!length(idx_ms1int)) {
if (count <= allowance) {
count <- count + 1L
Expand All @@ -1822,8 +1888,7 @@ read_mzml <- function (xml_file, topn_ms2ions = 100L,
idx_ms1int <- 3L
}
}
##


idx_charge <- which(selion_nms == "charge state") # DIA: no "charge state"

# better let user define as MSConvert may be missing "charge state"
Expand All @@ -1843,7 +1908,16 @@ read_mzml <- function (xml_file, topn_ms2ions = 100L,

if (as.integer(xml2::xml_attr(xc[[idx_mslev]], "value")) == 1) {
idx_scanList_1 <- grep("scanList", xc)
if (!length(idx_scanList_1)) {
warning("Fields of `scanList` not found.")
idx_scanList_1 <- 11L
}

idx_bin_1 <- grep("binaryDataArrayList", xc)
if (!length(idx_bin_1)) {
warning("Fields of `binaryDataArrayList` not found.")
idx_bin_1 <- 12L
}

local({
binData <- xml2::xml_children(xc[[idx_bin_1]])[[1]]
Expand All @@ -1861,14 +1935,24 @@ read_mzml <- function (xml_file, topn_ms2ions = 100L,
})

scanList <- xml2::xml_children(xc[[idx_scanList_1]])

idx_rt_1 <- which(xml2::xml_name(scanList) == "scan")
if (!length(idx_rt_1)) {
warning("Fields of `scan` not found.")
idx_rt_1 <- 2L
}

scanList_ret <- xml2::xml_children(scanList[[idx_rt_1]])

scan_ret_attrs <- xml2::xml_attr(scanList_ret, "name")
# later remove the assumption of a fixed resolution with Astral
# ms1_reso <- scanList_ret[[which(scan_ret_attrs == "mass resolving power")]]
# ms1_reso <- as.integer(xml2::xml_attr(ms1_reso, "value"))
idx_scan_start_1 <- which(scan_ret_attrs == "scan start time")
if (!length(idx_scan_start_1)) {
warning("Fields of `scan start time` not found.")
idx_scan_start_1 <- 1L
}

rm(list = c("scanList", "scanList_ret", "scan_ret_attrs"))
break
Expand Down Expand Up @@ -2036,6 +2120,11 @@ proc_mdda <- function (spec, raw_file, idx_sc = 3L, idx_osc = 3L, idx_mslev = 2L
scanList_ret <- xml2::xml_children(scanList[[idx_rt_2]])
ret_times[[i]] <- xml2::xml_attr(scanList_ret[[idx_scan_start_2]], "value")

# entire MS2 is empty
if (length(xc) < idx_precursor_2) {
next
}

precursorList <- xml2::xml_children(xc[[idx_precursor_2]])
precursor <- precursorList[[1]] # (assume one precursor, not yet chimeric)
precursorc <- xml2::xml_children(precursor)
Expand Down Expand Up @@ -2280,6 +2369,11 @@ proc_dia <- function (spec, raw_file, is_demux = FALSE, idx_sc = 5L, idx_osc = 3
scanList_ret <- xml2::xml_children(scanList[[idx_rt_2]])
ret_times[[i]] <- xml2::xml_attr(scanList_ret[[idx_scan_start_2]], "value")

# entire MS2 is empty
if (length(xc) < idx_precursor_2) {
next
}

precursorList <- xml2::xml_children(xc[[idx_precursor_2]])
precursor <- precursorList[[1]] # (assume one precursor, not yet chimeric)
precursorc <- xml2::xml_children(precursor)
Expand Down Expand Up @@ -2493,6 +2587,11 @@ proc_dda <- function (spec, raw_file, idx_sc = 3L, idx_osc = 3L,
scanList_ret <- xml2::xml_children(scanList[[idx_rt_2]])
ret_times[[i]] <- xml2::xml_attr(scanList_ret[[idx_scan_start_2]], "value")

# entire MS2 is empty
if (length(xc) < idx_precursor_2) {
next
}

precursorList <- xml2::xml_children(xc[[idx_precursor_2]])
precursor <- precursorList[[1]] # (assume one precursor, not yet chimeric)
precursorc <- xml2::xml_children(precursor)
Expand Down
12 changes: 8 additions & 4 deletions R/ms1_precursors.R
Original file line number Diff line number Diff line change
Expand Up @@ -194,10 +194,14 @@ calc_pepmasses2 <- function (aa_masses = NULL,
}
else {
# `mgf_quries.rds` kept (only affected by min_mass, max_mass and ppm_ms1)
delete_files(out_path,
ignores = c("\\.[Rr]$", "\\.(mgf|MGF)$", "\\.xlsx$", "\\.xls$",
"\\.csv$", "\\.txt$", "^mgf$", "^mgfs$",
"\\.pars$"))
delete_files(
out_path,
ignores = c("\\.[Rr]$", "\\.(mgf|MGF)$", "\\.(mzML|mzml)$",
"\\.xlsx$", "\\.xls$", "\\.csv$", "\\.txt$", "\\.pars$",
"^mgf$", "^mgfs$", "^mzML$", "^mzMLs$",
"Calls", "^PSM$", "^Peptide$", "^Protein$",
"fraction_scheme.rda", "label_scheme.rda",
"label_scheme_full.rda"))

.time_stamp <- format(Sys.time(), ".%Y-%m-%d_%H%M%S")
path_tstamp <- file.path(.path_fasta, "ms1masses", .time_stamp)
Expand Down
28 changes: 14 additions & 14 deletions R/msmsmatches2.R
Original file line number Diff line number Diff line change
Expand Up @@ -66,17 +66,15 @@ ms2match <- function (mgf_path, aa_masses_all, out_path, .path_bin,
call_pars <- call_pars[sort(names(call_pars))]

# backward compatibility of old cached parameters
if (TRUE) {
if (".path_bin" %in% names(cache_pars) && ".path_bin" %in% names(call_pars)) {
if (!(is.null(cache_pars$.path_bin) ||
"fs_path" %in% class(cache_pars$.path_bin))) {
cache_pars$.path_bin <- fs::fs_path(cache_pars$.path_bin)
}

if (!(is.null(call_pars$.path_bin) ||
"fs_path" %in% class(call_pars$.path_bin))) {
call_pars$.path_bin <- fs::fs_path(call_pars$.path_bin)
}
if (".path_bin" %in% names(cache_pars) && ".path_bin" %in% names(call_pars)) {
if (!(is.null(cache_pars$.path_bin) ||
"fs_path" %in% class(cache_pars$.path_bin))) {
cache_pars$.path_bin <- fs::fs_path(cache_pars$.path_bin)
}

if (!(is.null(call_pars$.path_bin) ||
"fs_path" %in% class(call_pars$.path_bin))) {
call_pars$.path_bin <- fs::fs_path(call_pars$.path_bin)
}
}

Expand All @@ -100,10 +98,12 @@ ms2match <- function (mgf_path, aa_masses_all, out_path, .path_bin,

delete_files(
out_path,
ignores = c("\\.[Rr]$", "\\.(mgf|MGF)$", "\\.(mzML|mzml)$", "\\.xlsx$",
"\\.xls$", "\\.csv$", "\\.txt$", "\\.pars$",
ignores = c(mgf_path, "\\.[Rr]$", "\\.(mgf|MGF)$", "\\.(mzML|mzml)$",
"\\.xlsx$", "\\.xls$", "\\.csv$", "\\.txt$", "\\.pars$",
"^mgf$", "^mgfs$", "^mzML$", "^mzMLs$",
"Calls", "^PSM$", "^Peptide$", "^Protein$"))
"Calls", "^PSM$", "^Peptide$", "^Protein$",
"fraction_scheme.rda", "label_scheme.rda",
"label_scheme_full.rda"))

# pairs expts and theos
files_a <- list.files(mgf_path, pattern = "^expttheo_", full.names = TRUE)
Expand Down
2 changes: 1 addition & 1 deletion R/utils_os.R
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,7 @@ delete_files <- function (path, ignores = NULL, ...)
if (!is.null(ignores)) {
nms <- local({
dirs <- list.dirs(path, full.names = FALSE, recursive = recursive)
dirs <- dirs[! dirs == ""]
dirs <- dirs[dirs != ""]

idxes_kept <- purrr::map_lgl(dirs, function (x) any(grepl(x, ignores)))

Expand Down

0 comments on commit add0966

Please sign in to comment.