Skip to content

Commit

Permalink
v1.2.3
Browse files Browse the repository at this point in the history
  • Loading branch information
qzhang503 committed Mar 22, 2023
1 parent 8721385 commit 75ce7eb
Show file tree
Hide file tree
Showing 34 changed files with 510 additions and 506 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: proteoM
Type: Package
Title: Database Searches of Proteomic Mass-spectrometrirc Data
Version: 1.2.2.1
Version: 1.2.3
Authors@R:
person(given = "Qiang",
family = "Zhang",
Expand Down
11 changes: 4 additions & 7 deletions R/bin_masses.R
Original file line number Diff line number Diff line change
Expand Up @@ -296,14 +296,11 @@ binTheoSeqs <- function (idxes = NULL, res = NULL, min_mass = 200L,

out_dir <- create_dir(gsub("(^.*/).*$", "\\1", out_path))

out_nms <- gsub("^.*/(.*)\\.[^\\.].*$", "\\1", out_path) %>%
paste(idxes, sep = "_") %>%
paste0(".rds")

res <- res %>%
lapply(attributes) %>%
lapply(`[[`, "data")
out_nms <- gsub("^.*/(.*)\\.[^\\.].*$", "\\1", out_path)
out_nms <- paste0(out_nms, "_", idxes, ".rds")

res <- lapply(res, attributes)
res <- lapply(res, `[[`, "data")
gc()

n_cores <- local({
Expand Down
88 changes: 34 additions & 54 deletions R/fastas.R
Original file line number Diff line number Diff line change
Expand Up @@ -120,15 +120,13 @@ load_fasta <- function (fasta = NULL)

oks <- file.exists(fasta)

if (!all(oks)) {
bads <- fasta[!oks]
stop("Missing FASTA file(s): \n", paste(bads, collapse = "\n"))
}
if (!all(oks))
stop("Missing FASTA file(s): \n", paste(fasta[!oks], collapse = "\n"))

lapply(fasta, function (x) read_fasta(x)) %>%
do.call(`c`, .) %>%
`names<-`(gsub(">", "", names(.))) %>%
.[!duplicated(names(.))]
ans <- lapply(fasta, function (x) read_fasta(x))
ans <- flatten_list(ans)
names(ans) <- gsub(">", "", names(ans))
ans <- ans[!duplicated(names(ans))]
}

### End of also in proteoQ
Expand All @@ -155,30 +153,26 @@ load_fasta <- function (fasta = NULL)
#' fasta_db <- load_fasta2(
#' c("~/proteoM/dbs/fasta/uniprot/uniprot_hs_2020_05.fasta",
#' "~/proteoM/dbs/fasta/crap/crap.fasta"),
#' c("uniprot_acc", "other")
#' )
#' c("uniprot_acc", "other"))
#'
#' # Need `acc_pattern` as "crap" is not one of the default acc_type
#' load_fasta2(
#' c("~/proteoM/dbs/fasta/uniprot/uniprot_hs_2020_05.fasta",
#' "~/proteoM/dbs/fasta/crap/crap.fasta"),
#' c("uniprot_acc", "crap")
#' )
#' c("uniprot_acc", "crap"))
#'
#' # ok
#' fasta_db2 <- load_fasta2(
#' c("~/proteoM/dbs/fasta/uniprot/uniprot_hs_2020_05.fasta",
#' "~/proteoM/dbs/fasta/crap/crap.fasta"),
#' c("uniprot_acc", "crap"),
#' c("^>..\\|([^\\|]+)\\|[^\\|]+", "(.*)")
#' )
#' c("^>..\\|([^\\|]+)\\|[^\\|]+", "(.*)"))
#'
#' fasta_db3 <- load_fasta2(
#' c("~/proteoM/dbs/fasta/uniprot/uniprot_hs_2020_05.fasta",
#' "~/proteoM/dbs/fasta/crap/crap.fasta"),
#' c("my_acc", "crap"),
#' c("^>..\\|([^\\|]+)\\|[^\\|]+", "(.*)")
#' )
#' c("^>..\\|([^\\|]+)\\|[^\\|]+", "(.*)"))
#'
#' stopifnot(identical(fasta_db, fasta_db2),
#' identical(fasta_db, fasta_db3))
Expand All @@ -191,10 +185,8 @@ load_fasta2 <- function (fasta = NULL, acc_type = NULL, acc_pattern = NULL)

oks <- file.exists(fasta)

if (!all(oks)) {
bads <- fasta[!oks]
stop("Missing FASTA file(s): \n", paste(bads, collapse = "\n"))
}
if (!all(oks))
stop("Missing FASTA file(s): \n", paste(fasta[!oks], collapse = "\n"))

len_f <- length(fasta)
len_a <- length(acc_type)
Expand All @@ -204,8 +196,7 @@ load_fasta2 <- function (fasta = NULL, acc_type = NULL, acc_pattern = NULL)
stop("More accession types than fasta files.")

if (len_f < len_p)
stop("More acc_pattern types than fasta files.",
call. = FALSE)
stop("More acc_pattern types than fasta files.")

if (len_a && (len_a < len_f)) {
warning("More fasta files than accession types; ",
Expand All @@ -219,7 +210,7 @@ load_fasta2 <- function (fasta = NULL, acc_type = NULL, acc_pattern = NULL)
acc_pattern <- rep(acc_pattern[1], len_f)
}

if (! (is.null(acc_type) || is.null(acc_pattern))) {
if (!(is.null(acc_type) || is.null(acc_pattern))) {
acc_type <- acc_type
acc_pattern <- acc_pattern
}
Expand All @@ -228,22 +219,24 @@ load_fasta2 <- function (fasta = NULL, acc_type = NULL, acc_pattern = NULL)
acc_pattern <- rep("(.*)", len_f)
}
else if (!is.null(acc_type)) {
acc_pattern <- purrr::map_chr(acc_type, find_acc_pattern)
acc_pattern <- lapply(acc_type, find_acc_pattern)
acc_pattern <- unlist(acc_pattern, recursive = FALSE, use.names = FALSE)
}
else {
acc_type <- purrr::map_chr(acc_pattern, find_acc_type)
acc_type <- lapply(acc_pattern, find_acc_type)
acc_type <- unlist(acc_type, recursive = FALSE, use.names = FALSE)
}

if (length(acc_pattern) != len_f)
stop("Unequal length between `acc_pattern` and `fasta`.")

# Not to USE.NAMES; otherwise fasta names prefix to accession names
# this is different to map2 where names are NULL for each fasta_db
mapply(function (x, y) read_fasta(x, y), fasta, acc_pattern,
SIMPLIFY = FALSE, USE.NAMES = FALSE) %>%
do.call(`c`, .) %>%
`names<-`(gsub(">", "", names(.))) %>%
.[!duplicated(names(.))]
ans <- mapply(function (x, y) read_fasta(x, y), fasta, acc_pattern,
SIMPLIFY = FALSE, USE.NAMES = FALSE)
ans <- flatten_list(ans)
names(ans) <- gsub(">", "", names(ans))
ans <- ans[!duplicated(names(ans))]
}


Expand All @@ -262,23 +255,16 @@ find_acc_pattern <- function (acc_type)
if (!acc_type %in% oks)
stop("`acc_type` is not one of ", paste(oks, collapse = ", "))

acc_pattern <- if (acc_type == "uniprot_acc") {
if (acc_type == "uniprot_acc")
"^>..\\|([^\\|]+)\\|[^\\|]+"
}
else if (acc_type == "uniprot_id") {
else if (acc_type == "uniprot_id")
"^>..\\|[^\\|]+\\|([^ ]+) .*"
}
else if (acc_type == "refseq_acc") {
else if (acc_type == "refseq_acc")
"^>([^ ]+?) .*"
}
else if (acc_type == "other") {
else if (acc_type == "other")
"(.*)"
}
else {
else
stop("Unknown `acc_type`.")
}

invisible(acc_pattern)
}


Expand All @@ -302,22 +288,16 @@ find_acc_type <- function (acc_pattern)
pat_rsacc <- "^>([^ ]+?) "
pat_other <- "(.*)"

acc_type <- if (acc_pattern == pat_upacc) {
if (acc_pattern == pat_upacc)
"uniprot_acc"
}
else if (acc_pattern == pat_upid) {
else if (acc_pattern == pat_upid)
"uniprot_id"
}
else if (acc_pattern == pat_rsacc) {
else if (acc_pattern == pat_rsacc)
"refseq_acc"
}
else if (acc_pattern == pat_other) {
else if (acc_pattern == pat_other)
"other"
}
else {
else
stop("Unknown `acc_pattern`.")
}

invisible(acc_type)
}


45 changes: 21 additions & 24 deletions R/mgfs.R
Original file line number Diff line number Diff line change
Expand Up @@ -237,8 +237,7 @@ readMGF <- function (filepath = NULL, filelist = NULL,
# (2) parallel five MGF files and parallel chunks in each
len <- length(filelist)
n_cores <- min(len, detect_cores(32L))
# n_cores <- min(detect_cores(32L), floor((find_free_mem()/1024)/(sizes * 8)), len)


if (n_cores == 1L)
raw_files <- readlineMGFs(1, filelist, filepath, raw_file)
else {
Expand Down Expand Up @@ -323,10 +322,9 @@ readMGF <- function (filepath = NULL, filelist = NULL,
post_readmgf <- function (df, min_mass = 200L, max_mass = 4500L, ppm_ms1 = 10L,
filepath, out_path)
{
df <- df %>%
dplyr::arrange(ms1_mass) %>%
# dplyr::filter(ms1_mass >= min_mass, ms1_mass <= max_mass) %>%
dplyr::mutate(frame = find_ms1_interval(ms1_mass, from = min_mass, ppm = ppm_ms1))
df <- dplyr::arrange(df, ms1_mass)
# df <- dplyr::filter(df, ms1_mass >= min_mass, ms1_mass <= max_mass)
df <- dplyr::mutate(df, frame = find_ms1_interval(ms1_mass, from = min_mass, ppm = ppm_ms1))

raws_files <- df$raw_file
raws <- raws_files[!duplicated.default(raws_files)]
Expand Down Expand Up @@ -450,8 +448,6 @@ read_mgf_chunks <- function (filepath = "~/proteoM/mgf/temp_1",
n_cores <- min(detect_cores(32L), len)
cl <- parallel::makeCluster(getOption("cl.cores", n_cores))

parallel::clusterExport(cl, list("%>%"), envir = environment(magrittr::`%>%`))

parallel::clusterExport(
cl,
c("stri_startswith_fixed",
Expand Down Expand Up @@ -508,7 +504,7 @@ read_mgf_chunks <- function (filepath = "~/proteoM/mgf/temp_1",
digits = digits)

parallel::stopCluster(cl)
gc()
# gc()

out <- dplyr::bind_rows(out)

Expand Down Expand Up @@ -536,9 +532,10 @@ read_mgf_chunks <- function (filepath = "~/proteoM/mgf/temp_1",

# perfect case of no gaps: two lines of "" and ""
if (length(ab) > 2L) ab else NULL
}) %>%
unlist(use.names = FALSE) %T>%
write(file.path(filepath, "gaps.mgf"))
})

gaps <- unlist(gaps, use.names = FALSE)
write(gaps, file.path(filepath, "gaps.mgf"))

local({
nms <- list.files(path = file.path(filepath), pattern = "^.*\\_[ab]f.mgf$")
Expand Down Expand Up @@ -620,11 +617,10 @@ proc_mgf_chunks <- function (file, topn_ms2ions = 100L,
{
message("Parsing '", file, "'.")
lines <- stringi::stri_read_lines(file)

basename <- gsub("\\.[^.]*$", "", file)

begins <- which(stringi::stri_startswith_fixed(lines, "BEGIN IONS"))
ends <- which(stringi::stri_endswith_fixed(lines, "END IONS"))
begins <- .Internal(which(stringi::stri_startswith_fixed(lines, "BEGIN IONS")))
ends <- .Internal(which(stringi::stri_endswith_fixed(lines, "END IONS")))

af <- local({
le <- ends[length(ends)]
Expand Down Expand Up @@ -708,8 +704,8 @@ proc_mgfs <- function (lines, topn_ms2ions = 100L,
{
options(digits = 9L)

begins <- which(stringi::stri_startswith_fixed(lines, "BEGIN IONS"))
ends <- which(stringi::stri_endswith_fixed(lines, "END IONS"))
begins <- .Internal(which(stringi::stri_startswith_fixed(lines, "BEGIN IONS")))
ends <- .Internal(which(stringi::stri_endswith_fixed(lines, "END IONS")))

## MS1
# (1) m-over-z and intensity
Expand Down Expand Up @@ -751,7 +747,7 @@ proc_mgfs <- function (lines, topn_ms2ions = 100L,
!is.na(ms1_masses))

# timsTOF data may have undetermined charge states
na_rows <- which(is.na(rows))
na_rows <- .Internal(which(is.na(rows)))
if (length(na_rows)) rows[na_rows] <- FALSE

begins <- begins[rows]
Expand Down Expand Up @@ -1452,7 +1448,7 @@ read_mzml <- function (xml_file, tmt_reporter_lower = 126.1, tmt_reporter_upper
## spectrum
xml_root <- xml2::read_xml(xml_file)
mzML <- xml2::xml_child(xml_root)
idx_run <- which(xml2::xml_name(xml2::xml_children(mzML)) == "run") # 8
idx_run <- which(xml2::xml_name(xml2::xml_children(mzML)) == "run")
run <- xml2::xml_children(mzML)[[idx_run]]
idx_specs <- which(xml2::xml_name(xml2::xml_children(run)) == "spectrumList")
spec <- xml2::xml_children(xml2::xml_children(run)[[idx_specs]])
Expand All @@ -1468,14 +1464,14 @@ read_mzml <- function (xml_file, tmt_reporter_lower = 126.1, tmt_reporter_upper
x <- spec[[i]]
scan_nums[i] <- gsub(".* scan=(.*)$", "\\1", xml2::xml_attr(x, "id"))
xc <- xml2::xml_children(x)
idx_precursor <- grep("precursorList", xc) # 12
idx_precursor <- grep("precursorList", xc)
rm(list = c("x"))

if (length(idx_precursor)) {
nms <- xml2::xml_attr(xc, "name")
idx_title <- which(nms == "spectrum title") # 10
idx_title <- .Internal(which(nms == "spectrum title"))
idx_scanList <- grep("scanList", xc) # 11
idx_bin <- grep("binaryDataArrayList", xc) # 13
idx_bin <- grep("binaryDataArrayList", xc)

## title
title <- xml2::xml_attr(xc[[idx_title]], "value")
Expand All @@ -1486,7 +1482,8 @@ read_mzml <- function (xml_file, tmt_reporter_lower = 126.1, tmt_reporter_upper
scanList <- xml2::xml_children(xc[[idx_scanList]])
idx_rt <- grep("scan", scanList) # 2
scanList_scan <- xml2::xml_children(scanList[[idx_rt]])
idx_scan_start <- which(xml2::xml_attr(scanList_scan, "name") == "scan start time") # 1
idx_scan_start <-
.Internal(which(xml2::xml_attr(scanList_scan, "name") == "scan start time"))
ret_times[i] <- xml2::xml_attr(scanList_scan[[idx_scan_start]], "value")
rm(list = c("nms", "title", "scanList_scan", "scanList"))

Expand Down
Loading

0 comments on commit 75ce7eb

Please sign in to comment.