diff --git a/DESCRIPTION b/DESCRIPTION index 9e5b18a..912013e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: mzion Type: Package Title: Proteomics Database Searches of Mass-spectrometrirc Data. -Version: 1.3.9 +Version: 1.3.9.1 Authors@R: person(given = "Qiang", family = "Zhang", diff --git a/R/msfilereader.R b/R/msfilereader.R index d7fba81..53b0686 100644 --- a/R/msfilereader.R +++ b/R/msfilereader.R @@ -4,6 +4,9 @@ #' @param filelist A list of RAW MS files. readRAW <- function (mgf_path = NULL, filelist = NULL) { + sys_path <- system.file("extdata", package = "mzion") + acceptMSFileReaderLicense(sys_path) + temp_dir <- create_dir(file.path(mgf_path, "temp_dir")) local({ @@ -46,7 +49,8 @@ proc_raws <- function (raw_file, mgf_path, temp_dir) filepeak <- exeReadRAW(raw_file, mgf_path) pathfile <- file.path(mgf_path, filepeak) lines <- readLines(pathfile) - # lines <- stringi::stri_read_lines(pathfile) # memory allocation or access error + # DON'T; see the help document; memory allocation or access error + # lines <- stringi::stri_read_lines(pathfile) idx_scans <- .Internal(which(stringi::stri_cmp_eq(lines, "SCAN"))) scan_nums <- lines[idx_scans + 1L] @@ -75,16 +79,20 @@ proc_raws <- function (raw_file, mgf_path, temp_dir) iso_lwrs <- iso_ctrs - half_widths iso_uprs <- iso_ctrs + half_widths + len <- length(msx_moverzs) + na_ints <- rep_len(NA_integer_, len) + na_reals <- rep_len(NA_real_, len) + out <- list( msx_moverzs = msx_moverzs, msx_ints = msx_ints, msx_ns = msx_ns, - ms1_moverzs = NA_real_, - ms1_charges = NA_integer_, - ms1_ints = NA_real_, + ms1_moverzs = na_reals, + ms1_charges = na_ints, + ms1_ints = na_reals, scan_title = scan_titles, - raw_file = raw_file, # single entry + raw_file = raw_file, # scalar ms_level = ms_levels, ret_time = ret_times, scan_num = as.integer(scan_nums), @@ -115,7 +123,6 @@ proc_raws <- function (raw_file, mgf_path, temp_dir) exeReadRAW <- function(raw_file, mgf_path) { sys_path <- system.file("extdata", package = "mzion") - acceptMSFileReaderLicense(sys_path) exe <- file.path(sys_path, name_exe = "ReadRAW.exe") mono <- if (Sys.info()['sysname'] %in% c("Darwin", "Linux")) TRUE else FALSE diff --git a/R/mzml.R b/R/mzml.R index 97c7da7..6f78698 100644 --- a/R/mzml.R +++ b/R/mzml.R @@ -62,36 +62,25 @@ readmzML <- function (filelist = NULL, mgf_path = NULL, data_type = "mzml", # - find_ms1stat # - if (data_type == "raw") - message("Processing RAW files") - else if (data_type == "mzml") - message("Processing mzML files.") - temp_dir <- create_dir(file.path(mgf_path, "temp_dir")) if (data_type == "raw") { + message("Processing RAW files") peakfiles <- readRAW(mgf_path = mgf_path, filelist = filelist) } else if (data_type == "mzml") { + message("Processing mzML files.") peakfiles <- hloadMZML(filelist, mgf_path, temp_dir) gc() # free up xml pointers } else { - is_dia <- TRUE - peakfiles <- "23aug2017_hela_serum_timecourse_wide_1a.raw.rds" - - is_dia <- FALSE - peakfiles <- "20230108_AST_Neo1_DDA_UHG_HeLa_200ng_2th2p5ms_Cycle_01_20230808143253.raw.rds" - is_dia <- FALSE peakfiles <- "01CPTAC3_Benchmarking_W_BI_20170508_BL_f02.raw.rds" + mzml_type <- "pwiz_3d" is_dia <- FALSE - peakfiles <- "CPTAC_CCRCC_W_JHU_20190112_LUMOS_C3N-01261_T.raw.rds" - - # peakfiles <- qs::qread("~/peakfiles_bi_g1.rds") - peakfiles <- qs::qread(file.path(temp_dir, "peaklists.rds")) - is_dia <- FALSE + peakfiles <- "01CPTAC3_Benchmarking_W_BI_20170508_BL_f02.raw.rds" + mzml_type <- "raw" } peakfile1 <- peakfiles[[1]] @@ -262,6 +251,7 @@ readmzML <- function (filelist = NULL, mgf_path = NULL, data_type = "mzml", MoreArgs = list( mgf_path = mgf_path, temp_dir = temp_dir, + mzml_type = mzml_type, ppm_ms1 = ppm_ms1, ppm_ms2 = ppm_ms2, maxn_mdda_precurs = maxn_mdda_precurs, topn_ms2ions = topn_ms2ions, @@ -295,6 +285,7 @@ readmzML <- function (filelist = NULL, mgf_path = NULL, data_type = "mzml", MoreArgs = list( mgf_path = mgf_path, temp_dir = temp_dir, + mzml_type = mzml_type, ppm_ms1 = ppm_ms1, ppm_ms2 = ppm_ms2, maxn_mdda_precurs = maxn_mdda_precurs, topn_ms2ions = topn_ms2ions, @@ -1041,15 +1032,15 @@ extrDDA <- function (spec = NULL, raw_file = NULL, temp_dir = NULL, #' Helper of \link{deisoDDA}. -#' +#' #' @param raw_id A raw file id. -#' @param n_para The allowance of parallel processing. +#' @param n_para The allowance of parallel processing. #' @param mgf_cutmzs The cut points in peak lists. #' @param mgf_cutpercs The percentage of cuts. #' @inheritParams deisoDDA #' @inheritParams matchMS hdeisoDDA <- function (filename, raw_id = 1L, mgf_path = NULL, temp_dir = NULL, - ppm_ms1 = 10L, ppm_ms2 = 10L, + mzml_type = "raw", ppm_ms1 = 10L, ppm_ms2 = 10L, maxn_mdda_precurs = 5L, topn_ms2ions = 150L, n_mdda_flanks = 6L, n_dia_scans = 4L, min_mass = 200L, max_mass = 4500L, @@ -1069,6 +1060,7 @@ hdeisoDDA <- function (filename, raw_id = 1L, mgf_path = NULL, temp_dir = NULL, df <- deisoDDA( filename, temp_dir = temp_dir, + mzml_type = mzml_type, ppm_ms1 = ppm_ms1, ppm_ms2 = ppm_ms2, maxn_mdda_precurs = maxn_mdda_precurs, @@ -1127,9 +1119,11 @@ hdeisoDDA <- function (filename, raw_id = 1L, mgf_path = NULL, temp_dir = NULL, #' #' @param filename A peaklist filename. #' @param temp_dir A temp_dir to the filename. +#' @param mzml_type The type of peak lists. To trigger MS1 deisotoping at +#' \code{mzml_type == "raw"} (MS1 X, y and Z are NA with Mzion). #' @param n_para The allowance of parallel processing. #' @inheritParams matchMS -deisoDDA <- function (filename = NULL, temp_dir = NULL, +deisoDDA <- function (filename = NULL, temp_dir = NULL, mzml_type = "raw", ppm_ms1 = 10L, ppm_ms2 = 10L, maxn_mdda_precurs = 5L, topn_ms2ions = 150L, n_mdda_flanks = 6L, n_dia_scans = 4L, @@ -1156,9 +1150,9 @@ deisoDDA <- function (filename = NULL, temp_dir = NULL, msx_moverzs <- ans$msx_moverzs msx_ints <- ans$msx_ints msx_ns <- ans$msx_ns - ms1_moverzs <- ans$ms1_moverzs # by MSConvert - ms1_ints <- ans$ms1_ints # by MSConvert - ms1_charges <- ans$ms1_charges # by MSConvert + ms1_moverzs <- ans$ms1_moverzs # by MSConvert; Mzion: NA + ms1_ints <- ans$ms1_ints # by MSConvert; Mzion: NA + ms1_charges <- ans$ms1_charges # by MSConvert; Mzion: NA scan_title <- ans$scan_title raw_file <- ans$raw_file # scalar ms_level <- ans$ms_level @@ -1249,6 +1243,11 @@ deisoDDA <- function (filename = NULL, temp_dir = NULL, } } + # MS1 X, Y and Z are NA from Mzion::readRAW; need to trigger deisotoping + if (mzml_type == "raw" && maxn_mdda_precurs < 1L) { + maxn_mdda_precurs <- 1L + } + if (maxn_mdda_precurs) { if (FALSE) { # slower but keep the codes for group split examples n_cores <- n_para diff --git a/inst/extdata/ReadRAW/Program.cs b/inst/extdata/ReadRAW/Program.cs new file mode 100644 index 0000000..dd5fabb --- /dev/null +++ b/inst/extdata/ReadRAW/Program.cs @@ -0,0 +1,213 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using ThermoFisher.CommonCore.Data.Business; +using ThermoFisher.CommonCore.Data.Interfaces; +using ThermoFisher.CommonCore.RawFileReader; + +namespace MYSPACE +{ + public static class StringArrayExtensions + { + public static int NthIndexOf(this string[] array, string value, int n) + { + int count = 0; + for (int i = 0; i < array.Length; i++) + { + if (array[i] == value) + { + count++; + if (count == n) + { + return i; + } + } + } + return -1; + } + } + + public static class IRawDataPlusExtension + { + public static void WriteSpectrum(this IRawDataPlus rawFile, string filename, List L) + { + using (System.IO.StreamWriter file = new System.IO.StreamWriter(filename)) + { + int IndexMonoMZ = 8, IndexIsoWidth = 12; + double isoMass = 0.0, isoWidth = 0.0; + string msOrder = "0"; + bool isCentroidMS2 = true; + + file.WriteLine("RAW\n{0}\n", Path.GetFileName(rawFile.FileName)); + + foreach (int scanNumber in L) + { + var scanStatistics = rawFile.GetScanStatsForScanNumber(scanNumber); + var scanEvent = rawFile.GetScanEventForScanNumber(scanNumber); + + file.WriteLine("SCAN\n{0}", scanNumber); + file.WriteLine("TIME\n{0}", Math.Round(Convert.ToDouble(scanStatistics.StartTime), 6)); + + // MS order + msOrder = scanEvent.MSOrder.ToString().ToLower(); + + if (msOrder == "ms2") + { + msOrder = "2"; + } + else if (msOrder == "ms") + { + msOrder = "1"; + } + else + { + msOrder = "0"; + } + + file.WriteLine("MSORDER\n{0}", msOrder); + + // Centroid MS2 + isCentroidMS2 = scanStatistics.IsCentroidScan && msOrder == "2"; + + if (isCentroidMS2) + { + var reaction0 = scanEvent.GetReaction(0); // only available with centroid scans + isoMass = Math.Round(Convert.ToDouble(reaction0.PrecursorMass), 4); + isoWidth = Math.Round(Convert.ToDouble(reaction0.IsolationWidth), 4); + } + + var centroidStream = rawFile.GetCentroidStream(scanNumber, false); + var NcentroidPeaks = centroidStream.Length; + + if (NcentroidPeaks > 0) + { + var scanTrailer = rawFile.GetTrailerExtraInformation(scanNumber); + IndexMonoMZ = scanTrailer.Labels.NthIndexOf("Monoisotopic M/Z:", 1); + IndexIsoWidth = scanTrailer.Labels.NthIndexOf("MS2 Isolation Width:", 1); + + double[] masses = new double[NcentroidPeaks]; + double[] intensities = new double[NcentroidPeaks]; + + for (int i = 0; i < NcentroidPeaks; i++) + { + masses[i] = Math.Round(Convert.ToDouble(centroidStream.Masses[i]), 4); + intensities[i] = Math.Round(Convert.ToDouble(centroidStream.Intensities[i]), 1); + } + + if (isCentroidMS2) + { + file.WriteLine("ISOMASS\n{0}", isoMass); + file.WriteLine("ISOWIDTH\n{0}", isoWidth); + } + else + { + if (IndexMonoMZ >= 0) + { + file.WriteLine("ISOMASS\n{0}", scanTrailer.Values[IndexMonoMZ]); // "-1.00" for DIA + } + else + { + file.WriteLine("ISOMASS\n0"); + } + + if (IndexIsoWidth >= 0) + { + file.WriteLine("ISOWIDTH\n{0}", scanTrailer.Values[IndexIsoWidth]); // "-1.00" for DIA + } + else + { + file.WriteLine("ISOWIDTH\n0"); + } + } + + file.WriteLine("NPEAKS\n{0}", NcentroidPeaks); + file.WriteLine("X\n{0}", string.Join(",", masses)); + file.WriteLine("Y\n{0}", string.Join(",", intensities)); + // file.WriteLine("LOWMASS\n{0}", masses[0]); + // file.WriteLine("HIGHMASS\n{0}", masses[masses.Length - 1]); + file.WriteLine("TITLE\nFile: {0}; scans: {1}\n", rawFile.FileName, scanNumber); + } + else + { + file.WriteLine("ISOMASS\n0"); + file.WriteLine("ISOWIDTH\n0"); + file.WriteLine("NPEAKS\n0"); + file.WriteLine("X\n0"); + file.WriteLine("Y\n0"); + // file.WriteLine("LOWMASS\n0"); + // file.WriteLine("HIGHMASS\n0"); + file.WriteLine("TITLE\nFile: {0}; scans: {1}\n", rawFile.FileName, scanNumber); + } + + } + } + + return; + } + + } +} + + +namespace MyApp +{ + using MYSPACE; + + internal static class Program + { + private static void Main(string[] args) + { + + string filename = string.Empty; + + filename = args[0]; + + if (string.IsNullOrEmpty(filename)) + { + Console.WriteLine("No RAW file specified!"); + return; + } + + try + { + var rawFile = RawFileReaderAdapter.FileFactory(filename); + + if (!rawFile.IsOpen || rawFile.IsError) + { + Console.WriteLine("Unable to access the RAW file using the RawFileReader class!"); + return; + } + + if (rawFile.IsError) + { + return; + } + + if (rawFile.InAcquisition) + { + return; + } + + rawFile.SelectInstrument(Device.MS, 1); + + int firstScanNumber = rawFile.RunHeaderEx.FirstSpectrum; + int lastScanNumber = rawFile.RunHeaderEx.LastSpectrum; + List scans = Enumerable.Range(firstScanNumber, lastScanNumber).ToList(); + + if (scans.Count > 0) + { + rawFile.WriteSpectrum(args[1], scans); + } + + return; + } + catch (Exception ex) + { + Console.WriteLine("Error accessing RAWFileReader library! - " + ex.Message); + } + } + + } +} + diff --git a/man/deisoDDA.Rd b/man/deisoDDA.Rd index 8ab1885..6764443 100644 --- a/man/deisoDDA.Rd +++ b/man/deisoDDA.Rd @@ -7,6 +7,7 @@ deisoDDA( filename = NULL, temp_dir = NULL, + mzml_type = "raw", ppm_ms1 = 10L, ppm_ms2 = 10L, maxn_mdda_precurs = 5L, @@ -43,6 +44,9 @@ deisoDDA( \item{temp_dir}{A temp_dir to the filename.} +\item{mzml_type}{The type of peak lists. To trigger MS1 deisotoping at +\code{mzml_type == "raw"} (MS1 X, y and Z are NA with Mzion).} + \item{ppm_ms1}{A positive integer; the mass tolerance of MS1 species. The default is 20.} diff --git a/man/hdeisoDDA.Rd b/man/hdeisoDDA.Rd index b84cca6..59dcf61 100644 --- a/man/hdeisoDDA.Rd +++ b/man/hdeisoDDA.Rd @@ -9,6 +9,7 @@ hdeisoDDA( raw_id = 1L, mgf_path = NULL, temp_dir = NULL, + mzml_type = "raw", ppm_ms1 = 10L, ppm_ms2 = 10L, maxn_mdda_precurs = 5L, @@ -59,6 +60,9 @@ hdeisoDDA( \item{temp_dir}{A temp_dir to the filename.} +\item{mzml_type}{The type of peak lists. To trigger MS1 deisotoping at +\code{mzml_type == "raw"} (MS1 X, y and Z are NA with Mzion).} + \item{ppm_ms1}{A positive integer; the mass tolerance of MS1 species. The default is 20.} diff --git a/vignettes/README.Rmd b/vignettes/README.Rmd index deca831..e65c4c4 100644 --- a/vignettes/README.Rmd +++ b/vignettes/README.Rmd @@ -57,8 +57,10 @@ devtools::install_github("qzhang503/mzionShiny") ``` ## Peaklist formats + - Thermo's MS - - [x] `MSConvert mgf` + - [x] `RAW` + - [x] `MSConvert mzML` [+] Binary encode 64-bit @@ -68,17 +70,20 @@ devtools::install_github("qzhang503/mzionShiny") [+] TPP compatibility [-] Use zlib compression (*do not compress*) - - [+] Filters - (1) peakPicking: vendor msLevel = 1- - - (2) (optional) zeroSamples: removeExtra 1- + + [+] Filters + + (1) peakPicking: vendor msLevel = 1- + + (2) (optional) zeroSamples: removeExtra 1- + + - [x] `MSConvert mgf` - Bruker's MS - [x] `DataAnalysis mgf` - + ## Database searches (via an app) -``` {r shinyapp, eval=FALSE} +```{r shinyapp, eval=FALSE} library(mzionShiny) run_app() ``` @@ -131,16 +136,20 @@ matchMS( ``` ## Help documents + Enter `?mzion::matchMS` from an RStudio section. ## Specifications of fixed and variable modifications + The Unimod definition of positions and sites were adopted by Mzion for specifying fixed and variable modifications. The value of a position is in one of "Anywhere", "Protein N-term", "Protein C-term", "Any N-term" or "Any C-term". The last two position labels can be shorthanded as "N-term" and "C-term". A site is a one-letter representation of the twenty amino-acid residues, as well as the terminal sites of "N-term" and "C-term". The general format in specifying a fixed or variable modification is `title (position = site)` where title is a unique character string without space. At a position of "Anywhere", the modification can be shorthanded as `title (site)`, for example, `TMT10plex (K)`. For a terminal modification at any site, it can be abbreviated as `title (position)`, for examples, `Acetyl (Protein N-term)` and `TMT10plex (N-term)`. There are circumstances that both position and site are needed for specifying a modification, for instance, `Gln->pyro-Glu (N-term = Q)`. More examples are available in the help document of Mzion utility of `parse_unimod`. ## Data QC and mining + - [proteoQ](https://github.com/qzhang503/proteoQ/) ## Other utilities -- `mapMS2ions`: visualizes MS2 spectrum matches + +- `mapMS2ions`: visualizes MS2 spectrum matches - `table_unimods` summarizes Unimod entries - `find_unimod`: finds a Unimod entry - `parse_unimod`: parses a Unimod entry @@ -151,4 +160,5 @@ The Unimod definition of positions and sites were adopted by Mzion for specifyin - `make_mztab`: prepares a mzTab file from the search results ## Cite Mzion + Zhang, Q. Mzion enables deep and precise identification of peptides in data-dependent acquisition proteomics. Sci. Rep. 13, 7056 (2023).