Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved handling of mods in .msp files #3213

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions pwiz_tools/Skyline/Model/Lib/LibResources.designer.cs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions pwiz_tools/Skyline/Model/Lib/LibResources.resx
Original file line number Diff line number Diff line change
Expand Up @@ -341,4 +341,7 @@
<data name="XHunterLibSpec_PEP_RANK_PROCESSED_INTENSITY_Processed_intensity" xml:space="preserve">
<value>Processed intensity</value>
</data>
<data name="NistLibraryBase_GetMod_Unknown_modification__0__at_line__1_" xml:space="preserve">
<value>Unknown modification {0} at line {1}</value>
</data>
</root>
81 changes: 60 additions & 21 deletions pwiz_tools/Skyline/Model/Lib/NistLibSpec.cs
Original file line number Diff line number Diff line change
Expand Up @@ -787,7 +787,8 @@ private bool Load(ILoadMonitor loader, IProgressStatus status, bool cached)
private static readonly Regex REGEX_RI = new Regex(@"^Retention_index:\s*([^ ]+)", NOCASE); // Retention Index for GC
private static readonly Regex REGEX_RI_LINE = new Regex(@"^(?:Synon:.* )?RI:\s*([^ ]+)", NOCASE); // Retention Index for GC
private static readonly Regex REGEX_SAMPLE = new Regex(@" Nreps=\d+/(\d+)", NOCASE); // Observer spectrum count
private static readonly char[] MAJOR_SEP = {'/'};
private static readonly char[] MODS_MAJOR_SEP = { '/' };
private static readonly char[] MODS_ALTERNATE_MAJOR_SEP = { ')','(' };
private static readonly char[] MINOR_SEP = {','};
// Small molecule items
private static readonly Regex REGEX_NAME_SMALLMOL = new Regex(@"^Name:\s*(.*)", NOCASE); // small molecule names can be anything
Expand All @@ -812,6 +813,9 @@ private bool Load(ILoadMonitor loader, IProgressStatus status, bool cached)
private static readonly Regex REGEX_CCS = new Regex(@"^CCS(?:_Sqa)?:\s*(.*)", NOCASE); // Accept CCS or CCS_SqA

// ReSharper restore LocalizableElement

private long lineCount;

private bool CreateCache(ILoadMonitor loader, IProgressStatus status, int percent, out string warning)
{
var sm = loader.StreamManager;
Expand All @@ -828,7 +832,7 @@ private bool CreateCache(ILoadMonitor loader, IProgressStatus status, int percen
{
var libraryEntries = new List<NistSpectrumInfo>(10000);

long lineCount = 0;
lineCount = 0;
string line;
long nMasslessEntries = 0;
while ((line = reader.ReadLine()) != null)
Expand Down Expand Up @@ -902,7 +906,7 @@ private bool CreateCache(ILoadMonitor loader, IProgressStatus status, int percen
continue; // Line is fully consumed
}

if (!isPeptide && ParseMolecule(line, otherKeys, lineCount, ref formula, ref inChiKey, ref inChi, ref CAS,
if (!isPeptide && ParseMolecule(line, otherKeys, ref formula, ref inChiKey, ref inChi, ref CAS,
ref KEGG, ref SMILES, ref adduct, ref precursorMz, ref molWeight, ref isPositive))
{
continue; // Line is fully consumed
Expand All @@ -926,7 +930,7 @@ private bool CreateCache(ILoadMonitor loader, IProgressStatus status, int percen
continue; // Line is fully consumed
}

if (ParseIonMobility(line, lineCount, ref ionMobility))
if (ParseIonMobility(line, ref ionMobility))
{
continue; // Line is fully consumed
}
Expand Down Expand Up @@ -1089,19 +1093,19 @@ private bool CreateCache(ILoadMonitor loader, IProgressStatus status, int percen
}
else
{
ThrowIoExceptionInvalidPeakFormat(lineCount, i, sequence);
ThrowIoExceptionInvalidPeakFormat(i, sequence);
}
}
string mzField = linePeak.Substring(0, iSeperator1++);
string intensityField = linePeak.Substring(iSeperator1, iSeperator2 - iSeperator1);

if (!TextUtil.TryParseFloatUncertainCulture(mzField, out var mz))
{
ThrowIoExceptionInvalidPeakFormat(lineCount, i, sequence);
ThrowIoExceptionInvalidPeakFormat(i, sequence);
}
if (!TextUtil.TryParseFloatUncertainCulture(intensityField, out var intensity))
{
ThrowIoExceptionInvalidPeakFormat(lineCount, i, sequence);
ThrowIoExceptionInvalidPeakFormat(i, sequence);
}
if (intensity != 0)
{
Expand Down Expand Up @@ -1301,7 +1305,7 @@ private static bool ParseMzVaultPolarity(string line, ref bool? isPositive, ref
return isMzVault; // Line was consumed
}

private bool ParseIonMobility(string line, long lineCount, ref IonMobilityAndCCS im)
private bool ParseIonMobility(string line, ref IonMobilityAndCCS im)
{
if (!im.HasCollisionalCrossSection)
{
Expand All @@ -1326,7 +1330,7 @@ private bool ParseIonMobility(string line, long lineCount, ref IonMobilityAndCCS
/// For peptides (and some molecules), a lot of useful info is jammed into the COMMENT line and must be further picked apart
/// </summary>
/// <returns>true if line was shown to be comment info, and parser can advance to next line</returns>
private static bool ParseComment(string line, bool isPeptide, ref string sequence, ref int? copies, ref float? tfRatio,
private bool ParseComment(string line, bool isPeptide, ref string sequence, ref int? copies, ref float? tfRatio,
ref double? rt, ref double? irt)
{
if (line.StartsWith(COMMENT, StringComparison.InvariantCultureIgnoreCase)) // Case insensitive
Expand Down Expand Up @@ -1383,7 +1387,7 @@ private static bool ParseComment(string line, bool isPeptide, ref string sequenc
/// Parse line for molecule information
/// </summary>
/// <returns>true if line was shown to be molecule info, and parser can advance to next line</returns>
private bool ParseMolecule(string line, Dictionary<string, string> otherKeys, long lineCount, ref string formula, ref string inChiKey,
private bool ParseMolecule(string line, Dictionary<string, string> otherKeys, ref string formula, ref string inChiKey,
ref string inChi, ref string CAS, ref string KEGG, ref string SMILES, ref Adduct adduct, ref double? precursorMz,
ref double? molWeight, ref bool? isPositive)
{
Expand Down Expand Up @@ -1571,7 +1575,7 @@ private static string HandleMzVaultLineVariant(string line, out bool isMzVault)
return line;
}

private void ThrowIoExceptionInvalidPeakFormat(long lineCount, int i, string sequence)
private void ThrowIoExceptionInvalidPeakFormat(int i, string sequence)
{
ThrowIOException(lineCount,
string.Format(LibResources.NistLibraryBase_CreateCache_Invalid_format_at_peak__0__for__1__, i + 1, sequence));
Expand Down Expand Up @@ -1632,24 +1636,27 @@ private void ThrowIOException(long lineNum, string message)
lineNum, message));
}

private static string Modify(string sequence, string mod)
private string Modify(string sequence, string mod)
{
// If no modifications, just return the input sequence
bool clean = (sequence.IndexOfAny(new[] { '(', '[' }) == -1);
if (clean && Equals(mod, @"0"))
return sequence;

// Parse the modification spec, and insert [+/-00.0] modifiers
string[] mods = mod.Split(MAJOR_SEP);
string[] mods = mod.Split(MODS_MAJOR_SEP);
if (mods.Length == 1)
mods = mod.Split(MODS_ALTERNATE_MAJOR_SEP, StringSplitOptions.RemoveEmptyEntries); // e.g. " Mods=2(10,S,Phospho)(14,C,CAM) " instead of " Mods=2/10,S,Phospho/14,C,CAM "

StringBuilder sb = new StringBuilder(sequence.Length);
var seqLen = sequence.Length;
StringBuilder sb = new StringBuilder(seqLen);
bool inMod = false;
int i = 0, iMod = 1, iNextMod = -1;
string massDiffDesc = null;
foreach (char c in sequence)
{
while (iNextMod < i && iMod < mods.Length)
iNextMod = GetMod(mods[iMod++], out massDiffDesc);
iNextMod = GetMod(mods[iMod++], seqLen, out massDiffDesc);

// At least for Oxidation the sequence already contains
// inserted identifiers that look like M(O) for Methionine
Expand All @@ -1672,7 +1679,7 @@ private static string Modify(string sequence, string mod)
return sb.ToString();
}

private static int GetMod(string mod, out string massDiff)
private int GetMod(string mod, int seqLen, out string massDiff)
{
string[] parts = mod.Split(MINOR_SEP);
if (parts.Length < 3)
Expand All @@ -1681,15 +1688,47 @@ private static int GetMod(string mod, out string massDiff)
return -1;
}
int index = int.Parse(parts[0], CultureInfo.InvariantCulture);
// If it is an unknown modification, insert a sequence modifier
// that will cause this sequence never to match anything. These
// are rare, and can be viewed by placing a breakpoint on the
// line where if is true.
// If it is an unknown modification (found neither in our list nor in UniMod),
// insert a sequence modifier that will cause this sequence never to
// match anything.
if (!MODIFICATION_MASSES.TryGetValue(parts[2], out massDiff))
massDiff = @"[?]";
{
if (TryGetUnimodMass(parts[2], index, seqLen, parts[1], out var md))
{
massDiff = SequenceMassCalc.GetModDiffDescription(md);
}
else
{
massDiff = @"[?]";
// Formerly silent, now we at least put up a non-blocking message
Messages.WriteAsyncUserMessage(LibResources.NistLibraryBase_GetMod_Unknown_modification__0__at_line__1_, mod, lineCount);
}
}
return index;
}

/// <summary>
/// Looks up a modification by name through <c>ModificationMatcher.GetStaticMod</c>
/// and reports its monoisotopic mass when one is available.
/// </summary>
/// <param name="mod">Name of the modification to look up</param>
/// <param name="index">Position number of the modification within the peptide</param>
/// <param name="seqLen">Length of the peptide sequence</param>
/// <param name="modifiedAA">One character string naming the amino acid that the modification is on</param>
/// <param name="massDiff">Monoisotopic mass of the modification found, or 0 when the lookup fails</param>
/// <returns>true if a modification with a monoisotopic mass was found, false otherwise</returns>
private static bool TryGetUnimodMass(string mod, int index, int seqLen, string modifiedAA, out double massDiff)
{
    // Per Nick email:
    // Skyline tries not to distinguish between modifications that are on the first amino acid versus modifications that are on the N-terminus of the peptide.
    // So, if that position number is 0 or 1, I think you should pass in ModTerminus.N for the "modTerminus".
    // And, if the position number is greater than or equal to the length of the peptide you should pass in ModTerminus.C.
    //
    // For "modAas" you should pass in a one character length string which is the amino acid that the modification is on.
    //
    // NOTE(review): these thresholds are correct only if positions are 1-based (with 0 meaning the
    // N-terminus itself). If the library uses 0-based positions, the last residue is seqLen - 1 and
    // would never be treated as C-terminal here - confirm against the caller.
    ModTerminus? term = null;
    if (index <= 1)
        term = ModTerminus.N;
    else if (index >= seqLen)
        term = ModTerminus.C;

    double? mass;
    try
    {
        // Null-conditional: a lookup that yields no modification at all is reported as
        // "not found" instead of escaping as a NullReferenceException, which the
        // ArgumentException handler below would not catch.
        mass = ModificationMatcher.GetStaticMod(mod, term, modifiedAA)?.MonoisotopicMass;
    }
    catch (ArgumentException)
    {
        // The lookup rejected the name/terminus/amino acid combination; treat as unknown
        mass = null;
    }

    massDiff = mass ?? 0;
    return mass.HasValue;
}

private static double? GetRetentionTime(string rtString, bool isMinutes)
{
double rt;
Expand Down
Loading